#!/usr/bin/perl -wT
$ENV{'PATH'} = '/usr/local/bin:/bin';
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};   # Make %ENV safer

######################################################################
# written by: Belinda Giardine
######################################################################
use strict;

#this converts from 1 set of coordinates to another using a dump from a psl table
#it expects both coordinates to be nts

my $psl = shift @ARGV;
my $seq = shift @ARGV;
my $input = shift @ARGV;
my $pslLine = shift @ARGV if @ARGV;
if (!$psl or !$input or !$seq) { 
   print "USAGE: convert_prot_coors3 file.psl seqName_qName_in_file 999 ['pslLine']\n";
   print "Where the 9's are replaced by the codon position\n",
         "pslLine is optional for programs to send tab separated line from ",
         "a psl file\n";
   exit;
}
if ($input =~ /![0-9]/) {
   print "ERROR needs a integer\n";
}
my $fh;
my @f; #fields in psl file that correspond to origin seq
if (!$pslLine) {
open($fh, '<', $psl) or die "ERROR couldn't open psl file, $psl, $!\n";
while (<$fh>) {
   #format: bin,matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert
   #tBaseInsert,strand  qName   qSize   qStart  qEnd    tName   tSize   tStart  tEnd    blockCount
   #blockSizes      qStarts tStarts
   #for blat runs qStarts are codons, for kgProtMap they are nts!
   chomp;
   if (!defined or $_ eq '') { next; }
   if (/\s*#/) { next; } #comment
   @f = split(/\s+/);
   if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; }
   elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads
   elsif (scalar @f > 22) { die "ERROR bad psl file format $_\n"; }
   if ($f[9] eq $seq) { last; }
}
close $fh or die "ERROR reading psl file, $psl $!\n";
}else {
   @f = split(/\s+/, $pslLine);
}

if ($f[9] ne $seq) {
   print "ERROR couldn't find $seq in $psl have $f[9]\n";
   exit;   
}

my @gbStarts = split(/,/, $f[19]); #these are counting from end of - strands
my @gbLens = split(/,/, $f[18]);
my @chrStarts = split(/,/, $f[20]);
my $totBlocks = $f[17];
my $gbSt = $f[11]; #start of alignment in query sequence
my $chromPos;
my $chr = $f[13];
my $ntPos = ($input * 3) - 2;

if ($f[8] =~ /^\+/) { #plus strand
   my $b;
   for($b = 0; $b < $totBlocks; $b++) {
      if ($ntPos <= $gbStarts[$b] + $gbLens[$b]) {
         if ($ntPos < ($gbStarts[$b] + 1)) { #1 based comparison
            print "ERROR: Couldn't map $seq $input, it is in a gap\n";
            exit;
         }
         my $diff = $ntPos - ($gbStarts[$b] + 1); #nt based difference
         $chromPos = $chrStarts[$b] + $diff; 
         if ($ntPos > ($gbStarts[$b] + $gbLens[$b] - 2)) {
            my $e = $chrStarts[$b] + $gbLens[$b];
            my $len = $e - $chromPos;
            if ($len > 2) { print "ERROR illegal length $len\n"; exit; }
            my $c2 = $chrStarts[$b+1];
            my $c3 = $c2 + (3 - $len);
            #UCSC numbering for second set
            $chromPos++;
            print "ERROR $input maps to last codon of exon or near gap ",
                  "$chr\t$chromPos\t$e\t$c2\t$c3\n";
            exit;
         }
         last;
      }
   }
   if ($b == $totBlocks && $ntPos > $gbStarts[$b-1] + $gbLens[$b-1]) {
      my $pos = $gbStarts[$b-1] + $gbLens[$b-1];
      print "ERROR $input not in gene, end of gene at $pos\n";
      exit;
   }
}else { #minus strand
   #chrom always +?, query can be -
   my $pos = $gbSt; #check for gap at beginning
   my $b;
   $ntPos += 2; #find lowest nt position (chrom coors) in codon
   for($b = $totBlocks - 1; $b >= 0; $b--) {
      $pos += $gbLens[$b];
      #check for gap
      my $gap = 0;
      if ($b != $totBlocks -1 && $gbStarts[$b+1] != $gbStarts[$b] + $gbLens[$b]) {
         $gap = $gbStarts[$b+1] - ($gbStarts[$b] + $gbLens[$b]);
         $pos += $gap;
      }
      if ($ntPos <= $pos) {
         if ($ntPos <= $pos - $gbLens[$b]) {
            print "ERROR: Couldn't map $seq $input, it is in a gap\n";
            exit;
         }
         my $diff = $pos - $ntPos; #nt difference
         $chromPos = $chrStarts[$b] + $diff;
         if ($ntPos < $pos - ($gbLens[$b] - 1) + 2) {
            my $e = $chrStarts[$b] + $gbLens[$b];
            my $len = $e - $chromPos;
            if ($len > 2) { print "ERROR $input maps illegally len=$len\n"; exit; }
            my $c2 = $chrStarts[$b+1];
            my $c3 = $c2 + (3 - $len);
            $chromPos++;
            print "ERROR $input maps to first codon (split) of exon ",
                  "$chr\t$chromPos\t$e\t$c2\t$c3\n";
            exit;
         }
         #zero based start as chrStart is zero based
         last;
      }
   }
   if ($b < 0 && $ntPos > $pos) {
      print "ERROR $input not in gene, end of gene at $pos\n";
      exit;
   }
}

if (!$chromPos) { print "ERROR couldn't find chr position for $input in $seq\n"; exit; }
$chromPos++; #switch to 1 based number
my $end = $chromPos + 2;
print "$chr\t$chromPos\t$end\n";

exit;
