#!/usr/bin/perl -w
$ENV{'PATH'} = '/usr/bin:/cluster/home/giardine/bin/x86_64/';

use strict;

#this reads the inputs from files, generates the HGVS name for the variant
#and computes coordinates using blat and perl script convert_prot_coors2
#requires uniProt database tables: protein, comment, commentType
#requires protDb table: spXref2
#set variables build, protDb, disFile, spFile, kgProtMap below

#Notes on how name is derived from uniProt features
#for len=1
#HGVS name: spId:p.[orig->3letter][start][variant->3letter]
#for len>1 -> indel
#HGVS name: spId:p.[first aa in orig->3letter][start]_
        #[last aa in orig->3letter][start+len-1]delins[variant->3letter]

#several variable changes to switch usage (kgProtMap,psl,converter)
my $build = 'hg18';
my $protDb = 'proteins060115';
my $kgProtMap = 'kgProtMapUniq.txt'; #must set, blat too slow
my $customTrack = 1; #flag whether want custom track with names
my $psl = "kgProtMapUniq.$build.txt"; #'tmp.psl';
my $psl2; #my $psl2 = 'protBlatUniq.txt';
#the one below needs the feature ID as part of spVars
my $fixFeatures; # = 'featureType2.txt'; #features with fix for run togethers
#had to adust IDs, runs didn't match exactly
my %feature;
#skip for now too many errors, better way of chosing alignment?
my $spSeq = 'tmp.fa';
#build controlled by path of nibs
#not currently used
my $nibList = 'nibsFullPath.list';
#version 2 for protBlat, version 3 for kgProtMap
my $converter = './convert_prot_coors3';  #lengths in nts
my $converter2 = './convert_prot_coors2'; #lengths in codons
my %varIdToSpId;
my %aa1To3;
my $aaFile = 'aminoInfoDump';
#format:3letterAbrev 1letterAbrev LongName
my $spFile = 'spVars061002.txt'; #'arVars.txt'; 'spVars.txt';
#disease associations
my $disFile = 'SwissProtDiseaseAssoc.txt';
my %disAssoc;
my %gene;
my %omim;
my %omimId;
my %varIdToMutName;
my %codon;
my %uniProt; #for uniProt disease or polymorphism associations
my $pslLine;
my $coordinateAccuracy = 1;
my $strand; #filled in when get PSL
my $outGV = "gvPosSP.$build.txt";
my $outGV2 = 'gvLinkSP.txt';
my $outGV3 = 'gvAttrSP.txt';
my $outGV4 = 'gvSP.txt';
my $outGV5 = 'sp.ct';
my $outGV6 = 'gvAttrLongSP.txt';

#load hashes used in conversion
loadCodons();
my $fh;
open($fh, '<', $aaFile) or die "Couldn't open $aaFile, $!\n";
while (<$fh>) {
    chomp;
    my @v = split(/\s+/);
    $aa1To3{$v[1]} = $v[0];
}
close $fh or die "Couldn't close $aaFile, $!\n";

#load disease associations
loadDisease();

loadUniprotDisease();
loadOmimTitles();

if ($fixFeatures) { loadFeatureFix(); }

#generate coordinates and print here
#read the spFile and generate the mutation name
open($fh, '<', $spFile) or die "Couldn't open $spFile, $!\n";
my $outFh;
open($outFh, '>', $outGV) or die "Couldn't open $outGV, $!\n";
my $outFh4;
open($outFh4, '>', $outGV4) or die "Couldn't open $outGV4, $!\n";
my $outFh2;
my $prevSpId = 0;
open($outFh2, '>', $outGV2) or die "Couldn't open $outGV2, $!\n";
my $outFh3;
open($outFh3, '>', $outGV3) or die "Couldn't open $outGV3, $!\n";
my $outFh5;
open($outFh5, '>', $outGV5) or die "Couldn't open $outGV5, $!\n";
my $outFh6;
open($outFh6, '>', $outGV6) or die "Couldn't open $outGV6, $!\n";
my $cnt = 0;
while (<$fh>) {
    chomp;
    $cnt++;
    my @v = split(/\t/);
    #format:spId startCodon len variant varId [featureId]
    if ($fixFeatures && scalar @v != 6) { die "Incompatible run, need IDs to run with feature fix\n"; }
    if ($fixFeatures && exists $feature{$v[5]}) {
       $v[3] = $feature{$v[5]};
    }
    my $status;
    my $phenotype;
    if (exists $disAssoc{$v[4]}) {
       if ($disAssoc{$v[4]} ne 'Polymorphism') {
          $status = 'known to be disease-associated';
          $phenotype = $disAssoc{$v[4]};
       }else {
          $status = 'not disease-associated';
       }
    }
    if (!$status) {
       $status = 'likely to be disease-associated';
       if ($v[3] =~ /polymorphism/ && $v[3] =~ /cancer|disease/i) { undef $status; }
       elsif ($v[3] =~ /polymorphism/) { undef $status; }
       elsif ($v[3] =~ /unknown\s*pathological\s*significance/) { undef $status; }
       elsif ($v[3] =~ /pathological\s*significance\s*unknown/) { undef $status; }
       elsif ($v[3] =~ /effect not known/) { undef $status; }
    }
    my $n;
    my $oldAA;
    if ($v[2] == 1) {
        if (!$v[4]) { die "undefined varId\n"; }
        $n = "$v[0]:p.";
        if ($v[3] =~ /(\w) -> (\w)/) {
            my $aaf = $1;
            $oldAA = $aaf;
            my $aat = $2;
            if ($aaf eq $aat) { undef $status; }
            $n .= $aa1To3{$aaf};
            $n .= ($v[1] + 1); #+1 for UCSC numbers
            $n .= $aa1To3{$aat};
        }elsif ($v[3] =~ /Missing/) {
            $n .= ($v[1] + 1) . "del";
        }else {
            print "Couldn't parse '$v[3]'\n";
            next;
        }
    }elsif ($v[2] > 1) {
        $n = "$v[0]:p.";
        if ($v[3] =~ /(\w+) -> (\w+)/) {
            my $aaf = $1;
            $oldAA = $aaf;
            my $aat = $2;
            $aaf =~ /^(\w)/;
            my $firstAA = $1;
            $n .= $aa1To3{$firstAA};
            $n .= ($v[1] + 1);
            $n .= '_';
            $aaf =~ /(\w)$/;
            my $lastAA = $1;
            $n .= $aa1To3{$lastAA};
            $n .= ($v[1] + $v[2]);
            $n .= 'delins';
            my @a = split(/ */, $aat);
            foreach (@a) {
                if (!/[A-Z]/) { last; } #bad aa
                if (!$aa1To3{$_}) { 
                   #die "Died unknown aa is $_\n"; }
                   $n =~ s/delins.*/delins/; #Bad amino acid don't list
                   print "Bad amino acid for $v[0] $v[3] using name=$n\n";
                   last; 
                }
                $n .= $aa1To3{$_};    
            } 
        }elsif ($v[3] =~ /Missing/) {
            $n .= ($v[1] + 1) . "_" . ($v[1] + $v[2]) . "del";
        }else {
            print "Couldn't parse '$v[3]'\n";
            next;
        }
    }else { die "ERROR invalid len $v[2]\n"; }
    #make sure name isn't too long
    #long names are placed in separate attribute table with larger field
    if (length $n > 255) {
       my $full = $n;
       $n =~ s/delins.*/delins/;
       print $outFh6 "$v[4]\tlongName\t$full\n";
    }
    $varIdToMutName{$v[4]} = $n;
    if (!$kgProtMap && $prevSpId ne $v[0]) {
       print "creating psl file for $v[0]\n";
       my $succ = generatePsl($v[0]);
       $prevSpId = $v[0]; #reset so matches psl file
       if (!$succ) { next; } #can't do this one
    }elsif ($kgProtMap && $prevSpId ne $v[0]) {
       $prevSpId = $v[0]; #reset so matches psl file
       #print "TESTING starting new protein $v[0]\n";
       $pslLine = getPslLine($v[0]);
       if (!$pslLine) { 
          print "no alignment for $n\n";
          next;
       }
    }elsif ($kgProtMap && !$pslLine) {
       #print "no alignment still for $n\n";
       next;
    }

    my @chr = getCoordinates($v[1] + 1, $v[0], $pslLine);
    my @chr2;
    if (@chr && $coordinateAccuracy && $chr[0] =~ /maps to first|maps to last/) {
       $chr[0] =~ /(chr[XYM0-9]+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/;
       $chr[0] = $1;
       $chr[1] = $2;
       $chr[2] = $3;
       $chr2[1] = $4;
       $chr2[2] = $5;
    }
    if (!@chr or !$chr[0] or $chr[0] =~ /ERROR/) { 
        print "no coordinates for $n\n"; 
        if (@chr && $chr[0]) { print $chr[0], "\n"; }
        next; 
    }
    my $type = 'substitution';
    if ($n =~ /(\d+)delins/) {
       my $e = $1;
       my @t = getCoordinates($e, $v[0], $pslLine);
       if (@chr2 or !@t or $t[0] =~ /ERROR/) {
           print "no end coordinates for $n\n";
           if (@t) { print $t[0], "\n"; }
           next;
       }
       #new start or end depending on strand
       if ($t[1] < $chr[1]) {
           $chr[1] = $t[1];
       }else {
           $chr[2] = $t[2];
       }
       $type = 'complex';
    }elsif ($n =~ /del/) {
       if ($n =~ /_(\d+)del/) {
          my $e = $1;
          my @t = getCoordinates($e, $v[0], $pslLine);
          if (@chr2 or !@t or $t[0] =~ /ERROR/) {
              print "no end coordinates for $n\n";
              if (@t) { print $t[0], "\n"; }
              next;
          }
          if ($t[1] < $chr[1]) {
             $chr[1] = $t[1];
          }else {
             $chr[2] = $t[2];
          }
       }
       $type = 'deletion';
    }
    $chr[1]--; #UCSC coordinates
    if (!$chr[0]) { die "ERROR Bad coordinates for $v[4] $n $v[3]\n"; }
    if (!@chr or !$chr[0] or $chr[0] =~ /ERROR/) {
        print "no coordinates for $n\n";
        if (@chr && $chr[0]) { print $chr[0], "\n"; }
        next;
    }
    if ($oldAA && !@chr2 && !checkSequence($oldAA, $strand, $chr[0], $chr[1], $chr[2])) {
        #print "sequence mismatch for $n\n";
        next;
    }elsif ($oldAA && !checkSequence($oldAA, $strand, $chr[0], $chr[1], $chr[2], $chr2[1], $chr2[2])) {
        #print "sequence mismatch for $n\n";
        next;
    }

    my $short = $n;
    if (length $n > 64) {
       $short =~ s/delins.*/delins/;
       if (length $short > 64) { die "Couldn't shorten name $n.\n"; }
    }
    print $outFh "$chr[0]\t$chr[1]\t$chr[2]\t$v[4]\t$strand\t$short\n";
    if (@chr2) {
       print $outFh "$chr[0]\t$chr2[1]\t$chr2[2]\t$v[4]\t$strand\t$short\n";
       if ($customTrack) { print $outFh5 "$chr[0]\t$chr2[1]\t$chr2[2]\t$short\n"; }
    }
    print $outFh4 "$v[4]\t$n\tSP\t$type\texon\t$coordinateAccuracy\n";
    if ($status) {
       print $outFh3 "$v[4]\tdisease\t$status\n";
    }
    if ($customTrack) { print $outFh5 "$chr[0]\t$chr[1]\t$chr[2]\t$short\n"; }
    #gvAttr:mutId attrKey attrValue [outFh3]
    #gvLink: id attrType raKey acc displayValue [outFh2]
    #print SwissProt links
    if ($type eq 'complex') { #SwissProt2 search 4
       print $outFh2 "$v[4]\tlinks\tUniProtSearch\t$v[4]\t\n";
    }else { #SP variant 3
       print $outFh2 "$v[4]\tlinks\tUniProtVar\t$v[4]\t\n";
    }
    if ($v[3] =~ /in dbSNP:(\d+)/) {
       my $a = $1;
       print $outFh2 "$v[4]\tlinks\tdbSNP\t$a\t\n";
    }elsif ($v[3] !~ /in allele/ && $v[3] =~ /\((.*)\)/) {
       print $outFh3 "$v[4]\tcomment\t$1\n";
    }
    if ($phenotype) {
       print $outFh3 "$v[4]\tphenoCommon\t$phenotype\n"; 
    }
    if (exists $uniProt{$v[0]}) { #uniProt disease/polymorphism links
       foreach my $k (@{$uniProt{$v[0]}}) {
          print $outFh2 "$v[4]\tgenesDisVars\t$k\t$v[0]\t\n";
       }
    }
    if (exists $omimId{$v[0]}) { #omim title links
       foreach my $o (@{$omimId{$v[0]}}) {
          print $outFh2 "$v[4]\tgenesDisVars\tomimTitle2\t$o\t\n";
       }
    }
}
close $fh or die "Couldn't close $spFile, $!\n";
close $outFh or die "couldn't close outFh, $!\n";
close $outFh3 or die "couldn't close outFh3, $!\n";
close $outFh2 or die "couldn't close outFh2, $!\n";
close $outFh4 or die "couldn't close outFh4, $!\n";
close $outFh5 or die "couldn't close outFh5, $!\n";
close $outFh6 or die "couldn't close outFh6, $!\n";

#print "TESTING read $cnt variants\n";
exit 0;
############################################################################

#get coordinates based on mRNA and protein position
sub getCoordinates {
    my $pos = shift;
    my $acc = shift;
    my $pslLine = shift;
    my @chr;
    my $fh;
    my $command = "$converter $psl $acc $pos '$pslLine' 2>&1 |";
    if (!$coordinateAccuracy) {
       $command = "$converter2 $psl $acc $pos '$pslLine' 2>&1 |";
    }
    open ($fh, $command)
      or die "ERROR Couldn't run convert_prot_coors[2|3], $!\n";
    while (<$fh>) {
        chomp;
        if (/ERROR/) { $chr[0] = $_; last; }
        @chr = split(/\t/);
    }
    close $fh;
    return @chr;
}
####End of subroutine getCoordinates

#this runs blat on the SP transcript to get the psl
sub generatePsl {
   my $spId = shift @_;
   if (system("hgsql -N uniProt > $spSeq <<end\nselect '>$spId'; select val from protein where acc = '$spId';\nend") == -1) {
      die "Couldn't fetch sequence for $spId\n";
      #or return 0;
   }
   my $pfh;
   #this takes about 10 mins to run
   open($pfh, "blat $nibList $spSeq -noHead -q=prot -t=dnax stdout 2>&1 |")
      or die "ERROR couldn't run blat on protein sequence for $spId, $!\n";
   #catch output so can just print first/best match
   my $line;
   while(<$pfh>) {
      if (!$line && /^\d+/) { $line = $_; }
   }
   close $pfh or die "Couldn't finish blat run for $spId, $!\n";
   open($pfh, ">", $psl) or die "Couldn't open psl file for writing\n, $!\n";
   if ($line) { print $pfh $line; }
   close $pfh or die "Couldn't close psl file, $!\n";
   if ($line) { return 1; }
   else { return 0; }
}
####End of subroutine generatePsl

#this gets the fields from the psl file
sub getPslLine {
   my $seq = shift @_;
   my $pfh;
   open($pfh, '<', $psl) or die "ERROR couldn't open psl file, $psl, $!\n";
   my @f;
   my $rv;
   while (<$pfh>) {
      chomp;
      if (!defined or $_ eq '') { next; }
      if (/\s*#/) { next; } #comment
      @f = split(/\s+/);
      if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; }
      elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads
      elsif (scalar @f > 22) { die "ERROR bad psl file format $_\n"; }
      if ($f[9] eq $seq) { $rv = join("\t", @f); last; }
   }
   $strand = $f[8];
   close $pfh or die "ERROR reading psl file, $psl $!\n";
   if ($rv) { 
      $coordinateAccuracy = 1;
   }elsif ($psl2) {
      $coordinateAccuracy = 0;
      my $matches = 0;
      open($pfh, '<', $psl2) or die "ERROR couldn't open psl file, $psl2, $!\n";
      while (<$pfh>) {
         chomp;
         if (!defined or $_ eq '') { next; }
         if (/\s*#/) { next; } #comment
         @f = split(/\s+/);
         if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; }
         elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads
         elsif (scalar @f > 22) { die "ERROR bad psl file format $_\n"; }
         #keep alignment with highest matches
         if ($f[9] eq $seq && $f[0] > $matches) { 
            $rv = join("\t", @f);  
            $matches = $f[0];
         }
      }
      close $pfh or die "ERROR reading psl file, $psl2 $!\n";
   }
   return $rv;
}
####End

sub loadCodons {
   open (COD, "codon_amino_info") or die "Couldn't open codon_amino_info, $!\n";
   while (<COD>) {
      chomp;
      my @t = split(/\t/);
      $t[1] =~ s/\s+//g;
      $t[0] =~ s/\s+//g;
      $codon{$t[0]} = $t[1];
   }
   close (COD);
}
####End 

sub loadDisease {
   open (DIS, $disFile) or die "Couldn't open $disFile, $!\n";
   my $c = 0;
   while (<DIS>) {
      chomp;
      my @t = split(/\s+/, $_, 10);
      #gene gene? SPGeneId VarID position oldAA -> newAA diseaseAssoc disease
      if ($t[8] eq 'Polymorphism') {
         $disAssoc{$t[3]} = $t[8];
      }elsif ($t[8] eq 'Disease') {
         $disAssoc{$t[3]} = $t[9];
         if ($disAssoc{$t[3]} =~ /\[MIM:(\d+)\]/) {
            $omim{$t[3]} = $1;
         }
      }
      #$gene{$t[3]} = "$t[0]($t[2])";
      $c++;
   }
   close (DIS) or die "Couldn't finish loading $disFile, $!\n";
   print STDERR "Found $c variants with disease-association data\n";
}
####End

sub checkSequence {
   my $aa = shift;
   my $strand = shift;
   my $chr = shift;
   my $st = shift;
   my $end = shift;
   my $st2 = shift;
   my $end2 = shift;
   my @a = split(/ */, $aa);
   my $checked = 0;
   if ($strand eq '-') { $strand = 'm'; } #switch to nibFrag format
   open (NIB, "nibFrag -upper /cluster/data/$build/nib/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
   my $seq = '';
   while (<NIB>) {
      chomp;
      if (/^>/) { next; } #header
      $seq .= $_;
   }
   close NIB or die "Couldn't finish nibFrag, $!\n";
   if ($st2) {
      open (NIB, "nibFrag -upper /cluster/data/$build/nib/${chr}.nib $st2 $end2 $strand stdout |") or die "Couldn't run nibFrag, $!\n";
      while (<NIB>) {
         chomp;
         if (/^>/) { next; } #header
         $seq .= $_;
      }  
      close NIB or die "Couldn't finish nibFrag, $!\n";
   }
   if ($seq eq '') { return 0; }
   my @s = split(/ */, $seq);
   foreach $aa (@a) {
      my $codon = join("", splice(@s, 0, 3));
      if (!$codon or !exists $codon{$codon} or $codon{$codon} ne $aa1To3{$aa}) {
         print "ERROR mismatch $codon != $aa\n";
         return 0;
      }
      $checked = 1;
   }
   return $checked;
}
####End

sub loadFeatureFix {
   open(FIX, $fixFeatures) or die "Couldn't open $fixFeatures, $!\n";
   while (<FIX>) {
      chomp;
      my @t = split(/\t/);
      $feature{$t[0]} = $t[1];
   }
   close (FIX) or die "Couldn't finish $fixFeatures, $!\n";
}
####End 

sub loadOmimTitles {
   my $sql = 'select accession, extAC from spXref2 where division = "9606" and extDB = "MIM"';
   open(FH, "-|", "hgsql -NB $protDb -e '$sql'") 
	or die "Couldn't fetch omim IDs from db, $!\n";
   while (<FH>) {
       chomp;
       my @t = split(/\s+/);
       push(@{$omimId{$t[0]}}, $t[1]);
   }
   close(FH) or die "Couldn't finish omim IDs, $!\n";
}
####End

sub loadUniprotDisease {
   #get polymorphism and disease IDs
   my $sql = "select id, val from commentType where val in ('DISEASE', 'POLYMORPHISM')";
   my %type;
   my $dis;
   my $poly;
   open(FH, "-|", "hgsql -NB uniProt -e \"$sql\"")
        or die "Couldn't fetch uniProt type IDs from db, $!\n";
   while (<FH>) {
       chomp;
       my @t = split(/\s+/);
       if ($t[1] eq 'DISEASE') {
          $dis = $t[0];
          $type{$t[0]} = 'uniProtDis';
       }else {
          $poly = $t[0];
          $type{$t[0]} = 'uniProtPoly';
       }
   }
   close(FH) or die "Couldn't finish uniProt type IDs, $!\n";
   #now get which IDs go with with which spIDs
   $sql = "select acc, commentType from comment where commentType in ($dis, $poly)";
   open(FH, "-|", "hgsql -NB uniProt -e \"$sql\"")
        or die "Couldn't fetch uniProt comment types from db, $!\n";
   while (<FH>) {
       chomp;
       my @t = split(/\s+/);   
       push(@{$uniProt{$t[0]}}, $type{$t[1]});
   }
   close(FH) or die "Couldn't finish uniProt comment types, $!\n";
}
####End 
