#!/usr/bin/perl -w

#use POSIX;
#use POSIX qw(setsid);
#use POSIX qw(:errno_h :fcntl_h);

use strict;
use LWP::Simple;

#should have 605 mutations?
my $build = 'hg18';
my $genomeVarFile = 'gvARdb.txt';
my $posFile = "gvPosARdb.$build.txt";
my $linkFile = 'gvLinkARdb.txt';
my $attFile = 'gvAttrARdb.txt';
my $badFile = 'unparsedARdb.txt';
my $converter = '../convert_coors2';

#my $geneFile = 'M20132.psl';
my $geneFile = "M20132.$build.psl";
my $faFile = 'M20132.fa';
my $strand;
my %aa = (
'Ala' => 'A',
'Val' => 'V',
'Leu' => 'L',
'Ile' => 'I',
'Pro' => 'P',
'Phe' => 'F',
'Trp' => 'W',
'Met' => 'M',
'Gly' => 'G',
'Ser' => 'S',
'Thr' => 'T',
'Cys' => 'C',
'Tyr' => 'Y',
'Asn' => 'N',
'Gln' => 'Q',
'Asp' => 'D',
'Glu' => 'E',
'Lys' => 'K',
'Arg' => 'R',
'His' => 'H',
'Stop' => 'X'
);
my $fh;
my $fh2;
my $fh3;
my $fh4;
my $bfh;
open($fh, ">", $genomeVarFile) or die "Couldn't open output file, $!\n";
open($fh2, ">", $posFile) or die "Couldn't open output file2, $!\n";
#no links for now
open($fh3, ">", $linkFile) or die "Couldn't open output file3, $!\n";
open($fh4, ">", $attFile) or die "Couldn't open output file4, $!\n";
open($bfh, ">", $badFile) or die "Couldn't open output fileb, $!\n";
my $fcnt = 0;
my $bcnt = 0;
my $verCnt = 0;
my %pheno;
my %phenoId; #phenotype and ID to make unique?
my %disease;  #only set disease-associated once

print "Writing files $genomeVarFile, $posFile, $attFile and $badFile\n";
#ARdb data
#ACC    Phenotype       Mutation type   exon    pathogenicity   CpG     PositionFrom amino acid Toamino acid    nucleotide position     From nucleotide To nucleotide   Bmax    kD      k       Thermolabile    comments        Domain  External genitalia      Family history  Reference       Exon 1 tracts   Poly Gln #     Poly Gly #       Poly Gly
my $srcId = 'ARdb';

my %pkey; #IDs are primary key in gv table
my %name;
my %id; #use first id as ID for mutation, data per patient
while (<>) {
   chomp;
   my $line = $_;
   if (!defined $line or $line eq '' or $line =~ /^\s*#/) { next; } 
   parseDataLine($line);
}

#print out disease 
foreach my $k (keys %disease) {
   my $t = $disease{$k};
   if ($t eq 'both') { 
      $t = 'likely to be phenotype-associated';
   }
   print $fh4 "$k\tdisease\t$t\n";
}

close $fh or die "Couldn't close outfile, $!\n";
close $fh2 or die "Couldn't close outfile2, $!\n";
close $fh3 or die "Couldn't close outfile3, $!\n";
close $fh4 or die "Couldn't close outfile4, $!\n";
close $bfh or die "Couldn't close outfileb, $!\n";

my $unicnt = scalar (keys %name);
print "found numbers for $fcnt entries\n",
      "found $unicnt unique mutation names\n",
      "found $bcnt entries that couldn't parse\n",
      "verified sequence on $verCnt names\n";
print "Found unique phenotypes:\n",
      join(", ", keys %pheno), "\n";
exit 0;

sub parseDataLine {
   my $line = shift @_;
   my @f = split(/\t/, $line);
   foreach (@f) { s/^\s+//; s/\s+$//; }
   #0=id, 1=phenotype, 2=mutations type, 3=location, 9=base, 10=oldnts 11=new
   #4=disease association?
   #protein: 6=position, 7=from amino acid, 8=to amino acid
   my @chr;
   my $disease;
   if ($f[9] && ($f[10] or $f[11])) { 
      $f[2] = lc $f[2];
      if ($f[2] =~ /subst/) { $f[2] = 'substitution'; } #abrev 
      elsif ($f[2] =~ /dupli/) { $f[2] = 'duplication'; }
      elsif ($f[2] eq 'splice') { $f[2] = 'unknown'; }
      #leave insertion and deletion
      if ($f[3] =~ /intron/) { $f[3] = 'intron'; }
      #has domain attached?
      elsif ($f[3] =~ /^\d+\s+\w+$/) { $f[3] = 'exon'; } #by number
      elsif ($f[3] =~ /^\d+\s*$/) { $f[3] = 'exon'; } #by number without domain
      else { $f[3] = 'unknown' . $f[3] . '?'; }
      #build hgvs protein name? cols 6-8
      my $pname;
      if ($f[4]) { 
         $f[4] =~ s/^\s+//;
         $f[4] =~ s/\s+$//;
         if ($f[4] eq '*') { 
            $disease{"ARdb_$f[0]"} = 'phenotype-associated';
         }#else treat as unknown
      }
      if (!$f[6] or !$f[7] or !defined $f[8]) {
         print $bfh $line, "\n";
         $bcnt++;
         return;
      }
      if (!$f[2]) { die "Couldn't find type in $line\n"; }
      if ($f[2] eq 'substitution') { 
         if (!exists $aa{$f[7]} or !exists $aa{$f[8]}) { 
            die "No amino acid symbol for $f[7] or $f[8]\n";
         }
         #$pname = 'AR p.' . $aa{$f[7]} . $f[6] . $aa{$f[8]}; 
         $pname = make_sub_name(@f);
      }elsif ($f[2] eq 'deletion') { $pname = make_del_name(@f); }
      elsif ($f[2] eq 'insertion') { $pname = make_ins_name(@f); }
      elsif ($f[2] eq 'duplication') { $pname = make_dup_name(@f); }
      else { #bad mutation type
         print $bfh $line, "\n";
         $bcnt++;
         return;
      }
      if (!$pname) {
         print $bfh $line, "\n";
         $bcnt++;
         return;
      }
      $name{$pname}++;
      if ($f[9] =~ /\D/) {
          if ($f[2] eq 'deletion' && $pname) { #get coordinates from name
             @chr = convert_hgvs_name($pname);
          }
          if (!@chr or $chr[0] =~ /ERROR/) {
             print $bfh $line, "\n";
             $bcnt++;
             return;
          }
      }else {
         @chr = convert_number($f[9]);
         $chr[2] = $chr[1];
         if ($f[2] eq 'insertion') { $chr[1]--; } #inserted before nt listed
      }
      if ($chr[0] =~ /ERROR/) {
         print $bfh $line, "\n";
         $bcnt++;
         return;
      }
      #adjust to ucsc numbers?
      $chr[1]--;
      if ($name{$pname} == 1) {
         #check sequence based on hgvs style name
         my $ch = sequenceCheck($pname);
         if (!$ch) {
            print $bfh $line, "\n";
            $bcnt++;
            return;
         }elsif ($ch == 2) { $verCnt++; }
         if (exists $pkey{"ARdb_$f[0]"}) {
            print "WARNING duplicate ID ARdb_$f[0] for $pname\n";
         }else {
            $pkey{"ARdb_$f[0]"} = 1;
         }
         print $fh2 "$chr[0]\t$chr[1]\t$chr[2]\tARdb_$f[0]\t$strand\t$pname\n";
         print $fh "ARdb_$f[0]\t$pname\t$srcId\t$f[2]\t$f[3]\t1\n";
         #test and add only valid links
         addLink($f[0]);
         $id{$pname} = "ARdb_$f[0]";
         #attr table id, classId, nameId, value (255chars)
         #phenotype=class 6, 
         #name = ? male pseudohermaphroditism ? reproductive system phenotype
         #cancers? tumorigenesis ? prostate tumor ? mammary adenocarcinoma ?
         if ($f[1]) {
            add_phenotype($id{$pname}, $f[1]);
         }
         #add protein change if available
         if ($f[6] && $f[7] && $f[8]) {
            $f[6] =~ s/^0+//; #remove leading zeros
            print $fh4 "ARdb_$f[0]\tprotEffect\tAR p.$f[7]$f[6]$f[8]\n";
            if (!exists $disease{"ARdb_$f[0]"} && $f[7] ne $f[8]) {
               $disease{"ARdb_$f[0]"} = 'likely to be phenotype-associated';
            }
         }
         if (!exists $disease{"ARdb_$f[0]"} && $f[2] =~ /deletion|insertion/
             && $f[3] eq 'exon') {
            $disease{"ARdb_$f[0]"} = 'likely to be phenotype-associated';
         }
         print $fh3 "ARdb_$f[0]\tgeneVarsDis\tomimTitle2\t313700\n";
         $fcnt++;
      }else {
         #print $fh3 "$id{$pname}\t$f[0]\t$srcId\n";
         if ($f[1]) {
            add_phenotype($id{$pname}, $f[1]);
         }
      }
   }elsif ($f[6] && defined $f[7] && defined $f[8] && $f[2] eq 'Substitut.') {
      #protein substitution
      $f[2] = 'substitution';
      $f[6] =~ s/^0+//; #remove leading zeros
      my $pname = make_sub_name(@f);
      my @chr = ('chrX'); #AR gene is on chrX
      my @c;
      #my @c = convert_aa_number($f[6]);
      push(@chr, @c);
      if (!$chr[1]) { #unable to convert
         print $bfh $line, "\n";
         $bcnt++;
      }else {
         $name{$pname}++;
         #adjust to ucsc numbers
         $chr[1]--;
         if ($name{$pname} == 1) {
            if (exists $pkey{"ARdb_$f[0]"}) {
               print "WARNING duplicate ID ARdb_$f[0] for $pname\n";
            }else {
               $pkey{"ARdb_$f[0]"} = 1;
            }
            print $fh2 "$chr[0]\t$chr[1]\t$chr[2]\tARdb_$f[0]\t$strand\t$pname\n";
            print $fh "ARdb_$f[0]\t$pname\t$srcId\t$f[2]\t$f[3]\t1\n";
            $id{$pname} = "ARdb_$f[0]";
            print $fh3 "ARdb_$f[0]\tgeneVarsDis\tomimTitle2\t313700\n";
            $fcnt++;
         }
         if ($f[1]) {
            add_phenotype($id{$pname}, $f[1]);
         }
      }
   }else {
      print $bfh $line, "\n";
      $bcnt++;
   }
}
####End 

sub convert_number {
   my $pos = shift; #genomic position
   my @c;
   my $myfh;
   open($myfh, "$converter $geneFile M20132 $pos 2>&1 |")
      or die "ERROR Couldn't run convert_coors, $!\n";
   while (<$myfh>) {
      if (/ERROR/) { $c[0] = $_; last; }
      /is mapped to (\w+\d*) (\d+) (\+|\-)/;
      $c[1] = $2;
      $c[0] = $1;
      $strand = $3;
   }
   if (!$c[0]) { close $myfh; return("ERROR couldn't convert '$pos'", undef); }
   close $myfh or die "ERROR Couldn't finish convert_coors run on $pos got $c[0], $!\n";
   return @c;
}
####End 

sub convert_aa_number {
   my $pos = shift; #amino acid number Met=1
   my $gst;
   my $gend;
   my $aa = 0;
   my $aafh;
   my @coors;
   open ($aafh, $geneFile) or die "Couldn't open gene file, $!\n";
   while (<$aafh>) {
      #BED: chrom start stop name score strand ..
      chomp;
      if (!defined) { next; }
      if (/\s*#/) { next; } #comment
      my @f = split(/\s+/);
      if (scalar @f < 6) { die "ERROR bad gene file format $_\n"; }
      $f[1]++; #switch to 1 based start
      #AR gene is + strand
      push(@coors, ($f[1]..$f[2]));
   }
   close $aafh or die "Problem closing geneFile, $!\n";
   #A of ATG is at 363 of GenBank file
   splice(@coors, 0, 362);
   for(my $i=0; $i <= scalar @coors; $i+=3) {
      $aa++; #amino acid position
      if ($pos == $aa) { 
         $gst = $coors[$i];
         $gend = $coors[$i+2];
         last;
      }
   }

   return ($gst, $gend); #both 1 based numbers
}
####End

#this generates an HGVS style name for substitutions
sub make_sub_name {
   my @c = @_;
   #cols 9-11 dna, 6-8 protein
   my $name;
   if ($c[9] && $c[10] && $c[11]) {
      my $oldnt;
      my $newnt;
      my @nt = split(/ */, $c[10]);
      my @nt2 = split(/ */, $c[11]);
      my $i = 0;
      foreach my $n (@nt) {
         if ($n ne $nt2[$i]) {
            $oldnt = $n;
            $newnt = $nt2[$i];
            last;
         }
         $i++;
      }
      if ($oldnt) {
         $name = 'M20132.1 g.' . $c[9] . $oldnt . '>' . $newnt;
      }
   }
   if (!$name) {
      $name = 'AR p.' . $aa{$c[7]} . $c[6] . $aa{$c[8]};
   }
   return $name;  
}
####End of subroutine make_sub_name

#make a name for deletions
sub make_del_name {
   my @c = @_;
   my $name;
   if ($c[9] && $c[9] =~ /^\d+$/) {
      if ($c[16] =~ /Single nt\. del|1 nt\. del|1 nt del|1nt. deletion|single nt deletion/) {
         $name = 'M20132.1 g.' . $c[9] . 'del';
      } #else { print STDERR "What deleted? $c[9], $c[16]\n"; }
   }elsif ($c[9] =~ /^(\d+)\-(\d+)$/ && defined $c[8] && $c[8] eq '0') {
         my $s = $1;
         my $d = $2;
         my $e = $s + 2;
         if ($e =~ /$d$/) { 
            $name = 'M20132.1 g.' . $s . '_' . $e . 'del';
         } #else {print STDERR "Wrong number? $c[9] $s $e\n"; }
   }elsif (defined $c[8] && $c[8] eq '0') {
      $name = 'AR p.' . $c[6] . 'del' . $aa{$c[7]}; 
   }else { print STDERR "Couldn't do deletion with $c[6] $c[9] $c[16]\n"; }

   return $name;
}
####End 

#get coordinates from hgvs_name that is built
sub convert_hgvs_name { 
   my $name = shift;
   my @c;
   if ($name =~ /M20132.1 g.(\d+)_(\d+)/) {
      my $s = $1;
      my $e = $2;
      my $chr;
      ($chr, $s) = convert_number($s);
      ($chr, $e) = convert_number($e);
      @c = ($chr, $s, $e);
   }else {
      return("ERROR no coordinates", undef, undef);
   }
   return @c;
}
####End 

#make a name for the insertions
sub make_ins_name {
   my @c = @_;
   my $name;
   if ($c[9]) {
      $name = 'M20132.1 g.' . ($c[9] - 1) . '_' . $c[9] . 'ins';
      #get nt(s) inserted?
      if ($c[11] !~ /\+/ && length $c[10] != length $c[11]) {
         my @nt = split(/ */, $c[10]);
         foreach my $n (@nt) {
            $c[11] =~ s/$n//;
         }
         if ($c[11] && $c[11] ne '') { $name .= $c[11]; }
      }elsif ($c[11] =~ /\+([ACTG]+)/) {
         $name .= $1;
      }
   }else {
      $name = 'AR p.' . $c[6] . 'ins' . $aa{$c[7]}; 
   }
   return $name;
}
####End

#make a name for a duplication
sub make_dup_name {
   my @c = @_;
   my $name;
   if ($c[7] && !$c[8]) {
      #must have duplicated this protein
      $name = 'AR p.' . $c[6] . 'dup' . $aa{$c[7]};
   }
   #for now all others are hand edits using comment
   return $name;
}
####End

#add phenotype
sub add_phenotype {
   my $id = shift;
   my $ph = shift;
   #clean up phenotypes
   if (!defined $id) { return; }
   $ph =~ s/>/ /g;
   $ph =~ s/\s+/ /g;
   $ph =~ s/\s+$//;
   $ph =~ s/^\s+//;
   $ph =~ s/Cancer/cancer/;
   my @p;
   if ($ph =~ / and /) {
      @p = split(/ and /, $ph);
      for (my $i = 1; $i <= $#p; $i++) {
         $p[$i] = ucfirst($p[$i]);
      }
   }else {
      $p[0] = $ph;
   }
   #attr table id, classId, nameId, value (255chars)
   foreach my $t (@p) {
      if (!$t) { next; }
      if ($t =~ /\?/) { next; }
      if (!exists $phenoId{$id . $t}) {
         $phenoId{$id . $t}++;
         #spell out AIS? Androgen Insensitivity Syndrome? constitutional androgen insensitivity?
         print $fh4 "$id\tphenoCommon\t$t\n";
         if ($t eq 'Normal') { 
            print $fh4 "$id\tphenoOfficial\tno phenotype detected [MP:0002169]\n";
         }
         #phenotype of 1 patient doesn't prove association
         #if ($t eq 'Normal' && (!exists $disease{$id} or
             #$disease{$id} =~ /likely/)) {
            #$disease{$id} = 'not phenotype-associated';
         #}elsif ($t eq 'Normal' && $disease{$id} eq 'phenotype-associated') {
            #$disease{$id} = 'both';
         #}elsif ($t ne 'Normal' && (!exists $disease{$id} or
             #$disease{$id} =~ /likely/)) {
            #$disease{$id} = 'phenotype-associated';
         #}elsif ($t ne 'Normal' && $disease{$id} =~ /not phenotype-associated/) {
            #$disease{$id} = 'both';
         #}
      }
      $pheno{$t}++;
   }
}
####End

sub sequenceCheck {
   my $name = shift @_;
   my $verified = 0;
   my $bad;

   if ($name =~ /'/) { return undef; } #illegal char can't check
   open(CFH, "../sequenceCheck $faFile '$name' 2>&1 |")
      or die "Couldn't run sequenceCheck, $!\n";
   while (<CFH>) {
      chomp;
      if (/Sequence doesn't match/) { $bad = $_; }
      elsif (/Sequence matched$/) { $verified++; }
   }

##	close(CFH);
##
##	my $status = $?;
##
##	if (WIFEXITED($status)) {
##		print "\$? = $status; exited, status=", WEXITSTATUS($status), "\n";
##	} elsif (WIFSIGNALED($status)) {
##		print "\$? = $status; signalled, signal=", WTERMSIG($status), "\n";
##	} elsif (WIFSTOPPED($status)) {
##		print "\$? = $status; stopped, signal=", WSTOPSIG($status), "\n";
##	} else {
##		die "Too bad.  I don't know what's going on\n";
##	}
	



   close(CFH) 
      or die "Couldn't finish sequenceCheck with ../sequenceCheck $faFile '$name', $!:exit status $?\n";
   if ($bad) {
      print $bfh "#$bad\n";
      return 0;
   }elsif ($verified) {
      return 2;
   }else {
      return 1;
   }
}
####End

sub addLink {
   my $id = shift;
   my $url = "http://srs.ebi.ac.uk/srsbin/cgi-bin/wgetz?-id+sessionId+-e+[ANDROGENR:'ARID']+-enum+1";
   $url =~ s/ARID/$id/;
   my $page = get($url);
   if ($page && $page !~ /error/i) {
      print $fh3 "ARdb_$id\tsrcLink\t$srcId\t$id\t\n";
   }
}
####End

#how to do links:
#http://srs.ebi.ac.uk/srsbin/cgi-bin/wgetz?-id+sessionId+-e+[ANDROGENR:'0002']+-enum+1
#not all entries (<= 0515)

