#!/bin/env perl

# DO NOT EDIT the /cluster/bin/scripts copy of this file -- 
# edit ~/kent/src/hg/utils/automation/blastz-run-ucsc instead.

# Based on Scott Schwartz's bash script blastz-run;
# rewritten in perl and extended to handle .2bit inputs and 
# multi-record fasta for target as well as query by Angie Hinrichs.

# $Id: blastz-run-ucsc,v 1.11 2009/08/13 23:15:57 hiram Exp $

use Getopt::Long;
use strict;
use warnings;

use vars qw/
    $opt_outFormat
    $opt_gz
    $opt_dropSelf
    $opt_help
    /;

sub usage {
  my ($status) = @_;
  my $base = $0;
  $base =~ s/^(.*\/)?//;
    print STDERR "
usage: $base target query DEF out
options:
    -outFormat (psl|axt)  Convert blastz output from lav to psl or axt.
    -gz                   Compress output with gzip.
    -dropSelf             When converting to psl or axt, drop alignment 
                          blocks that cross the diagonal (trivial self-
                          alignments).  
    Unpacks sequence (if necessary), runs blastz and lifts/\"normalizes\" 
    output (if necessary).  
    Target and query can be list files containing one sequence specifier 
    per line, or sequence specifiers.  A sequence specifier is a .2bit, 
    .nib or .fa(.gz) files, possibly with a sequence and range specifier 
    appended.  
    DEF is a Scott Schwartz-style bash script containing blastz parameters.  
    out is the (lifted/\"normalized\") output of blastz (optionally converted 
    to axt or psl, and/or gzipped).  This script does not pay attention to 
    out's file suffixes -- it is up to you to make sure that they are 
    consistent with output format and compression options.
\n";
  exit $status;
}

sub checkOptions {
  # Make sure command line options are valid/supported.
  my $ok = GetOptions("outFormat=s",
		      "gz",
		      "dropSelf",
		      "help");
  &usage(1) if (!$ok);
  &usage(0) if ($opt_help);
  if ($opt_outFormat) {
    $opt_outFormat =~ tr/A-Z/a-z/;
    if ($opt_outFormat eq 'lav') {
      $opt_outFormat = undef;
    } elsif ($opt_outFormat !~ /^(psl|axt)$/) {
      print STDERR "\nUnsupported -outFormat $opt_outFormat.\n";
      &usage(1);
    }
  }
  if ($opt_dropSelf && !($opt_outFormat && $opt_outFormat =~ /^(psl|axt)$/)) {
    print STDERR "\n-dropSelf is only supported for -outFormat axt or psl.\n";
    &usage(1);
  }
}

# my $machtype = $ENV{'MACHTYPE'};
my $machtype = "x86_64";
# This hash will contain the params defined in the Scott-style DEF script:
my %defVars = ();
# Temporary local directory for intermediate files:
my $TMP;
# For lifting, we'll need to know the sequence sizes:
my %tSizes = ();
my %qSizes = ();
my $machType = `uname -m`;
chomp $machType;
$machType = "i386" if ($machType eq "i686");
my $defaultPath = ("/cluster/bin/penn/$machType:/cluster/bin/penn:" .
		   "/cluster/bin/scripts:/cluster/bin/$machType:" .
		   '/bin:/usr/bin');

sub cleanDie {
  # Clean up $TMP (if it has been created) and then die.
  my ($msg) = @_;
  if ($TMP && -d $TMP) {
    system("/bin/sh", "-c", "/bin/rm -rf $TMP");
  }
  die $msg;
}

sub run {
  # Run a command in sh, with PATH specified in %defVars.
  my ($cmd) = @_;
  my $setPath = "export PATH=";
  $setPath .= "$defVars{PATH}:" if (defined $defVars{'PATH'});
  $setPath .= "$defaultPath; ";
  system("/bin/sh", "-c", $setPath . $cmd) == 0 ||
    &cleanDie("Command failed:\n$setPath\n$cmd\n");
}

sub nfsNoodge {
  # Try to access the given file/dir; if it fails, don't die, just 
  # wait a second, try again.  
  # Scott's blastz-run included a similar trick.  
  my ($file) = @_;
  my $ret = system("/bin/sh", "-c", "/bin/ls -d $file > /dev/null");
  if ($ret != 0) {
    sleep 1;
    $ret = system("/bin/sh", "-c", "/bin/ls -d $file > /dev/null");
    sleep 1 if ($ret != 0);
  }
}

sub loadDef {
  # Read parameters from a bash script with Scott's param variable names:
  my ($def) = @_;
  &nfsNoodge($def);
  open(DEF, "$def") || &cleanDie("Can't open def file $def: $!\n");
  while (<DEF>) {
    s/^\s*export\s+//;
    next if (/^\s*#/ || /^\s*$/);
    if (/(\w+)\s*=\s*(.*)/) {
      my ($var, $val) = ($1, $2);
      while ($val =~ /\$(\w+)/) {
	my $subst = $defVars{$1};
	$val =~ s/\$$1/$subst/ if (defined $subst);
      }
      $defVars{$var} = $val;
    }
  }
  close(DEF);
}

sub loadSeqSizes {
  # Load up sequence -> size mapping from $sizeFile into $hashRef.
  my ($sizeFile, $hashRef) = @_;
  &nfsNoodge($sizeFile);
  open(SIZE, "$sizeFile") || &cleanDie("Can't open size file $sizeFile: $!\n");
  while (<SIZE>) {
    chomp;
    my ($seq, $size) = split;
    $hashRef->{$seq} = $size;
  }
  close(SIZE);
}

sub fileBase {
  # Get rid of leading path and .suffix (and .gz too if it's there).
  my ($path) = @_;
  $path =~ s/^(.*\/)?(\S+)\.\w+(\.gz)?/$2/;
  return $path;
}

sub parseFileSpec {
  # Take a file spec with possible seq name, start and end specifiers and 
  # split into components.
  my ($info, $dir) = @_;
  my ($fName, $seq, $start, $end);
  if ($info =~ /^(\S+):(\S+):(\d+)-(\d+)$/) {
    ($fName, $seq, $start, $end) = ($1, $2, $3, $4);
  } elsif ($info =~ /^(\S+):(\d+)-(\d+)$/) {
    ($fName, $seq, $start, $end) = ($1, &fileBase($info), $2, $3);
  } else {
    ($fName, $seq, $start, $end) = ($info, &fileBase($info), 0, 0);
  }
  $fName = "$dir/$fName" if ($fName !~ /^[\/.]/);
  return ($fName, $seq, $start, $end);
}

sub enumerateFileSpecs {
  # Given a sequence file name, sequence file name + offset spec, or 
  # a name of a file containing a list of such file specs, return a 
  # list of file specs.
  my ($listOrSpec, $seqDir) = @_;
  my @specs = ();
  my ($fName, $seq, $start, $end) = &parseFileSpec($listOrSpec, $seqDir);
  if ($fName =~ /\.(fa(\.gz)?|nib|2bit)$/) {
    # It appears to be a single spec not a list.
    push @specs, $listOrSpec;
  } else {
    # It doesn't have a recognized file ending -- it had better be a list.
    # Assume each line contains a single spec.
    &nfsNoodge($listOrSpec);
    open(LST, "$listOrSpec")
      || &cleanDie("Couldn't open list file $listOrSpec: $!\n");
    while (<LST>) {
      chomp;
      push @specs, $_;
    }
    close(LST);
  }
  return @specs;
}

sub collapseFileSpecs {
  # If we get a big old list of query sequences that are not subranged, 
  # then cat them all into one big .fa file so we can invoke blastz 
  # only once, and don't have to lift afterwards.
  my @fileSpecs = @_;
  my $collapsed = 0;
  if (scalar(@fileSpecs) == 1) {
    # Only one spec in the list ==> no collapsing to do.
    return ($collapsed, @fileSpecs);
  }
  # Now see if there are any specs that look like subranges:
  foreach my $spec (@fileSpecs) {
    my ($fname, $seq, $start, $end) = &parseFileSpec($spec, ".");
    if ($start > 0) {
      # Found a subrange; no collapsing.
      return ($collapsed, @fileSpecs);
    }
  }
  # OK, we're in the clear to concatenate all of these complete sequences 
  # onto one local file.
  my $big = "$TMP/bigCollapse.fa";
  # avoiding exec of shell here instead of going through &run
  # and making an empty new file $big
  &run("/bin/cp /dev/null $big");
  foreach my $spec (@fileSpecs) {
    my ($seq, $local) = unpackForBlastz($spec, '.', 'tmp');
    $local =~ s/\[\S+\]$//;
    if ($local =~ /\.nib$/) {
      my $nib = $local;
      $local =~ s/\.nib/.fa/;
      $local =~ s/(.*\/)/tmp./;
      my $size = `nibSize $nib | awk '{print \$3;}'`;
      chomp $size;
      &run("nibFrag -masked $nib 0 $size + $local");
    }
    &run("/bin/cat $local >> $big");
    &run("/bin/rm -f $local") if ($local =~ /^$TMP/);
  }
  $collapsed = 1;
  @fileSpecs = ($big);
  return ($collapsed, @fileSpecs);
}

sub getBlastzParams {
  # Look for BLASTZ_? param specs in %defVars; turn into blastz command line 
  # option string.  (blastz command line option names are all 1-character, 
  # and all 1-char BLASTZ_? DEF params are blastz command line options.)
  my $params = " ";
  foreach my $p (keys %defVars) {
    if ($p =~ /^BLASTZ_(\w)$/) {
      $params .= "$1=$defVars{$p} ";
    }
  }
  return $params;
}

sub blastzRangeSpec {
  # Return a blastz-style range spec unless a null range is passed in.
  my ($start, $end) = @_;
  if ($end != 0) {
    $start += 1;
    return "[$start,$end]";
  } else {
    return "";
  }
}

sub unpackForBlastz {
  # If the sequence file is .fa.gz or .2bit, uncompress/extract fa into 
  # a local file;  if it's something that blastz already supports like 
  # .fa or .nib, translate the offset spec (if any) into blastz's format.
  my ($spec, $seqDir, $prefix) = @_;
  my ($fname, $seq, $start, $end) = &parseFileSpec($spec, $seqDir);
  my $local = "$TMP/$prefix.$seq.$start.$end.fa";
  if ($fname =~ /\.2bit$/) {
    # Don't use twoBitToFa's range extraction capabilities -- that 
    # confuses blastz/normalizeLav.  Just extract the whole thing as 
    # fa and tell blastz what range we're using:
    &run("/cluster/bin/$machtype/twoBitToFa $fname:$seq $local");
    $local .= &blastzRangeSpec($start, $end);
  } elsif ($fname =~ /\.fa(\.gz)?$/) {
    if ($1) {
      &run("/bin/gunzip -c $fname > $local");
    } else {
      $local = $fname;
    }
    $local .= &blastzRangeSpec($start, $end);
  } elsif ($fname =~ /\.nib$/) {
    $local = $fname;
    $local .= &blastzRangeSpec($start, $end);
  } else {
    &cleanDie("Don't know what type of sequence file this is: $fname");
  }
  return ($seq, $local);
}

sub parseBlastzSpec {
  # Extract info back out of blastz spec.  Leave start 1-based.
  my ($spec) = @_;
  my ($fname, $seq, $start, $end);
  if ($spec =~ /^(\S*\/)?(\S+)\.(fa|nib)(\[(\d+),(\d+)\])?$/) {
    $fname = "$1$2.$3";
    $seq = $2;
    if ($4) {
      $start = $5;
      $end = $6;
    } else {
      $start = 0;
      $end = 0;
    }
  } else {
    &cleanDie("Can't parse this blastz spec: $spec");
  }
  return ($fname, $seq, $start, $end);
}

sub emptyLav {
  #  satisify later processing steps with an empty lav result
  my ($raw) = @_;
  open (FH,">$raw") || &cleanDie("emptyLav: can not write to $raw $!\n");
  print FH <<_EOF_
#:lav
d {
  "blastz target query H=2000
     A    C    G    T
    91 -114  -31 -123
  -114  100 -125  -31
   -31 -125  100 -114
  -123  -31 -114   91
  O = 400, E = 30"
}
m {
  n 0
}
#:eof
_EOF_
  ;
  close (FH);
}

sub checkSubRange {
# given an fa or nib file name: file.fa|nib[one_based_start,end]
# verify that bit of sequence has usable sequence for blastz
  my ($faNibFile) = @_;
  my ($file, $seq, $one_start, $end) = &parseBlastzSpec($faNibFile);
  my $start = $one_start - 1;  # 1 based becomes 0 based coord
  my $answer = 0;  # default answer is usable sequence exists
  return($answer) if ((0 == $end) || (($end - $start) > 10000));
  if ($file =~ m/.nib$/) {
    $answer =
      `nibFrag -masked $file $start $end "+" stdout | faSize stdin | grep " 0 upper " | wc -l`;
  } else {
    $answer =
      `faFrag -mixed $file $start $end stdout | faSize stdin | grep " 0 upper " | wc -l`;
  }
  return($answer);
}

sub plainBlastz {
  # just run blastz -- target and query can have "[$start,$end]" specifiers
  # where $start is 1-based.  Put output in file name $raw.
  my ($target, $query, $options, $raw) = @_;
  my $answer = 1;  # default answer is that blastz was run OK
  my $blastz = $defVars{'BLASTZ'} || "lastz";
  #  these variables will be 1 if there is zero usable sequence
  #  this could be a more extensive check than just for zero.
  my $targetNoSequence = 0;
  if ($target =~ m/\[/) {
    $targetNoSequence = &checkSubRange($target);
  } else {
    $targetNoSequence =
	`cat $target | faSize stdin | grep " 0 upper " | wc -l`;
  }
  chomp $targetNoSequence;
  my $queryNoSequence = 0;
  if ($query =~ m/\[/) {
    $queryNoSequence = &checkSubRange($query);
  } else {
    $queryNoSequence =
	`cat $query | faSize stdin | grep " 0 upper " | wc -l`;
  }
  chomp $queryNoSequence;
  if ($targetNoSequence || $queryNoSequence) {
    &emptyLav($raw);
    $answer = 0;  # not really running blastz
  } else {
    &run("$blastz $target $query $options > $raw");
  }
  return($answer);
}

sub selectRpts {
  # This used to be an awk, piped to select_rpts, piped to sort -n.  
  # select_rpts is a little perl script, and getting the DEF PATH into the 
  # pipe command seems more trouble than it's worth -- so inline not only 
  # the awk but also the select_rpts here.  Still pipe to sort -n just in case.
  my ($allRepFile, $start, $end, $selRepFile) = @_;
  &nfsNoodge($allRepFile);
  open(ALL, "<$allRepFile") || &cleanDie("Can't open $allRepFile: $!\n");
  open(SEL, "| sort -n > $selRepFile") 
    || &cleanDie("Can't open pipe to sort -n > $selRepFile: $!\n");
  print SEL "\%:repeats\n";
  while (<ALL>) {
    if (/^\s*\d+/) {
      my @words = split;
      my ($repStart, $repEnd) = ($words[5], $words[6]);
      next if ((! defined $repStart) || (! defined $repEnd));
      $repStart += 1;
      next if ($repStart > $repEnd);
      next if (($repEnd < $start) || ($repStart > $end));
      $repStart = $start if ($repStart < $start);
      $repEnd   = $end   if ($repEnd   > $end);
      $repStart -= ($start - 1);
      $repEnd   -= ($start - 1);
      print SEL "$repStart $repEnd rpt\n";
    }
  }
  close(SEL);
  close(ALL);
}

sub abridgeLinSpecReps {
  # Before running blastz, snip out lineage-specific repeats from each 
  # sequence.  Then run blastz only on that strand.  After running blastz, 
  # fix up the alignment coords to reflect the original sequence including 
  # the lineage-specific repeats.
  # Currently this only works on per-chromosome sequence and repeat files.
  my ($tSpec, $qSpec, $blastzOptions, $raw) = @_;
  my ($tFname, $tSeq, $tStart, $tEnd) = &parseBlastzSpec($tSpec);
  my ($qFname, $qSeq, $qStart, $qEnd) = &parseBlastzSpec($qSpec);

  if ($tEnd == 0 || $qEnd == 0) {
    &cleanDie("When abridging repeats, sequence filenames must be followed " .
	      "by range specs even if the whole sequence is aligned.\n");
  }

  my $S1X = "$defVars{SEQ1_SMSK}/$tSeq.out.spec";
  my $S2X = "$defVars{SEQ2_SMSK}/$qSeq.out.spec";
  &cleanDie("$S1X not found") if (! -f $S1X);
  &cleanDie("$S2X not found") if (! -f $S2X);

  my $TS1X = "$TMP/s1.rpts";
  my $TS2X = "$TMP/s2.rpts";
  my $TS1 = "$TMP/s1.fa";
  my $TS2 = "$TMP/s2.fa";
  my $TS1S = "$TMP/s1.strip.fa";
  my $TS2S = "$TMP/s2.strip.fa";
  my $TS2R = "$TMP/s2.rev.fa";
  my $TS2SR = "$TMP/s2.strip.rev.fa";
  my $TZF = "$TMP/fwd.lav";
  my $TZR = "$TMP/rev.lav";
  my $TZS = "$TMP/std.lav";

  # Get repeats in range:
  &selectRpts($S1X, $tStart, $tEnd, $TS1X);
  &selectRpts($S2X, $qStart, $qEnd, $TS2X);

  # Get sequence in range:
  &run("fasta-subseq $tFname $tStart $tEnd > $TS1");
  &run("fasta-subseq $qFname $qStart $qEnd > $TS2");

  # Snip out the repeats:
  &run("strip_rpts $TS1 $TS1X > $TS1S");
  &run("strip_rpts $TS2 $TS2X > $TS2S");

  # Run blastz on query forward strand and restore repeats:
  my $blastzOK = &plainBlastz($TS1S, $TS2S, "B=0" . $blastzOptions, $TZF);
  if ($blastzOK ) {
      &run("restore_rpts $TZF " .
            "$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " .
            "$TS2 \"\\\"$qFname\\\" $qStart $qEnd\" " .
            "$TS1X $TS2X > $raw");
  } else {
    &run("/bin/cp -p $TZF $raw");
  }
  # Run blastz on query reverse strand and restore repeats:
  &run("revcomp $TS2S > $TS2SR");
  $blastzOK = &plainBlastz($TS1S, $TS2SR, "B=0" . $blastzOptions, $TZR);
  if ($blastzOK ) {
      &run("restore_rpts $TZR " .
            "$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " .
            "$TS2 \"\\\"$qFname-\\\" $qStart $qEnd\" " .
            "$TS1X $TS2X reverse >> $raw");
  } else {
    &run("/bin/cat $TZF >> $raw");
  }
}

sub liftLav {
  # Run blastz-normalizeLav to lift up chunk coords to sequence level.
  my ($raw, $out, $tSeq, $qSeq) = @_;
  my $tLen = $tSizes{$tSeq};
  my $qLen = $qSeq ? $qSizes{$qSeq} : "0";
  &run("blastz-normalizeLav $tLen $qLen < $raw > $out");
}

sub convertOutput {
  # Convert lav file to psl or axt file, optionally dropping trivial self al's.
  my ($lav, $out) = @_;
  my $tSeq = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
  my $qSeq = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
  my $tSizes = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'};
  my $qSizes = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'};
  if ($opt_outFormat eq 'axt') {
    my $dropSelf = ($opt_dropSelf ? "-dropSelf " : "");
    &run("lavToAxt $dropSelf $lav $tSeq $qSeq $out");
  } elsif ($opt_dropSelf) {
    # lavToPsl doesn't have lavToAxt's -dropSelf functionality, so pipe 
    # lavToAxt -dropSelf to axtToPsl.
    &run("lavToAxt -dropSelf $lav $tSeq $qSeq stdout " .
	 "| axtToPsl stdin $tSizes $qSizes $out");
  } else {
    &run("lavToPsl $lav $out");
  }
}


#########################################################################
#
# -- main --

&checkOptions();

&usage(1) if (scalar(@ARGV) != 4);
my ($target, $query, $DEF, $out) = @ARGV;

# It is OK to have a previous result existing
if ( -f "$out") {
   exit 0;
}

&loadDef($DEF);
my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
my $seq1Len = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'};
my $seq2Len = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'};
&loadSeqSizes($seq1Len, \%tSizes);
&loadSeqSizes($seq2Len, \%qSizes);

# Make a temporary directory for intermediate files:
if (exists($defVars{'TMPDIR'})) {
  $TMP = `mktemp -d $defVars{'TMPDIR'}/blastz.XXXXXX` || &cleanDie("Can't mktemp");
} else {
  $TMP = `mktemp -d /tmp/blastz.XXXXXX` || &cleanDie("Can't mktemp");
}
chomp $TMP;
print STDERR "temp files in $TMP -- will attempt cleanup before die-ing.\n";

# Some internal defaults for blastz params, carried over from 
# Scott's blastz-run:
$defVars{'BLASTZ_H'} = 2000 if (! defined $defVars{'BLASTZ_H'});

my $blastzOptions = &getBlastzParams(%defVars);

my @tFileSpecs = &enumerateFileSpecs($target, $seq1Dir);
my @qFileSpecs = &enumerateFileSpecs($query, $seq2Dir);
if ($defVars{'BLASTZ_ABRIDGE_REPEATS'} && scalar(@qFileSpecs) > 1) {
  &cleanDie("Sorry, DEF can't have BLASTZ_ABRIDGE_REPEATS set if a list of " .
	    "query sequences is passed in -- not supported.\n");
}
my $collapsed;
($collapsed, @qFileSpecs) = &collapseFileSpecs(@qFileSpecs);

# Local file to hold concatenated results of all {target, query} runs:
my $localOut = "$TMP/local.lav";
&run("/bin/cp /dev/null $localOut");
# Local temporary files that are overwritten by each {target, query} run:
my $littleRaw = "$TMP/little.raw";
my $littleOut = "$TMP/little.lav";
my $littleConv = "$TMP/little.conv";

# Run blastz on each {target, query} pair;  lift and collect results.
foreach my $tSpec (@tFileSpecs) {
  my ($tSeq, $tLocal) = &unpackForBlastz($tSpec, $seq1Dir, 't');
  foreach my $qSpec (@qFileSpecs) {
    my ($qSeq, $qLocal) = &unpackForBlastz($qSpec, $seq2Dir, 'q');
    if ($defVars{'BLASTZ_ABRIDGE_REPEATS'}) {
      &abridgeLinSpecReps($tLocal, $qLocal, $blastzOptions, $littleRaw);
    } else {
      &plainBlastz($tLocal, $qLocal, $blastzOptions, $littleRaw);
    }
    if ($collapsed) {
      # Lift target side only:
      &liftLav($littleRaw, $littleOut, $tSeq, undef);
    } else {
      &liftLav($littleRaw, $littleOut, $tSeq, $qSeq);
      if ($qLocal =~ /^$TMP/) {
	  $qLocal =~ s/\[.*\]$//;
	  &run("/bin/rm $qLocal");
      }
    }
    if ($opt_outFormat) {
      &convertOutput($littleOut, $littleConv);
      &run("/bin/cat $littleConv >> $localOut");
    } else {
      &run("/bin/cat $littleOut >> $localOut");
    }
  }
  if ($tLocal =~ /^$TMP/) {
      $tLocal =~ s/\[.*\]$//;
      &run("/bin/rm $tLocal");
  }
}
if ($opt_gz) {
  &run("/bin/gzip $localOut");
  $localOut .= ".gz";
}
# Move collected results to final location.
&run("/bin/mv $localOut $out");

# Clean up temporary directory:
&run("/bin/rm -rf $TMP");

