#!/usr/bin/perl
#
# gbAlignFinish [options] database ...
#
# Finish up the BLAT alignments: sort, combine, and install the PSL
# and associated files.
#
# Options:
#   -workdir=work/align - Must be the same as specified for gbAlignSetup.
#   -verbose - print details.
#   -keep - keep tmp files for debugging.
#   -noMigrate - Don't migrate alignments.
#
# Arguments:
#   - database - Database to process.
#
# Obtains other parameters from etc/genbank.conf
#
# $Id: gbAlignFinish,v 1.15 2006/03/26 19:18:16 markd Exp $
#
use strict;
use warnings;
use File::Basename;
use FindBin;
use lib "$FindBin::Bin/../lib";
use gbCommon;

# globals from command line
my $gKeep = 0;
my $gWorkDir = "work/align";
my $noMigrate = 0;

# Get the sorted, unique list of dir+type+accPrefix strings for one update
# work directory, e.g.:
#    work/align/genbank.132.0/hg13/full/est.aj
# The list is derived from the *.native.aligns and *.xeno.aligns files found
# in the directory, with the orgCat and ".aligns" parts of the name dropped.
sub getUpdWorkDirPrefixes($) {
    my($updWorkDir) = @_;
    my @alignsFiles = (glob("$updWorkDir/*.native.aligns"),
                       glob("$updWorkDir/*.xeno.aligns"));
    # native and xeno files of the same type collapse to a single entry
    my %seen;
    foreach my $alignsFile (@alignsFiles) {
        # basename() quotes its suffixes, so these are literal strings
        my $stem = basename($alignsFile, ".native.aligns", ".xeno.aligns");
        $seen{dirname($alignsFile) . "/" . $stem} = 1;
    }
    return sort(keys(%seen));
}

# Build the list of update dirs with type and accPrefixes for a database:
#    work/align/genbank.132.0/hg13/full/mrna
#    work/align/genbank.132.0/hg13/full/est.aj
# Within each release the full update is listed first, followed by the
# sorted dailies.  May return an empty list if nothing is being aligned
# for this database.
sub getDirsPrefixes($) {
    my($db) = @_;
    # genbank releases are scanned before refseq releases
    my @relDbWorkDirs = (glob("$gWorkDir/genbank.*/$db"),
                         glob("$gWorkDir/refseq.*/$db"));
    my @found = ();
    foreach my $relDbDir (@relDbWorkDirs) {
        # updates for this release, full first
        my @updDirs = (glob("$relDbDir/full"),
                       sort(glob("$relDbDir/daily.*")));
        foreach my $updDir (@updDirs) {
            push(@found, getUpdWorkDirPrefixes($updDir));
        }
    }
    return @found;
}

# Split an update directory plus file prefix (work dir or aligned dir) into
# its components.  For
#    work/align/genbank.132.0/hg13test/full/est.aj
# the result is (release, database, update, type, typePrefix) =
#    ("genbank.132.0", "hg13test", "full", "est", "est.aj")
sub parseDirPrefix($) {
    my($dirPrefix) = @_;

    # peel off the last four path levels, innermost first
    my @levels;
    my $path = $dirPrefix;
    foreach my $depth (1 .. 4) {
        unshift(@levels, basename($path));
        $path = dirname($path);
    }
    my($release, $database, $update, $typePrefix) = @levels;

    # type is the prefix up to the first dot
    (my $type = $typePrefix) =~ s/\..*$//;
    return ($release, $database, $update, $type, $typePrefix);
}

# Get the srcDb (the part before the first dot, e.g. "genbank") from a
# release name such as "genbank.132.0".  Dies if the release does not have
# at least two dot-separated parts.
sub getSrcDb($) {
    my ($release) = @_;

    my ($srcDb, @versionParts) = split(/\./, $release);
    if (scalar(@versionParts) == 0) {
        die("invalid release: $release");
    }
    return $srcDb;
}

# Get the type and orgCat from a type prefix.  For ESTs the prefix also
# contains the accession prefix, so the orgCat is the third field:
#    mrna.native   -> ("mrna", "native")
#    est.aj.xeno   -> ("est", "xeno")
# Dies if the prefix does not contain an orgCat field.
sub getTypeOrgCat($) {
    my ($typePrefix) = @_;
    my @typeList = split(/\./, $typePrefix);
    if ($#typeList < 1) {
        die("invalid typePrefix: $typePrefix");
    }
    my $type = $typeList[0];
    my $orgCat = ($type eq "est") ? $typeList[2] : $typeList[1];
    # an EST prefix with only two fields (e.g. "est.aj") passes the length
    # check above but has no orgCat; fail here rather than return undef
    if (!defined($orgCat)) {
        die("invalid typePrefix: $typePrefix");
    }
    return ($type, $orgCat);
}

# Look up the pslCDnaFilter arguments for a database, release and type
# prefix in the config file, using the key
#    <srcDb>.<type>.<orgCat>.pslCDnaFilter
# A configured value of "no" disables filtering, in which case undef is
# returned.  If "hapRegions" is configured for the database, a -hapRegions
# option is appended to the arguments.
sub getPslCDnaFilterArgs($$$) {
    my ($db, $release, $typePrefix) = @_;

    my $srcDb = getSrcDb($release);
    my($type, $orgCat) = getTypeOrgCat($typePrefix);

    my $filterArgs = getDbConf($db, "$srcDb.$type.$orgCat.pslCDnaFilter");
    return undef if ($filterArgs eq "no");

    my $hapRegions = getDbConfUndef($db, "hapRegions");
    $filterArgs = "$filterArgs -hapRegions=$hapRegions" if (defined($hapRegions));
    return $filterArgs;
}

# Combine the PSLs of one query directory (the results from one query file),
# appending them to the output PSL file, and record the file offsets of the
# start and end of the appended PSLs in the pslQueryOffsets hash (keyed by
# query directory).
#
# This is done as a single pipeline to avoid intermediate disk I/O.  The
# PSLs must be sorted by query for pslCDnaFilter; sort is used rather than
# pslSort, since pslSort can't read from a pipe (and sort is much faster).
# The gbBlat naming convention, where the query file is the first level of
# directory, groups the PSLs into small batches for pslCDnaFilter.  The
# output of pslCDnaFilter is needed again by selectWithPsl; instead of
# writing it twice, only the offsets into the output PSL are saved, and
# selectWithPsl uses them to load just that subset.
#
# If outRawPsl is defined, the sorted but unfiltered PSLs are also appended
# to it.  If pslCDnaFilterArgs is undef, no filtering is done.
sub combinePslSet($$$$$$) {
    my($queryDir, $pslCDnaFilterArgs, $tmpDir, $outPsl, $outRawPsl,
       $pslQueryOffsets) = @_;

    # offset where this set will start in the output
    my $startOff = 0;
    if (-e $outPsl) {
        $startOff = getFileSize($outPsl);
    }

    # pipeline stages are concatenated as-is; find is sorted by query
    my @stages = ("find $queryDir -name '*.psl' | xargs cat ",
                  "| sort -T $tmpDir -k 10,10 -k 12n,12n ");
    if (defined($outRawPsl)) {
        push(@stages, "| tee -a $outRawPsl ");
    }
    if (defined($pslCDnaFilterArgs)) {
        # ugly hack, derive the poly-A file name from queryDir by dropping
        # the psl/ directory level and adding the .polya extension
        (my $polyA = "$queryDir.polya") =~ s{/psl/}{/};
        $pslCDnaFilterArgs =~ s/-polyASizes/-polyASizes=$polyA/;
        push(@stages, "| pslCDnaFilter $pslCDnaFilterArgs stdin stdout ");
    }
    push(@stages, " >> $outPsl");
    runPipe(join("", @stages));

    # offset just past the PSLs that were added
    my $endOff = 0;
    if (-e $outPsl) {
        $endOff = getFileSize($outPsl);
    }

    # always save offsets, even if empty
    $pslQueryOffsets->{$queryDir} = [$startOff, $endOff];
}

# Combine the PSLs from all query directories of a type prefix into
# $updWorkDir/$typePrefix.psl, filtering with pslCDnaFilter if it is
# configured.  For filtered mRNAs the unfiltered PSLs are also saved in
# $updWorkDir/$typePrefix.rawPsl.  The offsets of each query directory's
# PSLs in the output are recorded in pslQueryOffsets.
#
# Returns (outPsl, outRawPsl); each is undef if that file was not produced
# or is empty.  The two are checked independently, since the filter may drop
# every alignment and leave a non-empty raw file with an empty filtered file.
sub combinePsls($$$$$$$@) {
    my($db, $release, $type, $typePrefix, $updWorkDir, $tmpDir,
       $pslQueryOffsets, @queryDirs) = @_;

    my $pslCDnaFilterArgs = getPslCDnaFilterArgs($db, $release, $typePrefix);

    # tmp file for combined PSLs
    my $outPsl = "$updWorkDir/$typePrefix.psl";
    if (-e $outPsl) {
        unlink($outPsl) || die("unlink $outPsl: $!");
    }

    # for mRNAs, we save the files before PSL filtering for experimental
    # work.  This is not done for ESTs, as there are so many.  Also not done
    # if we are not filtering, as this would be redundant.
    my $outRawPsl;
    if (($type eq "mrna") && defined($pslCDnaFilterArgs)) {
        $outRawPsl = "$updWorkDir/$typePrefix.rawPsl";
        if (-e $outRawPsl) {
            unlink($outRawPsl) || die("unlink $outRawPsl: $!");
        }
    }

    # Process each query dir in a pipeline with error checking, use find to
    # avoid possible command line overflow.  Sort by qName,qStart,qEnd for
    # pslCDnaFilter.
    foreach my $queryDir (@queryDirs) {
        combinePslSet($queryDir, $pslCDnaFilterArgs, $tmpDir, $outPsl, $outRawPsl,
                      $pslQueryOffsets);
    }

    # return pair of output files, or undef if empty; each file is checked
    # on its own size
    my $outPslResults = ((-e $outPsl) && (getFileSize($outPsl) > 0))
        ? $outPsl : undef;
    my $outRawPslResults = (defined($outRawPsl) && (-e $outRawPsl)
                            && (getFileSize($outRawPsl) > 0))
        ? $outRawPsl : undef;
    return ($outPslResults, $outRawPslResults);
}

# Get the start and end offsets recorded for a query directory in
# pslQueryOffsets.  Reports an error via gbError if none were recorded.
sub getPslQueryOffsets($$) {
    my($queryDir, $pslQueryOffsets) = @_;
    if (!defined(${$pslQueryOffsets}{$queryDir})) {
        gbError("BUG: no PSL offsets found for queryDir $queryDir");
    }
    return (${$pslQueryOffsets}{$queryDir}[0],
            ${$pslQueryOffsets}{$queryDir}[1]);
}

# Combine the orientation info files of one query directory.  If PSLs were
# saved for the query directory, the OI records matching those PSLs are
# appended to outOI.  If outRawOI is defined, all OI records are appended
# to it as well.  No meaningful value is returned.
sub combineOISet($$$$$) {
    my($queryDir, $outPsl, $pslQueryOffsets, $outOI, $outRawOI) = @_;

    my($startOff, $endOff) =  getPslQueryOffsets($queryDir, $pslQueryOffsets);
    if (($startOff == $endOff) && !defined($outRawOI)) {
        return; # nothing to do
    }
    my $cmd = "find $queryDir -name '*.oi' | xargs cat ";
    if ($startOff < $endOff) {
        # at least create selected OI
        if (defined($outRawOI)) {
            $cmd .= "| tee -a $outRawOI ";
        }
        $cmd .= "| selectWithPsl -fmt=oi -startOff=$startOff -endOff=$endOff $outPsl stdin stdout"
            . " >> $outOI";
    } else {
        # just create raw OI
        $cmd .= " >> $outRawOI";
    }
    runPipe($cmd);
}

# Combine orientation info.  Only called if there are PSLs, and hence there
# must be orientInfo files.  This processing parallels combinePsls.
#
# Returns (outOI, outRawOI); each is undef if that file was not produced or
# is empty.  outPsl may be undef when only raw PSLs were produced; in that
# case no query directory has saved PSLs and only the raw OI is built.
sub combineOIs($$$$$$@) {
    my($type, $typePrefix, $updWorkDir, $tmpDir, $outPsl, $pslQueryOffsets, @queryDirs) = @_;

    # Process each query dir in a pipeline, selecting the ones kept
    # by pslCDnaFilter.
    my $outOI = "$updWorkDir/$typePrefix.oi";
    if (-e $outOI) {
        unlink($outOI) || die("unlink $outOI: $!");
    }
    # for mRNAs, also save all alignments
    my $outRawOI;
    if ($type eq "mrna") {
        $outRawOI = "$updWorkDir/$typePrefix.rawOi";
        if (-e $outRawOI) {
            unlink($outRawOI) || die("unlink $outRawOI: $!");
        }
    }
    foreach my $queryDir (@queryDirs) {
        combineOISet($queryDir, $outPsl, $pslQueryOffsets, $outOI, $outRawOI);
    }

    # return pair of output files, or undef if empty.  The selected OI file
    # is only created when some query dir had saved PSLs, so check that it
    # exists, and check each file on its own size.
    my $outOIResults = ((-e $outOI) && (getFileSize($outOI) > 0))
        ? $outOI : undef;
    my $outRawOIResults = (defined($outRawOI) && (-e $outRawOI)
                           && (getFileSize($outRawOI) > 0))
        ? $outRawOI : undef;
    return ($outOIResults, $outRawOIResults);
}

# Combine the intron PSLs of one query directory, appending those matching
# the PSLs saved for the query directory to outIntronPsl.  Nothing is done
# if no PSLs were saved for the query directory.
sub combineIntronPslSet($$$$) {
    my($queryDir, $outPsl, $pslQueryOffsets, $outIntronPsl) = @_;

    my($startOff, $endOff) = getPslQueryOffsets($queryDir, $pslQueryOffsets);
    return if ($startOff == $endOff);  # no PSLs were saved

    my @stages = ("find $queryDir -name '*.intronPsl' | xargs cat ",
                  "| selectWithPsl -fmt=psl -startOff=$startOff -endOff=$endOff $outPsl stdin stdout",
                  " >> $outIntronPsl");
    runPipe(join("", @stages));
}

# Combine intronPsl files into $updWorkDir/$typePrefix.intronPsl.  This
# processing parallels combinePsls, however intronPsl files only exist for
# native ESTs and not all EST psl will have intronPsl files.  Returns the
# output file name, or undef if the output is empty.
sub combineIntronPsls($$$$$$@) {
    my($db, $typePrefix, $updWorkDir, $tmpDir, $outPsl, $pslQueryOffsets, @queryDirs) = @_;

    # start from a clean output file
    my $outIntronPsl = "$updWorkDir/$typePrefix.intronPsl";
    if (-e $outIntronPsl) {
        unlink($outIntronPsl) || die("unlink $outIntronPsl");
    }

    # Run one pipeline per query dir, keeping the intron PSLs that match
    # the alignments kept by pslCDnaFilter.
    foreach my $queryDir (@queryDirs) {
        combineIntronPslSet($queryDir, $outPsl, $pslQueryOffsets,
                            $outIntronPsl);
    }

    # output file, or undef if empty
    return (getFileSize($outIntronPsl) > 0) ? $outIntronPsl : undef;
}

# installRawPsls is defined further down; declare it here so its prototype
# is known at the call below (avoids the "called too early to check
# prototype" warning).
sub installRawPsls($$$);

# Do the work of combining and installing the PSLs, etc. for a
# release+update+typeAccPrefix+orgCat.  Skipped if the orgCat was not
# aligned (no .aligns marker file in the work directory) or has already
# been completed (the .alidx file exists in the aligned directory).
#
# Steps:
#   - combine the PSLs of all query directories, and for native alignments
#     the orientation info and (for ESTs) the intron PSLs
#   - install the raw PSL/OI files, if any
#   - run gbAlignInstall
#   - remove the combined tmp files unless -keep was specified
sub finishOrgCatAligns($$$$) {
    my($db, $dirPrefix, $orgCat, $tmpDir) = @_;
    my($release, $database, $update, $type, $typeAccPrefix)
        = parseDirPrefix($dirPrefix);
    my $typePrefix = "$typeAccPrefix.$orgCat";
    my $updWorkDir = "$gWorkDir/$release/$database/$update";
    my $markFile = "$updWorkDir/$typePrefix.aligns";
    if (! -e $markFile) {
        return;  # nothing to do
    }
    # check if already completed
    my $alignDir = "data/aligned/$release/$database/$update";
    my $alidxBase = "$alignDir/$typePrefix";

    if (-e "$alidxBase.alidx") {
        if ($gbCommon::verbose > 1) {
            print STDERR "finishAligns: $alidxBase.alidx exists, skipping,\n";
        }
        return;
    }

    my ($outPsl, $outRawPsl, $outOI, $outRawOI, $outIntronPsl, %pslQueryOffset);
    my @queryDirs = glob("$updWorkDir/psl/$typePrefix.*");
    if ($#queryDirs >= 0) {
        ($outPsl, $outRawPsl)
            = combinePsls($db, $release, $type, $typePrefix,
                          $updWorkDir, $tmpDir, \%pslQueryOffset,
                          @queryDirs);

        # build orient info and intron psl if we have PSLs and are native.
        if ((defined($outPsl) || defined($outRawPsl))
            && ($orgCat eq "native")) {
            ($outOI, $outRawOI)
                = combineOIs($type, $typePrefix, $updWorkDir,
                             $tmpDir, $outPsl, \%pslQueryOffset,
                             @queryDirs);
            if ($type eq "est") {
                $outIntronPsl = combineIntronPsls($db, $typePrefix,
                                                  $updWorkDir, $tmpDir,
                                                  $outPsl, \%pslQueryOffset,
                                                  @queryDirs);
            }
        }
    }

    # save raw PSLs first, as the index file created by gbAlignInstall
    # is flag of successful completion
    if (defined($outRawPsl)) {
        installRawPsls($outRawPsl, $outRawOI, $alignDir);
    }

    # copy psl, migrate alignments, and build index.  This is done even
    # if output files were not produced, as the index lists unaligned seqs.
    my $cmd = "gbAlignInstall ";
    if ($noMigrate) {
        $cmd .= " -noMigrate";
    }
    $cmd .= " -workdir=$gWorkDir -orgCats=$orgCat $release $update $typeAccPrefix $database";
    runProg($cmd);

    # best-effort cleanup of the combined files; failures are ignored
    if (!$gKeep) {
        foreach my $f ($outPsl, $outRawPsl, $outOI, $outRawOI, $outIntronPsl) {
            if (defined($f)) {
                unlink($f);
            }
        }
    }
}

# Install the raw PSL file and, if defined, the raw OI file into alignDir
# as gzip-compressed files.  Each file is first compressed to a .tmp name;
# the .tmp files are renamed to their final names only after all of the
# compression has been done.
sub installRawPsls($$$) {
    my($outRawPsl, $outRawOI, $alignDir) = @_;

    # [source, installed name] pairs, PSL first
    my @installs = ([$outRawPsl, "$alignDir/" . basename($outRawPsl) . ".gz"]);
    if (defined($outRawOI)) {
        push(@installs, [$outRawOI, "$alignDir/" . basename($outRawOI) . ".gz"]);
    }

    makeDir($alignDir);
    foreach my $install (@installs) {
        my($raw, $inst) = @{$install};
        runProg("gzip -1c $raw >$inst.tmp");
    }
    foreach my $install (@installs) {
        my($raw, $inst) = @{$install};
        renameFile("$inst.tmp", $inst);
    }
}

# Process a release+update+typeAccPrefix: combine the cluster-generated
# files, migrate alignments, and build the index file, first for the native
# and then for the xeno alignments.  A tmp directory is created for the
# processing and removed afterwards unless -keep was specified.
sub finishAligns($$) {
    my($db, $dirPrefix) = @_;
    # fields: release, database, update, type, typePrefix
    my @parsed = parseDirPrefix($dirPrefix);
    if ($gbCommon::verbose) {
        print STDERR "finishAligns: $parsed[0] $parsed[1] $parsed[2] $parsed[4]\n";
    }
    my $tmpDir = getTmpDir("gbAlign");

    foreach my $orgCat ("native", "xeno") {
        finishOrgCatAligns($db, $dirPrefix, $orgCat, $tmpDir);
    }

    if (!$gKeep) {
        removeDir($tmpDir);
    } else {
        print STDERR "keeping tmp directory $tmpDir\n";
    }
}

# Verify that all expected PSLs exist.  The file align.expected in the work
# directory lists one PSL path per line, relative to the work directory.
# Every missing PSL is reported on stderr; dies if any are missing or if
# the list can't be read.
sub verifyExpectedPsls() {
    my $expected = "$gWorkDir/align.expected";
    # three-argument open with a lexical handle, so the file name can't be
    # interpreted as an open mode and the handle doesn't leak globally
    open(my $expectedFh, "<", $expected) || die("can't open $expected: $!");
    my $missingCnt = 0;
    while (my $relPsl = <$expectedFh>) {
        chomp($relPsl);
        my $psl = "$gWorkDir/$relPsl";
        if (! -e $psl) {
            print STDERR "Error: missing PSL: $psl\n";
            $missingCnt++;
        }
    }
    close($expectedFh) || die("close failed on $expected: $!");
    if ($missingCnt > 0) {
        die("$missingCnt missing PSLs");
    }
}

# Entry
setTaskName("gbAlignFinish");

# parse leading options into the globals used in the program
while ((scalar(@ARGV) > 0) && ($ARGV[0] =~ /^-.*/)) {
    my $opt = shift(@ARGV);
    if ($opt =~ /^-workdir($|=)/) {
        $gWorkDir = parseOptEq($opt);
    } elsif ($opt eq "-verbose") {
        $gbCommon::verbose = 1;
    } elsif ($opt =~ /^-verbose=/) {
        $gbCommon::verbose = parseOptEq($opt);
    } elsif ($opt eq "-keep") {
        $gKeep = 1;
    } elsif ($opt eq "-noMigrate") {
        $noMigrate = 1;
    } else {
        gbError("invalid option \"$opt\"");
    }
}
if (scalar(@ARGV) == 0) {
    gbError("wrong # args: gbAlignFinish [-workdir=dir] [-verbose] [-keep] [-noMigrate] database ...");
}

# all remaining arguments are databases; @ARGV is left empty
my @databases = splice(@ARGV);

# check that the empty done file was created
my $doneFlag = "$gWorkDir/align.done";
if (! -e $doneFlag) {
    gbError("alignment appears not to have completed, $doneFlag does not exist");
}

verifyExpectedPsls();

# process each update and type for each database
foreach my $db (@databases) {
    foreach my $dirPrefix (getDirsPrefixes($db)) {
        finishAligns($db, $dirPrefix);
    }
}
