#!/usr/bin/env tclsh

#
# Run blat, polyInfo and pslIntronsOnly.
#
# gbBlat [options] jobSpecFile out.psl
#
# Options:
#  -keep - keep temporary files.
#  -verbose - verbose debugging
#  -overwrite - overwrite existing output (for debugging)
#  -tmpDir - use this tmpdir (for debugging).
#  
#
# Arguments:
#  - jobSpecFile - path to a file describing the set of alignments to perform.
#    The format is described below.
#  - out.psl - PSL output file  The other output filenames are derived
#    by removing the .psl from the name.  If no sequences align, an
#    empty psl is still created to server as a job-completed marker.
#
# Job specification file:
#   Format are lines of key words and values. Blank lines and lines where
#   the first non-blank character is '#' are ignored.  A line ending in
#   a '\' is continued to the next line.  Records are in the form
#      key<whitespace+>value
#   A job spec defines a set of queries and target to align against each
#   other, as well as a set of parameters that control the alignment.
#
#   The following keys are recognized:
#     o type - alignment type, two value, first is the orgCat (native or xeno),
#       second is the cDNA type (mrna or est)
#     o ooc - blat ooc file to use (optional).
#     o tdb - target database, this is either a NIB directory or a twoBit file.
#     o tseq - white-space separated list of target sequences, which are either
#       a sequence id (chr1), or a sequence size and subrange
#       (chr1:size:start-end).
#     o qdb - white-space seperated list of query fasta files.  All sequences
#       in the files will be queried.
#     o preFilterOpts - pslCDnaFilter options to use for prefiltering.  This
#       should only include the non-comparitive options.  The -polyASizes
#       option should not have an argument, it will be added.
#
# PSL files are written to a local tmp directory and then copied to the output
# directory to reduce NFS impact.  If no sequences align, an empty psl file
# is still produced.  This script is very careful to exit non-zero if an error
# occurs.



# written in Tcl because bash version was getting hideously hard and wanted
# a language with named fuction parameters, ya known, like in fortran.
#
# $Id: gbBlat,v 1.70 2010/04/10 18:09:13 markd Exp $
#

# Tools are normally taken from /cluster/bin, however we allow them to be
# overridden by puting them under gbRoot/bin/$ARCH.  Use path to this script to
# find bin dir.
if {$tcl_platform(machine) == "i686"} {
    set arch i386
} else {
    set arch $tcl_platform(machine)
}

# include i386 as a fall back on x86_64 platforms
set gbBinDir [file dir [info script]]
set env(PATH) "$gbBinDir/$arch:$gbBinDir/i386:/cluster/bin/$arch:$env(PATH)"

puts $env(PATH)

# FXIME: tmp workaround for libmysqlclient.so.* not being on cluster
set env(LD_LIBRARY_PATH) [file dir [file dir [info script]]]/lib/i386

# use to record last command executed so that a shorter, more useful error
# message can be generated on child process error
set lastExec {}

# lassign list varname ?varname..?
proc lassign {vals args} {
    set i 0
    foreach var $args {
        if {$var != ""} {
            set val [lindex $vals $i]
            uplevel 1 [list set $var $val]
        }
        incr i
    }
    return [lrange $vals $i end]
}

##
# exec a command with verbose output if requested stderr passed through
proc doExec {args} {
    global verbose lastExec

    if {$verbose} {
        puts stderr [join $args]
    }
    set lastExec $args
    eval exec $args 2>@stderr
}

##
# exec, recording last command, return stdout
proc callProg {args} {
    global lastExec
    set lastExec $args
    return [eval exec $args]
}

##
# writes a message and abort without a stack trace
proc abort {msg} {
    puts stderr "Error: $msg"
    exit 1
}

##
# pop a command line argument
proc cmdlinePop {} {
    global argv
    set opt [lindex $argv 0]
    set argv [lreplace $argv 0 0]
    return $opt
}

##
# pop a command line option argument
proc cmdlinePopVal {opt} {
    global argv
    if {[llength $argv] == 0} {
        abort "$opt requires an argument"
    }
    return [cmdlinePop]
}

##
# read the next job spec record from the file, or empty if no more.
proc nextJobSpec {fh} {
    # find next record
    set line {}
    while {[gets $fh line] >= 0} {
        set line [string trimleft $line]
        if {![regexp {^(\#|$)} $line]} {
            break
        }
    }
    if {[string length $line] == 0} {
        return {}  ;# eof
    }

    # read continuation lines, ensuring a blank between each line
    while {[regsub {\\$} $line {} line]} {
        if {[gets $fh line2] < 0} {
            error "EOF while parsing line continuation"
        }
        set line "$line $line2"
    }
    
    # break into a list of words
    regsub -all {[[:space:]]+} $line { } line
    return [split [string trim $line]]
}

##
# constants used in validating specs.  specsDef is an array of valid keys,
# with the value being a list of {numValues required}.  Where numValues
# is the required number of values, or negative of the minimum number of values.
#    
array set specsDef {
    type {2 1}
    ooc  {1 0}
    tdb  {1 1}
    tseq {-1 1}
    qdb  {-1 1}
    maxIntron {1 0}
    preFilterOpts {-1 0}
}

##
# parse open jobs specs file
proc parseJobSpecsFh {fh specsVar} {
    upvar $specsVar specs
    global specsDef

    # read specs
    while {[llength [set spec [nextJobSpec $fh]]]} {
        set specs([lindex $spec 0]) [lrange $spec 1 end]
    }

    # validate
    foreach key [array names specs] {
        if {![info exists specsDef($key)]} {
            error "unknown key \"$key\""
        }
    }
    foreach key [array names specsDef] {
        set def $specsDef($key)
        if {[lindex $def 1] && ![info exists specs($key)]} {
            error "missing required key \"$key\""
        }
        if {![info exists specs($key)]} {
            if {[lindex $def 1]} {
                error "missing required key \"$key\""
            }
        } else {
            set numVals [lindex $def 0]
            if {$numVals >= 0} {
                if {[llength $specs($key)] != $numVals} {
                    error "$key requires $numVals values"
                }
            } else {
                if {[llength $specs($key)] < $numVals} {
                    error "$key requires at least $numVals values"
                }
            }
        }
    }
    if {[lsearch {native xeno} [lindex $specs(type) 0]] < 0} {
        error "key \"type\" orgCat should be \"native\" or \"xeno\""
    }
    if {[lsearch {mrna est} [lindex $specs(type) 1]] < 0} {
        error "key \"type\" cDNA type should be \"mrna\" or \"est\""
    }
}

##
# parse the job spec file into an array indexed by key, with values being a list
proc parseJobSpecs {specFile specsVar} {
    upvar $specsVar specs
    set fh [open $specFile]
    if {[catch {
        parseJobSpecsFh $fh specs
    } msg]} {
        error "$msg: $specFile" "parsing $specFile\n$::errorInfo" $::errorCode
    }
    close $fh
}

# figure out BLAT parameters bases on specs
proc determineBlatParams {specsVar} {
    upvar $specsVar specs
    lassign $specs(type) orgCat cdnaType
    set blatParams {}

    #- repeat-masking only used on xeno, ooc not used for translated
    switch $orgCat {
        native {
            lappend blatParams -minScore=20 -stepSize=5
            switch $cdnaType {
                mrna {lappend blatParams -q=rna -fine}
                est {}
            }
            if {[info exists specs(ooc)]} {
                lappend blatParams -ooc=$specs(ooc)
            }
        }
        xeno {
            switch $cdnaType {
                mrna {lappend blatParams -q=rnax -t=dnax -mask=lower}
                est  {lappend blatParams -q=dnax -t=dnax -mask=lower}
            }
        }
    }
    if {[info exists specs(maxIntron)]} {
        lappend blatParams -maxIntron=$specs(maxIntron)
    }

    return $blatParams
}

##
# create name referencing a genome sequences in a genome file.
proc makeGenomeSeqRef {specsVar tSeqSet} {
    upvar $specsVar specs
    set path $specs(tdb)
    if {[string match *.2bit $specs(tdb)]} {
        # either single subrange, or list of whole sequences
        append path : [join $tSeqSet ,]
    } else {
        # nib is single whole sequence or subrange
        if {![regexp {^([^:]+)(:([0-9]+)-([0-9]+))?$} [lindex $tSeqSet 0] {} id {} start end]} {
            error "can't parse sequence spec [lindex $tSeqSet 0]"
        }
        append path /$id.nib
        if {$start != {}} {
            append path :$start-$end
        }
    }
    return $path
}

# checksum a file
proc cksum {file} {
    return [lindex [split [exec cksum $file]] 0]
} 

##
# parse a target sequence spec into a list
# return either {seqId seqSize subStart subEnd} or {seqId}
proc parseTSeq {tseq} {
    if {[regexp {^(.+):(.+):(.+)-(.+)$} $tseq {} id size start end]} {
        return [list $id $size $start $end]
    } else {
        return [list $tseq]
    }
}

##
# create a lift file, if needed, otherwise return {}.
proc makeLiftIfNeeded {specsVar} {
    upvar $specsVar specs

    # build rows of lift file
    set rows {}
    foreach seq $specs(tseq) {    
        lassign [parseTSeq $seq] id size start end
        if {$start != {}} {
            set name $id:$start-$end
            lappend rows [join [list $start $name [expr $end-$start] $id $size] \t]
        }
    }
    set liftFile {}
    if {[llength $rows] > 0} {
        global tmpPrefix
        set liftFile $tmpPrefix.lft
        set fh [open $liftFile w]
        puts $fh [join $rows \n]
        close $fh
    } 
    return $liftFile
}

##
# append a file to a cummulative tmp output file, optionally lifting
proc liftOrCp {type inFile outFile liftFile} {
    set outFh [open $outFile a]
    if {$liftFile == ""} {
        exec cat $inFile >@$outFh
    } else {
        # Note: wanted to specify "error", however blat doesn't include
        # subrange if it covers the whole sequence
        doExec liftUp -nohead -nosort -type=.$type stdout $liftFile carry $inFile >@$outFh
    }
    close $outFh
}  

##
# get poly-A sizes for a cDNA fasta
proc getPolyASizes {cdnaFa} {
    return [file root $cdnaFa].polya
}

##
# get prefiltering options, adding in polyA file if needed
proc getPreFilterOpts {specsVar cdnaFa} {
    upvar $specsVar specs
    # Find -polyASize and modify if needed
    set opts [split $specs(preFilterOpts)]
    if {[lsearch -glob $opts -polyASizes=*] >= 0} {
        error "-polyASizes in preFilterOpts  should not have an argument"
    }
    if {[set i [lsearch $opts -polyASizes]] >= 0} {
        set opts [lreplace $opts $i $i -polyASizes=[getPolyASizes $cdnaFa]]
    }
    return [concat {-verbose=0} $opts]
}

##
# prefilter a psl file with non-comparive pslCDnaFilter options
#
proc preFilterAligns {specsVar cdnaFa inPsl} {
    global tmpPrefix
    upvar $specsVar specs
    set outPsl $tmpPrefix.filtered.psl

    set cmd [concat {pslCDnaFilter} [getPreFilterOpts specs $cdnaFa] \
                 [list  $inPsl $outPsl]]
    eval doExec $cmd
    return $outPsl
}

##
# atomic install of a file to it's final location
proc installFile {inFile outFile} {
    # copy to global storage and verify checksum
    set inCksum [cksum $inFile]
    doExec cp $inFile $outFile.tmp
    set cpCksum [cksum $outFile.tmp]
    if {$inCksum != $cpCksum} {
        error "checksum mismatch after copy to global storage: $inFile ($inCksum) != $outFile.tmp ($cpCksum)"
    }
    # atomic rename
    file rename -force $outFile.tmp $outFile
}

## 
# Align a set of target seqs or subsequences and process results, appending
# results to specified cummulative output files.
proc alignToGenome {specsVar blatParams liftFile tSeqSet cdnaFa outPsl outOi outIntronPsl} {
    global tmpPrefix
    upvar $specsVar specs
    lassign $specs(type) orgCat cdnaType

    # unfiltered tmp output on local FS
    set localPsl $tmpPrefix.psl
    set localOi $tmpPrefix.oi
    set localIntronPsl $tmpPrefix.intronPsl
    file delete $localPsl $localOi $localIntronPsl
    
    set genomeSeqRef [makeGenomeSeqRef specs $tSeqSet]

    # align
    set cmd [concat {blat -noHead -repeats=lower} $blatParams \
                 [list $genomeSeqRef $cdnaFa $localPsl]]
    eval doExec $cmd
    if {[info exists specs(preFilterOpts)]} {
        set localPsl [preFilterAligns specs $cdnaFa $localPsl]
    }
    liftOrCp psl $localPsl $outPsl $liftFile

    # orientation info info and ESTs w/intron selection is only done for
    # native and if the output file length is greater than zero
    if {([file size $localPsl] > 0) && ($orgCat == "native")} {
        # get orientation info
        doExec polyInfo $localPsl $genomeSeqRef $cdnaFa $localOi
        liftOrCp bed $localOi $outOi $liftFile

        # if these are native ESTs, select ones that have introns, saving only
        # if some actually exist.
        if {$cdnaType == "est"} {
            doExec pslIntronsOnly $localPsl $genomeSeqRef $localIntronPsl
            if {[file size $localIntronPsl] > 0} {
                liftOrCp psl $localIntronPsl $outIntronPsl $liftFile
            }
        }
    }
}

##
# partition target seqs for nibs, which is one sequence or subrange per set
proc partitionNibTargetSeqs {specsVar} {
    upvar $specsVar specs
    set tSeqSets {}
    foreach tseq $specs(tseq) {
        lassign [parseTSeq $tseq] id {} start end
        if {$start == {}} {
            set spec $id
        } else {
            set spec $id:$start-$end
        }
        lappend tSeqSets [list $spec]
    }
    return $tSeqSets
}

# partition target seqs for twoBits
proc partitionTwoBitTargetSeqs {specsVar} {
    upvar $specsVar specs
    set maxSpecsLen 20000
    set tSeqSets {}
    set fnameLen [string length $specs(tdb)]
    set seqSet {}
    set totalLen $fnameLen
    foreach tseq $specs(tseq) {
        lassign [parseTSeq $tseq] id {} start end
        if {$start == {}} {
            set spec $id
        } else {
            set spec $id:$start-$end
        }
        set len [string length $spec]
        if {$totalLen+$len+1 > $maxSpecsLen} {
            # set is full
            lappend tSeqSets $seqSet
            set seqSet {}
            set totalLen $fnameLen
        }
        lappend seqSet $spec
        set totalLen [expr {$totalLen+$len+1}]
    }
    if {[llength $seqSet] > 0} {
        lappend tSeqSets $seqSet
    }
    return $tSeqSets
}

##
# partition target sequences into sets that can be passed to alignToGenome.
# This optimize 2bit access by group whole seqs together, but avoiding
# overflowing command lines.  Returns {{seq1 seq2 ...} ...}
proc partitionTargetSeqs {specsVar} {
    upvar $specsVar specs
    if {[string match *.2bit $specs(tdb)]} {
        return [partitionTwoBitTargetSeqs specs]
    } else {
        return [partitionNibTargetSeqs specs]
    }
}

##
# generate alignments based on the specs
proc genAlignments {specsVar outPrefix} {
    global tmpPrefix
    upvar $specsVar specs
    set blatParams [determineBlatParams specs]

    # accumulate output files in tmpdir
    set cumPsl $tmpPrefix.cum.psl
    set cumOi $tmpPrefix.cum.oi
    set cumIntronPsl $tmpPrefix.cum.intronPsl

    set liftFile [makeLiftIfNeeded specs]

    # loop over each target sequence set and each query fasta
    foreach tSeqSet [partitionTargetSeqs specs] {
        foreach cdnaFa $specs(qdb) {
            alignToGenome specs $blatParams $liftFile $tSeqSet $cdnaFa $cumPsl $cumOi $cumIntronPsl
        }
    }

    # install to final location, psl last to indicate completion
    if {[file exist $cumOi]} {
        installFile $cumOi $outPrefix.oi
    }
    if {[file exist $cumIntronPsl]} {
        installFile $cumIntronPsl $outPrefix.intronPsl
    }
    installFile $cumPsl $outPrefix.psl
}

##
# handle an caught error, avoid dumping stack on subprocess or system call
# errors.
proc handleError {errorResult} {
    global errorInfo errorCode keepTmp tmpDir lastExec verbose
    set holdErrorInfo $errorInfo
    set holdErrorCode $errorCode
    if {!$keepTmp} {
        # don't use callProg here, need to preserve lastExec
         exec rm -rf $tmpDir &
    }
    if {$verbose} {
        puts stderr "ErrorInfo: $holdErrorInfo"
        puts stderr "ErrorCode: $holdErrorCode"
    }
    switch [lindex $holdErrorCode 0] {
        CHILDSTATUS {
            puts stderr "Error: $errorResult"
            puts stderr "Exec: $lastExec"
            exit 1
        }
        POSIX {
            puts stderr "Error: $errorResult"
            exit 1
        }
        default {
            error $errorResult $holdErrorInfo $holdErrorCode
        }
    }
}

##
# create tmp directory
proc mkTmpDir {tmpArg} {
    global env keepTmp
    # include user name in tmpDir name, since stale directories from another
    # user can't be removed.  Prefer /scratch/tmp, an fall back to /var/tmp
    set dirName gbBlat.$env(USER).[pid].tmp

    if {$tmpArg != {}} {
        set tmpDir $tmpArg/$dirName
    } elseif {[file isdirectory /scratch/tmp]} {
        set tmpDir /scratch/tmp/$dirName
    } else {
        set tmpDir /var/tmp/$dirName
    }
    callProg rm -rf $tmpDir
    callProg mkdir -p -m 02775 $tmpDir
    if {$keepTmp} {
        puts stderr "note: will keep [callProg hostname] tmpDir $tmpDir"
    }
    return $tmpDir
}

##
# create output directory Call early, so mkdir failure (usually NFS) will not happen
# after blat has completed. Maintain group write/sticky
proc mkOutDir {outDir} {
    if {[catch {callProg mkdir -p -m 02775 $outDir}]} {
        # Try twice, just because we sometimes get weird failures.  
        callProg sleep 2
        callProg mkdir -p -m 02775 $outDir
    }
}

##
# entry, parse arguments
set keepTmp 0
set verbose 0
set overwrite 0
set tmpArg {}
while {[llength $argv] && [string match -* [lindex $argv 0]]} {
    set opt [cmdlinePop]
    switch -- $opt {
        -keep {
            set keepTmp 1
        }
        -verbose {
            set verbose 1
        }
        -overwrite {
            set overwrite 1
        }
        -tmpDir {
            set tmpArg [cmdlinePopVal $opt]
        }
        default {
            puts stderr "Error: invalid option: $opt"
            exit 1
        }
    }
}

if {[llength $argv] != 2} {
    abort {Wrong \# args: gbBlat [options] jobSpecFile out.psl}
}
parseJobSpecs [lindex $argv 0] specs
set outPsl [lindex $argv 1]

set outDir [file dirname $outPsl]
set outBase [file root [file tail $outPsl]]
set outPrefix $outDir/$outBase

# skip if output file already exists
if {[file exists $outPsl] && !$overwrite} {
    puts stderr "Note: output psl exists, skipping: $outPsl"
    exit 0
}

set tmpDir [mkTmpDir $tmpArg]
set tmpPrefix $tmpDir/$outBase
mkOutDir $outDir

# generate alignments, cleaning up tmp on error
if {[catch {
    genAlignments specs $outPrefix
} errorResult]} {
    handleError $errorResult
}
if {!$keepTmp} {
    callProg rm -rf $tmpDir &
}
