# for emacs: -*- mode: sh; -*-


# Drosophila pseudoobscura -- 
# 
# Baylor HGSC's Drosophila Genome Project Freeze 1 assembly (Aug. 2003)
# http://www.hgsc.bcm.tmc.edu/projects/drosophila/
#


# DOWNLOAD SEQUENCE (DONE 10/8/03 angie)
    ssh kksilo
    mkdir /cluster/store6/dp1
    cd /cluster/data
    ln -s /cluster/store6/dp1 dp1
    cd /cluster/data/dp1
    mkdir downloads
    cd downloads
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/conditions_for_use
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/freeze1_readme
    # Freeze 1 is 759 scaffolds, 271 of which are assigned to chromosomes 
    # in the .txt file:
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/D_pseudo_freeze1_scaffolds_8_28_03.fa
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/D_pseudo_freeze1_scaffold_to_chromosome_assignments_8_28_03.txt
    # Unpack scaffolds into separate files in scaffolds/ dir:
    cd /cluster/data/dp1
    faSplit byname downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa scaffolds/


# CREATE UNORDERED CHROM FROM SCAFFOLDS (DONE 10/10/03 angie)
# Note: scaffolds are separated by 1000 Bp gaps
    ssh kksilo
    cd /cluster/data/dp1
    # DOH!  They omitted ">" at the beginning of 8 headers where they 
    # were noting scaffolds that they split due to spanning multiple 
    # chromosomes.  
    # Edit downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa to restore 
    # the ">" at the beginning of the mangled headers.  
    scaffoldFaToAgp downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa
    mkdir Un
    mv downloads/D_pseudo_freeze1_scaffolds_8_28_03.agp Un/chrUn.agp
    mv downloads/D_pseudo_freeze1_scaffolds_8_28_03.gap Un/chrUn.gap
    mv downloads/D_pseudo_freeze1_scaffolds_8_28_03.lft Un/chrUn.lft
    mkdir jkStuff
    cp Un/chrUn.lft jkStuff/liftAll.lft
    # Create chromosome FA file from AGP and file of masked scaffolds
    agpToFa -simpleMultiMixed Un/chrUn.agp chrUn Un/chrUn.fa \
      downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa


# CREATING DATABASE (DONE 10/10/03 angie)
    # Create the database.
    ssh hgwdev
    echo 'create database dp1' | hgsql ''
    # Make a semi-permanent read-only alias:
    alias dp1 "mysql -u hguser -phguserstuff -A dp1"
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql


# PARTITION SCAFFOLDS FOR PROCESSING (DONE 11/14/03 angie)
    # For all our runs, we need results in scaffold coordinates (to share 
    # with collaborators) as well as chrUn (to display in the browser).  
    # It was a mistake to chop chrUn into chunks for processing the 
    # first time around -- that leaves us with 2 sets of useless coords 
    # as far as others are concerned.  
    #
    # Chop up scaffolds into ~500k chunks (mainly to limit size of 
    # RepeatMasker input sequence).  Cat together split .lft's into 
    # jkStuff/scaffoldsSplit.lft which will be used to lift to scaffold 
    # coords after processing.  
    ssh kksilo
    cd /cluster/data/dp1
    mkdir scaffoldsSplit
    foreach f (scaffolds/*.fa)
      faSplit size $f 500000 scaffoldsSplit/$f:t:r_ \
        -lift=scaffoldsSplit/$f:t:r.lft
    end
    cat scaffoldsSplit/C*.lft > jkStuff/scaffoldsSplit.lft
    # Create a big fasta file containing all split scaffolds, ordered 
    # by decreasing size.  Then use faSplit again -- we end up with 
    # big scaffold chunks in their own files, but small scaffolds 
    # grouped together.   
    mkdir chunks
    cat `ls -1S scaffoldsSplit/*.fa` \
    | faSplit about stdin 300000 chunks/chunk_


# RUN REPEAT MASKER (DONE 10/13/03 angie)
    # Note: drosophila library ("drosophila.lib") is dated May 27 '03.
    # Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
    # RepeatMasker runs manageable on the cluster ==> results need lifting.

    # make the run directory, output directory, and job list
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach f ( chunks/*.fa )
      set chunk = $f:t
      echo /cluster/bin/scripts/RMDrosophila \
           /cluster/data/dp1/chunks $chunk /cluster/data/dp1/chunks \
         '{'check out line+ /cluster/data/dp1/chunks/$chunk.out'}' \
      >> RMRun/RMJobs
    end

    # do the run
    ssh kk
    cd /cluster/data/dp1/RMRun
    para create RMJobs
    para try
    para check
    para push
    para check,...
#Completed: 304 of 304 jobs
#Average job time:                5692s      94.87m     1.58h    0.07d
#Longest job:                     7392s     123.20m     2.05h    0.09d
#Submission to last job:          7425s     123.75m     2.06h    0.09d

    # Lift up the split-scaffold .out's to scaffold .out's
    ssh kksilo
    cd /cluster/data/dp1
    foreach f (chunks/*.fa.out)
      liftUp $f:r:r.scaf.out jkStuff/scaffoldsSplit.lft warn $f > /dev/null
    end
    # Make a consolidated scaffold .out file too:
    liftUp scaffolds/scaffolds.fa.out jkStuff/scaffoldsSplit.lft warn \
      chunks/*.fa.out > /dev/null

    # Lift up scaffold .out to chrUn .out.
    liftUp Un/chrUn.fa.out jkStuff/liftAll.lft warn scaffolds/scaffolds.fa.out \
    > /dev/null

    # soft-mask chunk .fa's with .out's (for running TRF next)
    foreach chunk (chunks/chunk*.fa)
      maskOutFa $chunk $chunk.out $chunk -soft
    end

    # Load the .out files into the database with:
    ssh hgwdev
    hgLoadOut dp1 /cluster/data/dp1/?{,?}/*.fa.out


# SIMPLE REPEATS (TRF) (DONE 11/14/03 angie)
    ssh kksilo
    mkdir -p /cluster/data/dp1/bed/simpleRepeat
    cd /cluster/data/dp1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach f (/cluster/data/dp1/chunks/chunk*.fa)
        set fout = $f:t:r.bed
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    # Lift to scaffold coords, then chrUn coords for track-loading:
    liftUp scaffoldsTrf.bed ../../jkStuff/scaffoldsSplit.lft warn \
      trf/*.bed > /dev/null
    liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \
      scaffoldsTrf.bed > /dev/null

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed dp1 simpleRepeat \
      /cluster/data/dp1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 11/14/03 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/dp1/bed/simpleRepeat
    mkdir trfMask
    foreach f (trf/*.bed)
      echo -n "filtering $f... "
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    awk '{if ($5 <= 12) print;}' scaffoldsTrf.bed > scaffoldsTrfMask.bed
    # Lift up filtered trf output to chrom coords as well:
    mkdir trfMaskChrom
    liftUp trfMaskChrom/chrUn.bed ../../jkStuff/liftAll.lft warn \
      scaffoldsTrfMask.bed > /dev/null


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 11/14/03 angie)
    ssh kksilo
    cd /cluster/data/dp1
    foreach c (?{,?})
      echo repeat- and trf-masking chr$c.fa
      maskOutFa -soft $c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa
      maskOutFa -softAdd $c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed \
        $c/chr$c.fa
    end
    echo repeat- and trf-masking chunks
    foreach chunk (chunks/chunk*.fa)
      set trfMask=bed/simpleRepeat/trfMask/$chunk:t:r.bed
      maskOutFa -soft $chunk $chunk.out $chunk
      maskOutFa -softAdd $chunk $trfMask $chunk
    end
    echo repeat- and trf-masking scaffolds
    # maskOutFa dies if the lift file describes more seqs than are in 
    # the fasta, so make a separate mask file for each scaffold.
    set rmMask=scaffolds/scaffolds.fa.out
    set trfMask=bed/simpleRepeat/scaffoldsTrfMask.bed
    # use head to grab the RepeatMasker output header, then grep contents.
    foreach f (scaffolds/*.fa)
      head -3 $rmMask > tmpMask.out
      grep $f:t:r $rmMask >> tmpMask.out
      maskOutFa -soft $f tmpMask.out $f
      grep $f:t:r $trfMask > tmpMask.bed
      maskOutFa -softAdd $f tmpMask.bed $f
    end
    rm tmpMask.{bed,out}
    # Put masked scaffolds and chunks on /cluster/bluearc
    mkdir /cluster/bluearc/drosophila/dp1/trfFa
    cp -p /cluster/data/dp1/scaffolds/*.fa /cluster/bluearc/drosophila/dp1/trfFa
    mkdir /cluster/bluearc/drosophila/dp1/chunks
    cp -p /cluster/data/dp1/chunks/*.fa /cluster/bluearc/drosophila/dp1/chunks


# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 10/13/03 angie)

    # Translate to nib
    ssh kksilo
    cd /cluster/data/dp1
    mkdir nib
    # redone 11/14/03:
    faToNib -softMask Un/chrUn.fa nib/chrUn.nib

    # Make symbolic links from /gbdb/dp1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/dp1/nib
    ln -s /cluster/data/dp1/nib/chrUn.nib  /gbdb/dp1/nib

    # Load /gbdb/dp1/nib paths into database and save size info.
    ssh hgwdev
    hgsql dp1  < /cluster/data/src/hg/lib/chromInfo.sql
    cd /cluster/data/dp1
    hgNibSeq -preMadeNib dp1 /gbdb/dp1/nib chrUn.nib
    echo "select chrom,size from chromInfo" | hgsql -N dp1 > chrom.sizes

    # load gap track
    ssh hgwdev
    hgLoadGap dp1 /cluster/data/dp1


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 10/10/03 angie)
    # Copy all the data from the table "grp" 
    # in the existing database "rn1" to the new database
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql dp1

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 12/2/03 angie)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk) values \
        ("dp1", "Aug. 2003", "/gbdb/dp1/nib", "D.pseud.", \
               "chrUn:827700-845800", 1, 56, "D.pseud.", \
                "Drosophila pseudoobscura", "/gbdb/dp1/html/description.html", \
                0);' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'INSERT INTO defaultDb (genome, name) values ("D.pseud.", "dp1");' \
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add dp1 in all the right places and do
    make update

    # go public on genome-test
    #make alpha
    cvs commit makefile

    # Add trackDb directories
    mkdir drosophila/dp1
    cvs add drosophila/dp1
    cvs commit drosophila/dp1


PRODUCING GENSCAN PREDICTIONS (TODO 11/14/03 angie)
    # Run on small cluster -- genscan needs big mem.
    ssh kkr1u00
    mkdir /cluster/data/dp1/bed/genscan
    cd /cluster/data/dp1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Make hard-masked contigs
    mkdir /cluster/data/dp1/scaffoldsMasked
    foreach f (/cluster/data/dp1/scaffolds/*.fa)
      maskOutFa $f hard /cluster/data/dp1/scaffoldsMasked/$f:t.masked
    end
    # Generate a list file, contigs.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f contigs.list
    touch contigs.list
    foreach f ( `ls -1S /cluster/data/dp1/scaffoldsMasked/*.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> contigs.list
    end
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 contigs.list single gsub jobList
    para create jobList
    para try
    para check
    para push
#Completed: 759 of 759 jobs
#Average job time:                  16s       0.27m     0.00h    0.00d
#Longest job:                      424s       7.07m     0.12h    0.00d
#Submission to last job:          1538s      25.63m     0.43h    0.02d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    # chr14_21, chr16_4
    
    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/dp1/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/dp1/bed/genscan
    ldHgGene dp1 genscan genscan.gtf
    hgPepPred dp1 generic genscanPep genscan.pep
    hgLoadBed dp1 genscanSubopt genscanSubopt.bed


# MAKE DOWNLOADABLE FILES (DONE (no mrna.zip) 12/1/03 angie)
    ssh kksilo
    cd /cluster/data/dp1
    mkdir zips
    zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out
    # use head to keep the RepeatMasker output header of the first file, 
    # and tail to skip past it in all subsequent files.
    head -3 chunks/chunk_00.scaf.out > scaffolds.fa.out
    foreach f ( chunks/*_??.scaf.out chunks/*_???.scaf.out)
      tail +4 $f >> scaffolds.fa.out
    end
    zip zips/scaffoldOut.zip scaffolds.fa.out
    rm scaffolds.fa.out
    zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa
    foreach f (?{,?}/chr?{,?}.fa)
      maskOutFa $f hard $f.masked
    end
    zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked
    zip -j zips/scaffoldFa.zip scaffolds/*.fa
    zip -j zips/scaffoldFaMasked.zip scaffoldsMasked/*.fa.masked
    cd bed/simpleRepeat
    zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed
    zip ../../zips/scaffoldTrf.zip scaffoldsTrfMask.bed
    cd ../..
    zip -j zips/liftAll.zip jkStuff/liftAll.lft
    foreach f (zips/*.zip)
      echo $f
      unzip -t $f | tail -1
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dp1
    cd /usr/local/apache/htdocs/goldenPath/dp1
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cp -p /cluster/data/dp1/zips/*.zip bigZips


# SCAFFOLDS TRACK (DONE 12/1/03 angie)
    # make ctgPos (contig name, size, chrom, chromStart, chromEnd) from liftAll
    ssh hgwdev
    mkdir /cluster/data/dp1/bed/ctgPos
    cd /cluster/data/dp1/bed/ctgPos
    perl -we 'while (<>) { \
                ($cStart, $ctg, $size, $chrom, undef) = split; \
                if ($ctg =~ /^Contig/) { \
                  $cEnd = $cStart + $size; \
                  print "$ctg\t$size\t$chrom\t$cStart\t$cEnd\n"; \
                } \
              }' \
      ../../jkStuff/liftAll.lft > ctgPos.tab
    hgsql dp1 < ~/kent/src/hg/lib/ctgPos.sql
    echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql dp1


# ALL-DROSOPHILA REFSEQ (DONE 12/2/03 angie)
    ssh kksilo
    mkdir /cluster/data/dp1/bed/drosRefSeq
    cd /cluster/data/dp1/bed/drosRefSeq
    grep "^Drosophila" /cluster/data/genbank/data/species.lst > dros.lst
    grep "Drosophila" \
      /cluster/data/genbank/data/processed/refseq.138.0/full/mrna.gbidx \
    | awk '{print $1;}' > refSeqAcc.lst
    perl -wpe 's/^>(\w+)\.\d+/>$1/' \
      /cluster/data/genbank/data/processed/refseq.138.0/full/mrna.fa \
    > allRefSeq.fa
    faSomeRecords allRefSeq.fa refSeqAcc.lst drosRefSeq.fa
    rm allRefSeq.fa
    faSplit sequence drosRefSeq.fa 4 drs
    ls -1S drs*.fa > refSeq.lst
    ls -1S /cluster/data/dp1/chunks/chunk_*.fa > chunks.lst
    mkdir psl
    # translated blat because these are xeno:
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 chunks.lst refSeq.lst gsub spec
    ssh kk
    cd /cluster/data/dp1/bed/drosRefSeq
    para create spec
    para try, check, push, check, ...
#Completed: 1216 of 1216 jobs
#Average job time:                  85s       1.41m     0.02h    0.00d
#Longest job:                      308s       5.13m     0.09h    0.00d
#Submission to last job:           439s       7.32m     0.12h    0.01d
    # back on kksilo:
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps raw.psl cooked.psl /dev/null -minAli=0.25 -minCover=0.3
    liftUp scaffold.psl ../../jkStuff/scaffoldsSplit.lft warn cooked.psl
    liftUp chrom.psl ../../jkStuff/liftAll.lft warn scaffold.psl
    pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
    pslCat -dir chrom > drosRefSeq.psl
    rm -r chrom raw.psl cooked.psl chrom.psl
    # Load it up:
    ssh hgwdev
    cd /cluster/data/dp1/bed/drosRefSeq
    # hgc needs the track table name to start with xeno...
    ln -s drosRefSeq.psl xenoDrosRefSeq.psl
    hgLoadPsl dp1 xenoDrosRefSeq.psl
    mkdir /gbdb/dp1/drosMrna
    ln -s /cluster/data/dp1/bed/drosRefSeq/drosRefSeq.fa /gbdb/dp1/drosMrna/
    hgLoadSeq dp1 /gbdb/dp1/drosMrna/drosRefSeq.fa


# ALL-DROSOPHILA MRNA (DONE 12/2/03 angie)
    ssh kksilo
    mkdir /cluster/data/dp1/bed/drosMrna
    cd /cluster/data/dp1/bed/drosMrna
    grep "Drosophila" \
      /cluster/data/genbank/data/processed/genbank.138.0/full/mrna.gbidx \
    | awk '{print $1;}' > mrnaAcc.lst
    perl -wpe 's/^>(\w+)\.\d+/>$1/' \
      /cluster/data/genbank/data/processed/genbank.138.0/full/mrna.fa \
    > allMrna.fa
    faSomeRecords allMrna.fa mrnaAcc.lst drosMrna.fa
    rm allMrna.fa
    faSplit sequence drosMrna.fa 4 drs
    ls -1S drs*.fa > mrna.lst
    ls -1S /cluster/data/dp1/chunks/chunk_*.fa > chunks.lst
    mkdir psl
    # translated blat because these are xeno:
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 chunks.lst mrna.lst gsub spec
    ssh kk
    cd /cluster/data/dp1/bed/drosMrna
    para create spec
    para try, check, push, check, ...
#Completed: 1216 of 1216 jobs
#Average job time:                  69s       1.15m     0.02h    0.00d
#Longest job:                      102s       1.70m     0.03h    0.00d
#Submission to last job:           351s       5.85m     0.10h    0.00d
    # back on kksilo:
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps raw.psl cooked.psl /dev/null -minAli=0.25
    liftUp scaffold.psl ../../jkStuff/scaffoldsSplit.lft warn cooked.psl
    liftUp chrom.psl ../../jkStuff/liftAll.lft warn scaffold.psl
    pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
    pslCat -dir chrom > drosMrna.psl
    rm -r chrom raw.psl cooked.psl chrom.psl
    # Load it up:
    ssh hgwdev
    cd /cluster/data/dp1/bed/drosMrna
    # hgc needs the track table name to start with xeno...
    ln -s drosMrna.psl xenoDrosMrna.psl
    hgLoadPsl dp1 xenoDrosMrna.psl
    ln -s /cluster/data/dp1/bed/drosMrna/drosMrna.fa /gbdb/dp1/drosMrna/
    hgLoadSeq dp1 /gbdb/dp1/drosMrna/drosMrna.fa


# ALL-DROSOPHILA EST (DONE 12/2/03 angie)
    ssh kksilo
    mkdir /cluster/data/dp1/bed/drosEst
    cd /cluster/data/dp1/bed/drosEst
    foreach idx (/cluster/data/genbank/data/processed/genbank.138.0/full/est.*.gbidx)
      grep "Drosophila" $idx \
      | awk '{print $1;}' > $idx:t:r.lst
      perl -wpe 's/^>(\w+)\.\d+/>$1/' $idx:r.fa \
      > $idx:t:r.all.fa
      faSomeRecords $idx:t:r.all.fa $idx:t:r.lst $idx:t:r.fa
      rm $idx:t:r.all.fa
    end
    cat est.*.lst > drosEst.lst
    cat est.*.fa > drosEst.fa
    faSplit sequence drosEst.fa 20 drs
    rm est.*.lst est.*.fa
    ls -1S drs*.fa > est.lst
    ls -1S /cluster/data/dp1/chunks/chunk_*.fa > chunks.lst
    mkdir psl
    # translated blat because these are xeno:
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=dnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 chunks.lst est.lst gsub spec
    ssh kk
    cd /cluster/data/dp1/bed/drosEst
    para create spec
    para try, check, push, check, ...
#Completed: 6080 of 6080 jobs
#Average job time:                  63s       1.04m     0.02h    0.00d
#Longest job:                      109s       1.82m     0.03h    0.00d
#Submission to last job:           485s       8.08m     0.13h    0.01d
    # back on kksilo:
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps raw.psl cooked.psl /dev/null -minAli=0.10
    liftUp scaffold.psl ../../jkStuff/scaffoldsSplit.lft warn cooked.psl
    liftUp chrom.psl ../../jkStuff/liftAll.lft warn scaffold.psl
    pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
    pslCat -dir chrom > drosEst.psl
    rm -r chrom raw.psl cooked.psl chrom.psl
    # Load it up:
    ssh hgwdev
    cd /cluster/data/dp1/bed/drosEst
    # hgc needs the track table name to start with xeno...
    ln -s drosEst.psl xenoDrosEst.psl
    hgLoadPsl dp1 xenoDrosEst.psl
    ln -s /cluster/data/dp1/bed/drosEst/drosEst.fa /gbdb/dp1/drosMrna/
    hgLoadSeq dp1 /gbdb/dp1/drosMrna/drosEst.fa


# LIFTOVER BDGP GENES (IN PROGRESS 12/9/03 angie)
    ssh hgwdev
    mkdir /cluster/data/dp1/bed/bdgpAnnotations
    cd /cluster/data/dp1/bed/bdgpAnnotations
    echo 'select * from bdgpGene' | hgsql -N dm1 > dm1.bdgpGene.tab
    liftOver -minMatch=0.5 -minBlocks=0.5 -fudgeThick -genePred \
      dm1.bdgpGene.tab \
      /cluster/data/dm1/bed/blastz.dp1.2003-11-14/axtChainScaffoldQ/all.chain \
      scaffold.bdgpLiftGene.tab scaffold.unmapped.tab
    liftOver -minMatch=0.5 -minBlocks=0.5 -fudgeThick -genePred \
      dm1.bdgpGene.tab \
      /cluster/data/dm1/bed/blastz.dp1.2003-11-14/axtChainChrom/all.chain \
      chrUn.bdgpLiftGene.tab chrUn.unmapped.tab
    wc -l dm1.bdgpGene.tab scaffold.bdgpLiftGene.tab chrUn.bdgpLiftGene.tab 
    grep ^# chrUn.unmapped.tab | awk '{print $1 " " $2 " " $3;}' \
    | sort | uniq -c | sort -gr
    hgsql dp1 <<end
CREATE TABLE bdgpLiftGene (
    name varchar(255) not null, # Name of gene
    chrom varchar(255) not null,        # Chromosome name
    strand char(1) not null,    # + or - for strand
    txStart int unsigned not null,      # Transcription start position
    txEnd int unsigned not null,        # Transcription end position
    cdsStart int unsigned not null,     # Coding region start
    cdsEnd int unsigned not null,       # Coding region end
    exonCount int unsigned not null,    # Number of exons
    exonStarts longblob not null,       # Exon start positions
    exonEnds longblob not null, # Exon end positions
              #Indices
    PRIMARY KEY(name(16)),
    INDEX(chrom(12),txStart),
    INDEX(chrom(12),txEnd)
);
end
    echo 'load data local infile "chrUn.bdgpLiftGene.tab" into table bdgpLiftGene'\
    | hgsql dp1
    echo 'create table bdgpLiftGenePep select * from dm1.bdgpGenePep' \
    | hgsql dp1
    # Make the scaffold-coords version available for download
    zip /usr/local/apache/htdocs/goldenPath/dp1/bigZips/scaffoldAnnot.zip \
      scaffold.bdgpLiftGene.tab
    # Update /usr/local/apache/htdocs/goldenPath/dp1/bigZips/README.txt


# SWAP DM1-DP1 BLASTZ (DONE 12/3/03 angie)
    ssh kksilo
    mkdir -p /cluster/data/dp1/bed/blastz.dm1.swap/axtScaffold/byT
    cd /cluster/data/dp1/bed/blastz.dm1.swap/axtScaffold/byT
    foreach f (/cluster/data/dm1/bed/blastz.dp1.2003-11-14/axtScaffold/chr*.axt)
      axtSwap $f /cluster/data/dm1/chrom.sizes \
        /cluster/data/dp1/scaffold.sizes $f:t
    end
    cd ..
    mkdir byQ
    cat byT/chr*.axt | axtSplitByTarget stdin byQ
    rm -r byT
    foreach f (byQ/*.axt)
      axtSort $f $f:t
    end
    rm -r byQ
    cd ..
    mkdir axtChrom
    liftUp tmp.axt /cluster/data/dp1/jkStuff/liftAll.lft warn axtScaffold/*.axt
    axtSort tmp.axt axtChrom/chrUn.axt
    rm tmp.axt


# CHAIN & NET DP1/DM1 BLASTZ ALIGNMENTS (DONE 12/3/03 angie)
    ssh kkr1u00
    mkdir -p /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold/run1
    cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold/run1
    ls -1S ../../axtScaffold/*.axt > input.lst
    echo '#LOOP' > gsub
    echo 'doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}' >> gsub
    echo '#ENDLOOP' >> gsub
    gensub2 input.lst single gsub spec
    echo '#\!/bin/csh' > doChain
    echo 'axtChain $1 /cluster/data/dp1/scaffoldsNib /cluster/data/dm1/nib $2 > $3' \
    >> doChain
    chmod a+x doChain
    mkdir chain out
    para create spec
    para try, check, push, check, ...
#Completed: 671 of 671 jobs
#Average job time:                  11s       0.19m     0.00h    0.00d
#Longest job:                       45s       0.75m     0.01h    0.00d
#Submission to last job:           639s      10.65m     0.18h    0.01d

    ssh kksilo
    cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    mkdir -p ../axtChainChrom/chain
    liftUp ../axtChainChrom/all.chain \
      /cluster/data/dp1/jkStuff/liftAll.lft warn all.chain
    cd ../axtChainChrom/chain
    chainSplit chain all.chain

    ssh hgwdev
    cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainChrom/chain
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain dp1 ${c}_chainDm1 $i
      echo done $c
    end

    # Create the nets.  You can do this while the database is loading
    ssh kksilo
    cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold
    # First do a crude filter that eliminates many chains so the
    # real chainer has less work to do.
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      chainPreNet $i /cluster/data/dp1/scaffold.sizes \
        /cluster/data/dm1/chrom.sizes ../preNet/$i
    end
    cd ..
    # Run the main netter, putting the results in n1.
    mkdir n1 
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      chainNet $i -minSpace=1 /cluster/data/dp1/scaffold.sizes \
        /cluster/data/dm1/chrom.sizes ../n1/$n /dev/null
    end
    cd ..
    # Classify parts of net as syntenic, nonsyntenic etc.
    cat n1/*.net | netSyntenic stdin noClass.net

    # The final step of net creation needs the database.
    # Best to wait for the database load to finish if it
    # hasn't already.
    ssh hgwdev
    cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold
    netClass -liftT=/cluster/data/dp1/jkStuff/liftAll.lft -noAr \
      noClass.net dp1 dm1 fruitfly.net
    rm -r n1 noClass.net
    netFilter -minGap=10 fruitfly.net > fruitflyFilt.net
    liftUp ../axtChainChrom/fruitflyFiltChrom.net \
      /cluster/data/dp1/jkStuff/liftAll.lft warn fruitflyFilt.net
    hgLoadNet dp1 netDm1 ../axtChainChrom/fruitflyFiltChrom.net
    # Move back to the file server to create axt files corresponding
    # to the net.
    ssh kksilo
    cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold
    mkdir ../axtNetScaffold
    netSplit fruitflyFilt.net fruitflyNet
    cd fruitflyNet
    foreach i (*.net)
        set c = $i:r
        netToAxt -maxGap=300 $i ../chain/$c.chain \
          /cluster/data/dp1/scaffoldsNib /cluster/data/dm1/nib \
          ../../axtNetScaffold/$c.axt
        echo done ../axtNetScaffold/$c.axt
    end
    cd ..
    rm -r fruitflyNet
    mkdir ../axtNetChrom
    cd ../axtNetChrom
    liftUp tmp.axt /cluster/data/dp1/jkStuff/liftAll.lft warn \
      ../axtNetScaffold/*.axt
    axtSort tmp.axt chrUn.axt
    rm tmp.axt

    # Load up the axtNet (alignment score wiggle) track:
    ssh hgwdev
    mkdir /gbdb/dp1/axtNetDm1
    foreach f (/cluster/data/dp1/bed/blastz.dm1.swap/axtNetChrom/chr*.axt)
      ln -s $f /gbdb/dp1/axtNetDm1
    end
    hgLoadAxt dp1 axtNetDm1


# ADD BLASTZ/CHAIN/NET DOWNLOADABLE FILES (DONE 12/3/03 angie)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dp1/vsDm1
    cd /usr/local/apache/htdocs/goldenPath/dp1/vsDm1
    gzip -c \
      /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold/all.chain \
      > chain.gz
    gzip -c \
      /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold/fruitflyFilt.net \
      > net.gz
    mkdir axtNet
    foreach f (/cluster/data/dp1/bed/blastz.dm1.swap/axtNetScaffold/*.axt)
      gzip -c $f > axtNet/$f:t.gz
    end
    # Make a README.txt which explains the files & formats.


