# for emacs: -*- mode: sh; -*-


# Strongylocentrotus purpuratus -- 0.5 assembly April 15, 2005
# 
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | refGene     | genePred refPep refMrna         |
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | genscan     | genePred genscanPep             |
#  +-------------+---------------------------------+


# DOWNLOAD SEQUENCE (DONE 4/4/05 Andy) -- REDONE 4/17/2005, see below
    mkdir -p /cluster/store5/strPur1/downloads
    ln -s /cluster/store5/strPur1 /cluster/data/strPur1
    cd /cluster/data/strPur1/downloads
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-contigs.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-reptigs.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20050323.agp
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/accession.contig.txt
    gunzip *.gz
    sed 's/^\(\w\+\)\b\W\+Spur_1.1_\(\w\+\).*/\1\t\1|\2/' accession.contig.txt > fromAccs.map
    sed 's/^\(\w\+\)\b\W\+Spur_1.1_\(\w\+\).*/\2\t\1|\2/' accession.contig.txt > fromContigs.map
    # agpChangeNames and faChangeNames are uncommitted utilities at this point.  They can maybe go in oneShot
    # subs was too slow.
    sed 's/\.1//' Spur20050323.agp | \
        agpChangeNames /dev/stdin fromAccs.map /dev/stdout | sort -k1,1 -k2,2n > urchin.agp
    rm Spur20050323.agp
    faChangeNames Spur20041123-contigs.fa fromContigs.map urchinContigs.fa
    faChangeNames Spur20041123-reptigs.fa fromContigs.map reptigs.fa
    cat urchinReptigs.fa >> urchinContigs.fa
    rm *.map Spur20041123-*.fa
    agpAllToFaFile urchin.agp urchinContigs.fa urchinSupertigs.fa
    rm urchinContigs.fa urchinReptigs.fa
    # UPDATED 4/17/2005
    # OK I've notived that there's been an update.  I'm not making the fancier named
    cd /cluster/data/strPur1/downloads
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-contigs.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-reptigs.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/Spur20050413.agp
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/accession.contig.txt
    sed 's/Spur_1.1_//' accession.contig.txt > tmp.map
    fixCr tmp.map
    awk '{printf("%s\t%s|%s\n", $1, $1, $2)}' tmp.map > fromAccs.map
    awk '{printf("%s\t%s|%s\n", $2, $1, $2)}' tmp.map > fromCont.map
    rm tmp.map
    sed 's/\.1//' Spur20050413.agp | \
        agpChangeNames /dev/stdin fromAccs.map /dev/stdout | sort -k1,1 -k2,2n > urchin.agp
    faChangeNames Spur20041123-contigs.fa fromCont.map urchinContigs.fa
    faChangeNames Spur20041123-reptigs.fa fromCont.map reptigs.fa
    cat reptigs.fa >> urchinContigs.fa
    rm *.map Spur20041123-*.fa
    agpAllToFaFile urchin.agp urchinContigs.fa urchinScaffolds.fa
    rm Spur* urchinContigs.fa reptigs.fa *.map accession.contig.txt
    

# CREATING DATABASE (DONE 4/10/05 Andy)
    # Create the database.
    ssh hgwdev
    hgsql '' -e 'create database strPur1'

# LOAD GAP & GOLD TABLES FROM AGP (DONE 4/10/2005 Andy) (REDONE 4/17/2005 Andy)
    ssh hgwdev
    cd /cluster/data/strPur1/downloads
    hgGoldGapGl -noGl strPur1 urchin.agp
    # For some reason, the indices did not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.  
    # Rebuild indices with "analyze table".
    # *** Andy's note: the same thing happened in this assembly too.
    hgsql strPur1 -e 'analyze table gold; analyze table gap;'

# REPEATMASKER (DONE 4/14/05 Andy) (REDONE 4/17/2005 Andy)
    # Check the library for urchin repeats.  Nothing with "urchin", or "purpuratus".
    /cluster/bluearc/RepeatMasker050112/util/queryRepeatDatabase.pl -species Strongylocentrotus -stat
#>IS1#ARTEFACT  Length = 768 bp
#>IS2#ARTEFACT  Length = 1331 bp
#>IS3#ARTEFACT  Length = 1258 bp
#>IS5#ARTEFACT  Length = 1195 bp
#>IS10#ARTEFACT  Length = 1329 bp
#>IS30#ARTEFACT  Length = 1221 bp
#>IS150#ARTEFACT  Length = 1443 bp
#>IS186#ARTEFACT  Length = 1338 bp
#>Tn1000#ARTEFACT  Length = 5981 bp
#>(CA)n#Simple_repeat  Length = 180 bp
#...
#>SPRP2#Unknown  Length = 247 bp
#>SU7N33#Unknown  Length = 195 bp
#
#Total Sequence Length = 50249 bp
   # Split files for the run:
   ssh eieio
   cd /cluster/data/strPur1
   mkdir unmaskedSplits
   cd unmaskedSplits/
   faSplit -outDirDepth=3 -lift=splits.lft gap ../downloads/urchinScaffolds.fa 500000 .
   # copy to panasas
   mkdir -p /panasas/store/strPur1
   cd /panasas/store/strPur1
   cp -r /cluster/data/strPur1/unmaskedSplits .
   ssh kk
   cd /cluster/data/strPur1
   mkdir RMRun RMOut jkStuff
   cat << "_EOF_" > jkStuff/RMApis
#!/bin/bash 

animal=Strongylocentrotus
genome=strPur1
inloc=/panasas/store/strPur1/unmaskedSplits
outloc=/cluster/data/strPur1/RMOut
file=`basename $1`
dir=`dirname $1`

cd $outloc
mkdir -p /tmp/$genome/$dir
cp $inloc/$1 /tmp/$genome/$dir
pushd /tmp/$genome/$dir
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec $animal $file
popd
mkdir -p $dir
cp /tmp/$genome/$dir/$file.out ./$dir/$file.out
rm -fr /tmp/$genome/$dir/$file.*
rmdir --ignore-fail-on-non-empty /tmp/$genome/$dir
rmdir --ignore-fail-on-non-empty /tmp/$genome	
_EOF_
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMApis
    cd unmaskedSplits/
    find . -name '*.fa' | sed 's/^\.\///' > ../RMRun/files.lst
    cd ../RMRun/
    cat << "_EOF_" > gsub
#LOOP
../jkStuff/RMApis $(path1) {check in line+ /panasas/store/strPur1/unmaskedSplits/$(path1)} {check out line+ ../RMOut/$(dir1)/$(root1).fa.out}
#ENDLOOP
_EOF_
    gensub2 files.lst single gsub RMJobs
    para create RMJobs
    para try
    para push
    para time
#Completed: 187983 of 187983 jobs
#CPU time in finished jobs:    1025942s   17099.04m   284.98h   11.87d  0.033 y
#IO & Wait Time:                513501s    8558.35m   142.64h    5.94d  0.016 y
#Average job time:                   8s       0.14m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             159s       2.65m     0.04h    0.00d
#Submission to last job:          3927s      65.45m     1.09h    0.05d
   
    # Lift results.
    ssh eieio
    cd /cluster/data/strPur1/RMOut
    head -n 3 0/0/0/97000.fa.out > header
    find . -name "*.fa.out" -exec tail +4 '{}' ';' > rmsk.out 
    cat header rmsk.out > all.out
    rm rmsk.out header
    liftUp rmsk.lifted.out ../unmaskedSplits/splits.lft warn all.out
    rm all.out
    
    # Load results
    ssh hgwdev
    cd /cluster/data/strPur1/RMOut
    hgLoadOut strPur1 rmsk.lifted.out
    hgsql strPur1 -e 'rename table rmsk_rmsk to rmsk'
    hgsql strPur1 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(11), bin); \
                  create index genoStart on rmsk (genoName(11), genoStart); \
                  create index genoEnd   on rmsk (genoName(11), genoEnd);' 

# SIMPLE REPEATS (TRF) (DONE 4/15/2005 Andy) (REDONE 4/19/2005 Andy)
    ssh eieio
    mkdir -p /cluster/data/strPur1/bed/simpleRepeat
    cd /panasas/store/strPur1
    # Split up stuff
    mkdir kk9Splits
    cd kk9Splits/
    faSplit sequence /cluster/data/strPur1/downloads/urchinScaffolds.fa 100 urchin_
    ssh kk9
    cd /cluster/data/strPur1/bed/simpleRepeat
    cat << "_EOF_" > gsub
#LOOP
./trfBig.sh {check in line+ $(path1)} {check out line+ $(root1).bed} -bed -tempDir=/tmp
#ENDLOOP
_EOF_
   cat << "_EOF_" > trfBig.sh
#!/bin/bash
trfBig $* > /dev/null
_EOF_
    chmod +x trfBig.sh
    ls -1 /panasas/store/strPur1/kk9Splits/* > files.lst
    gensub2 files.lst single gsub spec
    para create spec
    para try
    para push
    para time
#Completed: 99 of 99 jobs
#CPU time in finished jobs:      24356s     405.93m     6.77h    0.28d  0.001 y
#IO & Wait Time:                  1121s      18.69m     0.31h    0.01d  0.000 y
#Average job time:                 257s       4.29m     0.07h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1044s      17.40m     0.29h    0.01d
#Submission to last job:          1044s      17.40m     0.29h    0.01d
    ssh eieio
    cd /cluster/data/strPur1/bed/simpleRepeat
    cat urch*.bed > trf.bed 
    rm urch*.be
    # Load this into the database as so
    ssh hgwdev
    cd /cluster/data/strPur1/bed/simpleRepeat
    cat urch*.bed | grep trf > trf.bed 
    rm urch*.bed
    hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \
       strPur1 simpleRepeat trf.bed
    rm -rf /panasas/store/strPur1/kk9Splits

# MAKE MASKED 2BIT (DONE 4/15/2005 Andy)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/strPur1/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' trf.bed > trfMask.bed
    cd ../../
    mkdir maskedFa
    cd maskedFa/
    maskOutFa -soft ../downloads/urchinScaffolds.fa ../bed/simpleRepeat/trfMask.bed urchin.softMasked.fa 
    maskOutFa -softAdd urchin.softMasked.fa ../RMOut/rmsk.lifted.out urchin.softMasked.fa
    maskOutFa urchin.softMasked.fa hard urchin.hardMasked.fa
    faToTwoBit urchin.softMasked.fa strPur1.2bit
    # faToTwoBit urchin.hardMasked.fa strPur1.hardMasked.2bit
    mv *.2bit ../
    cd ../
    # Load into database
    ssh hgwdev
    hgsql strPur1 < ~/kent/src/hg/lib/chromInfo.sql
    twoBitInfo strPur1.2bit /dev/stdout | 
        awk '{printf("%s\t%s\t/gbdb/strPur1/strPur1.2bit\n", $1, $2)}' > chrom.sizes
    echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql strPur1

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/15/2005 Andy)
    # Copy all the data from the table "grp" 
    # in an existing database to the new database
    ssh hgwdev
    hgsql strPur1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 4/15/2005 Andy)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
             defaultPos, active, orderKey, genome, scientificName, \
             htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("strPur1", "April 2005", "/gbdb/strPur1", "S. purpuratus", \
             "Scaffold3158:1-12000", 1, 80, \
             "S. purpuratus", \
             "Strongylocentrotus purpuratus", "/gbdb/strPur1/html/description.html", \
             0, 0, "Baylor");' \
      | hgsql -N hgcentraltest
    echo 'insert into defaultDb (genome, name) values ("S. purpuratus", "strPur1");' \
      | hgsql -N hgcentraltest
    echo 'insert into genomeClade (genome, clade, priority) values ("S. purpuratus", "deuterostome", "30");' \
      | hgsql -N hgcentraltest
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvs update
    # Edit trackDb/makefile to add strPur1 to the DBS variable.
    mkdir -p urchin/strPur1
    touch urchin/strPur1/description.html
    # Create a simple worm/strPur1/description.html file.  In the initial case it's just empty.
    cvs add urchin
    cd urchin
    cvs add strPur1
    cvs add strPur1/description.html
    #make update DBS=strPur1 ZOO_DBS=
    # go public on genome-test
    cvs ci -m "Added strPur1 (sea urchin)." makefile
    cvs ci -m "Empty initial trackDb.ra and description.html for strPur1" urchin
    # in a clean, updated tree's kent/src/hg/makeDb/trackDb:
    make alpha

# SOME GBDB STUFF (DONE 4/15/2005 Andy)
    ssh hgwdev
    mkdir -p /gbdb/strPur1
    mkdir /cluster/data/strPur1/html
    ln -s /cluster/data/strPur1/html /gbdb/strPur1/html
    ln -s /cluster/data/strPur1/strPur1.2bit /gbdb/strPur1/strPur1.2bit

# MAKE GC5BASE WIGGLE TRACK (DONE 4/19/2005 Andy)
    ssh hgwdev
    mkdir /cluster/data/strPur1/bed/gc5Base
    cd /cluster/data/strPur1/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 strPur1 \
       /cluster/data/strPur1 | wigEncode stdin gc5Base.wig gc5Base.wib
    mkdir /gbdb/strPur1/wib
    ln -s `pwd`/gc5Base.wib /gbdb/strPur1/wib
    hgLoadWiggle -pathPrefix=/gbdb/strPur1/wib strPur1 gc5Base gc5Base.wig

## MAKE DOWNLOADABLE FILES (DONE 4/19/2005 Andy)
    ssh eieio
    cd /cluster/data/strPur1
    mkdir zips
    zip -j zips/allOut.zip RMOut/rmsk.lifted.out
    zip -j zips/allFa.zip maskedFa/*.fa
    zip -j zips/allTrf.zip bed/simpleRepeat/trfMask.bed    
    zip -j zips/allAgp.zip downloads/urchin.agp
    cd maskedFa/
    maskOutFa urchin.softMasked.fa hard urchin.hardMasked.fa
    cd ../
    zip -j zips/allFaMasked.zip maskedFa/urchin.hardMasked.fa
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/strPur1
    cd /usr/local/apache/htdocs/goldenPath/strPur1
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cd bigZips/
    cp -p /cluster/data/strPur1/zips/*.zip .
    md5sum *.zip > md5sum.txt

# MAKE 11.OOC FILE FOR BLAT (DONE 4/19/2005 Andy)
    ssh hgwdev
    mkdir -p /panasas/store/strPur1
    blat /cluster/data/strPur1/strPur1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/panasas/store/strPur1/11.ooc -repMatch=300

# PRODUCING GENSCAN PREDICTIONS (DONE 4/19/2005 Andy)
    ssh eieio
    # Make hard-masked scaffolds and split up for processing:
    cd /panasas/store/strPur1
    mkdir hardMaskedFaSplits
    cd hardMaskedFaSplits/
    faSplit -outDirDepth=3 -lift=hardMasked.lft gap /cluster/data/strPur1/maskedFa/urchin.hardMasked.fa 1000000 .
    mkdir /cluster/data/strPur1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    ssh hgwdev
    mkdir -p /cluster/data/strPur1/bed/genscan
    cd /cluster/data/strPur1/bed/genscan
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to put their output files in
    mkdir -p /panasas/store/strPur1/genscan   
    find /panasas/store/strPur1/hardMaskedFaSplits -name '*.fa' | \
         sed 's/\/panasas\/store\/strPur1\/hardMaskedFaSplits\///'> chunks.lst
    cat << _EOF_ > genscan.sh
#!/bin/bash
outdir=\`dirname \$2\`
mkdir -p \$outdir
gsBig \$*
_EOF_
    chmod +x genscan.sh
    cat << _EOF_ > gsub
#LOOP
./genscan.sh {check in line+ /panasas/store/strPur1/hardMaskedFaSplits/\$(path1)} {check out line /panasas/store/strPur1/genscan/\$(dir1)/\$(root1).gtf} -trans={check out line /panasas/store/strPur1/genscan/\$(dir1)/\$(root1).pep} -subopt={check out line /panasas/store/strPur1/genscan/\$(dir1)/\$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
_EOF_

    # << this line keeps emacs coloring happy
    gensub2 chunks.lst single gsub jobList
    ssh kk
    cd /cluster/data/strPur1/bed/genscan
    para create jobList
    para try
    para check
    para push 
    para time
#Completed: 187943 of 187943 jobs
#CPU time in finished jobs:     154777s    2579.62m    42.99h    1.79d  0.005 y
#IO & Wait Time:               2009533s   33492.21m   558.20h   23.26d  0.064 y
#Average job time:                  12s       0.19m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             992s      16.53m     0.28h    0.01d
#Submission to last job:          5577s      92.95m     1.55h    0.06d
    # Lift and cat things
    ssh eieio
    cd /cluster/data/strPur1/bed/genscan
    find /panasas/store/strPur1/genscan/ -name "*.gtf" -exec cat '{}' ';' > genscan.gtf 
    find /panasas/store/strPur1/genscan/ -name "*.pep" -exec cat '{}' ';' > genscan.pep 
    find /panasas/store/strPur1/genscan/ -name "*.bed" -exec cat '{}' ';' > genscan.bed 
    liftUp genscan.lifted.gtf /panasas/store/strPur1/hardMaskedFaSplits/hardMasked.lft warn genscan.gtf
    liftUp genscanSubopt.lifted.bed /panasas/store/strPur1/hardMaskedFaSplits/hardMasked.lft warn genscan.bed
    # Clean up
    rm -rf /panasas/store/strPur1/hardMaskedFaSplits 
    # Load into the database:
    cd /cluster/data/strPur1/bed/genscan
    ldHgGene -gtf strPur1 genscan genscan.lifted.gtf
    hgPepPred strPur1 generic genscanPep genscan.pep
    hgLoadBed strPur1 genscanSubopt genscanSubopt.lifted.bed

# GENBANK mRNA AND EST COUNTS (DONE 4/20/2005 Andy)
    # Go to the latest GenBank full release dir and get an idea of how
    # many mRNAs and ESTs there are to align.
    ssh eieio
    cd /cluster/data/genbank/data/processed/genbank.146.0/full
    grep purpuratus mrna.gbidx | wc -l
#302
    grep purpuratus est.*gbidx | wc -l
#89289
    cd /cluster/data/genbank/data/processed/refseq.10/full
    grep purpuratus mrna.gbidx | wc -l
#219

# AUTO UPDATE GENBANK MRNA RUN  (DONE 4/23/2005 Andy)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvs update
    cd etc/
    # Edit genbank.conf
strPur1.genome = /iscratch/i/strPur1/strPur1.2bit
strPur1.mondoTwoBitParts = 5000
strPur1.lift = no
strPur1.refseq.mrna.native.load = yes
strPur1.refseq.mrna.xeno.load = yes
strPur1.refseq.mrna.xeno.pslReps = -minCover=0.25 -minAli=0.15 -nearTop=0.005
strPur1.genbank.mrna.xeno.load = yes
strPur1.genbank.est.native.load = no
strPur1.genbank.est.xeno.load = no
strPur1.downloadDir = strPur1
strPur1.perChromTables = no

    cvs commit -m 'strPur1 added to genbank update.' etc/genbank.conf 
    # Edit src/align/gbBlat to add /panasas/store/strPur1/11.ooc
    cvs commit -m 'Sea urchin added' src/align/gbBlat
    # Install to /cluster/data/genbank:
    make install-server
    ssh `fileServer /cluster/data/genbank/`
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial strPur1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad strPur1 &
    featureBits strPur1 all_mrna
#13122660 bases of 835421305 (1.571%) in intersection
    featureBits strPur1 xenoMrna
    # Clean up:
    rm -rf work/initial.strPur1

# HGGATEWAY IMAGE (picture used with permission) (DONE 5/7/2005 Andy)
    ssh hgwdev
    export CVSROOT=/projects/hg/cvsroot
    cvs co browser/images
    cd browser/images
    wget -O S_purpuratus.jpg http://biology.fullerton.edu/courses/biol_317/Web/murray/Fall01/Stronglyocentrotus%20purpuratus.JPG
    mogrify -geometry 200x200 S_purpuratus.jpg
    cvs add S_purpuratus.jpg
    cvs commit S_purpuratus.jpg

# BLASTZ CIOSAV1 (DONE 5/23/2005 Andy)
    ssh eieio
    cd /cluster/data/strPur1/bed
    mkdir blastz.cioSav1.2005-05-20
    cd blastz.cioSav1.2005-05-20/
    cat << _EOF_ > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_L=6000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - Sea urchin
SEQ1_DIR=/iscratch/i/strPur1/strPur1.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000 

# QUERY - Sea squirt
SEQ2_DIR=/iscratch/i/cioSav1/nib/cioSav1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000

BASE=/cluster/data/strPur1/bed/blastz.cioSav1.2005-05-20

DEF=\$BASE/DEF
RAW=\$BASE/raw
CDBDIR=\$BASE
SEQ1_LEN=/panasas/store/strPur1/S1.len
SEQ2_LEN=\$BASE/S2.len
_EOF_
    # << this line keeps emacs coloring happy
    
    sort -rn +1 /cluster/data/strPur1/chrom.sizes | cut -f1-2 > /panasas/store/strPur1/S1.len
    cp /panasas/store/strPur1/S1.len .
    twoBitInfo /cluster/data/cioSav1/cioSav1.2bit /dev/stdout | sort -rn +1 > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \
	blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:

# BLASTZ CI1 (DONE 5/23/2005 Andy)
    ssh eieio
    cd /cluster/data/strPur1/bed
    mkdir blastz.ci1.2005-07-14
    ln -s blastz.ci1.2005-07-14 blastz.ci1
    cd blastz.ci1/
    twoBitInfo /cluster/data/ci1/ci1.2bit /dev/stdout | sort -rn +1 > S2.len
    cat << "_EOF_" > DEF
# D. melanogaster vs. T. castaneum

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

BASE=/cluster/data/strPur1/bed/blastz.ci1

# QUERY - Sea urchin
SEQ2_DIR=/santest/scratch/strPur1/strPur1.2bit
SEQ2_CHUNK=20000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/strPur1/chrom.sizes

# TARGET - Sea squirt
SEQ1_DIR=/santest/scratch/ci1/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=$BASE/S2.len
_EOF_
#<< Emacs
    screen
    doBlastzChainNet.pl DEF -blastzOutRoot \
        /santest/scratch/bz.strPur1.ci1 >& do.log &
    screen -d

# REDO REPEATMASKING (DONE 8/3/2005 Andy)
    # Currently the repeat-masking is too low-coverage.
    ssh hgwdev
    /cluster/bluearc/RepeatMasker/util/queryRepeatDatabase.pl -species Deuterostomia -stat
#Total Sequence Length = 3617796 bp
    mv RMOut RMOut.bad
    mv RMRun RMRun.bad
    mkdir rmsk
    cd rmsk/
    mkdir -p /santest/scratch/rmsk/run /santest/scratch/rmsk/out
    ln -s /santest/scratch/rmsk/run run
    ln -s /santest/scratch/rmsk/out out
    cd run/
    cat << "_EOF_" > rmsk.sh
#!/bin/bash 
animal=Deuterostomia
genome=strPur1
outloc=`dirname $2`
input=`basename $1`
tmp=/scratch/${genome}.$$

mkdir $tmp
mkdir -p $outloc
cd $outloc
pushd $tmp
cp $1 .
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec $animal $input
popd
cp $tmp/${input}.out .
rm -fr $tmp
_EOF_
    # << this line makes emacs coloring happy
    chmod +x rmsk.sh
    find /panasas/store/strPur1/unmaskedSplits -name "*.fa" > splits.lst
    cat << "_EOF_" > gsub
#LOOP
./rmsk.sh {check in line+ $(path1)} {check out line+ ../out/$(lastDirs1=3)/$(root1).fa.out}
#ENDLOOP
_EOF_

    # Tested out OK, run this when cluster is back online.
    ~/kent/src/parasol/bin/gensub2 splits.lst single gsub spec
    ssh kk
    cd /cluster/data/strPur1/rmsk/run
    para create spec
    para push
    para time
    
    # Cat and lift.
    ssh hgwdev
    cd /cluster/data/strPur1/rmsk/out
    # Find one with a header. ls -l 0/0/0  
    head -n 3 0/0/0/97000.fa.out > header
    find . -name "*.fa.out" -exec tail +4 '{}' ';' > rest
    cat header rest > all.out
    rm header rest
    liftUp rmsk.lifted.out ../unmaskedSplits/splits.lft warn all.out
    rm all.out

    # Load results... before shot:
    featureBits -or strPur1 rmsk simpleRepeat
#90840012 bases of 835421305 (10.874%) in intersection
    hgLoadOut strPur1 rmsk.lifted.out
    hgsql strPur1 -e 'rename table rmsk_rmsk to rmsk'
    hgsql strPur1 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(11), bin); \
                  create index genoStart on rmsk (genoName(11), genoStart); \
                  create index genoEnd   on rmsk (genoName(11), genoEnd);' 
    # So how did we do?
    featureBits -or strPur1 rmsk simpleRepeat
#94138492 bases of 835421305 (11.268%) in intersection
    # Well... that sucks too.
    # Move stuff off santest.
    ssh hgwdev
    cd /cluster/data/strPur1/rmsk
    rm out run
    cp -r /santest/scratch/strPur1/rmsk/* .
    rm -rf /santest/scratch/strPur1/rmsk

# REPEAT SCOUT DENOVO REPEAT FINDER (DONE 9/9/2005 Andy)
    ssh kolossus
    cd /scratch/andy
    cp /cluster/data/strPur1/downloads/urchinScaffolds.fa .
    wget http://repeatscout.bioprojects.org/RepeatScout-1.0.0.tar.gz
    tar xfz RepeatScout-1.0.0.tar.gz
    cd RepeatScout-1.0.0
    make
    cd ../
    screen
    ./RepeatScout-1.0.0/build_lmer_table -l 15 -sequence urchinScaffolds.fa -freq lmers.txt -v >& rscout.1.log
    # Control+a+d
    # Also re-niced this process: nice +10
    # Started about 2:10pm
    # Finished around 3:15pm.  
    cat rscout.1.log
#  Done allocating headptr
#  Done building headptr
#  There are 66717762 l-mers
#  Done sorting headptr
    ./RepeatScout-1.0.0/RepeatScout-v1 -seq urchinScaffolds.fa -output rscout.out -freqlmers.txt -l 15
    # THIS DOESN'T RUN
    # Hahaha.  Dumb cause of problem: -seq should be -sequence.  However, after 
    # exchanging a ton of e-mail with Neil Jones at UCSD, it's become apparent
    # that -l shouldn't be specified normally.  1.0.0 needed -l, so Neil 
    # put 1.0.1 online.  In addition, it's clear that the RepeatScout-v1 program
    # takes much longer to run than build_lmer_table, so that's becoming a cluster
    # run now.
    #
    # OK I'm basically starting over:
    ssh hgwdev
    cd /santest/scratch/strPur1
    mkdir -p repeatScout/splits
    cd repeatScout/splits/
    faSplit about /cluster/data/strPur1/downloads/urchinScaffolds.fa 2000000 rs_urch_
    cd ../
    wget -r ftp://ftp.ncbi.nih.gov/pub/seg/nseg/
    cd ftp.ncbi.nih.gov/pub/seg/nseg/
    make
    cp nseg nmerge /cluster/bin/i386
    ssh kolossus
    cd /scratch/andy
    wget http://repeatscout.bioprojects.org/RepeatScout-1.0.1.tar.gz
    tar xfz RepeatScout-1.0.1.tar.gz
    cd RepeatScout-1.0.1/
    make
    cp RepeatScout-v1 build_lmer_table /cluster/bin/x86_64
    cp compare-out-to-gff.prl filter-stage-{1,2}.prl /cluster/bin/scripts
    ssh pk
    cd /san/sanVol1/scratch/strPur1/repeatScout
    mkdir run
    cd run/
    mkdir ../outputs/ ../logs ../freqs/
    ls -1 ../splits/ > fa.lst
    cat > rscout.sh << "_EOF_"
#!/bin/bash
tmp=/scratch/`basename $1 .fa`
tmpFreq=$tmp.lmers.txt
tmpOut=$tmp.out
tmpLog=$tmp.log

/cluster/bin/x86_64/build_lmer_table -sequence $1 -freq $tmpFreq -v >& $tmpLog
/cluster/bin/x86_64/RepeatScout-v1 -sequence $1 -freq $tmpFreq -output $tmpOut >& $tmpLog.2

cp $tmpFreq $2
cp $tmpOut $3
cp $tmpLog $4
cp $tmpLog.2 $4.2

rm ${tmp}*
_EOF_
    cat > gsub << "EOF"
#LOOP
./rscout.sh ../splits/$(path1) ../freqs/ ../outputs/ ../logs/
#ENDLOOP 
EOF
    chmod +x rscout.sh
    gensub2 single fa.lst gsub spec
    para create spec
    para try
    para push
    para time
#533 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 533 of 533 jobs
#CPU time in finished jobs:      29317s     488.61m     8.14h    0.34d  0.001 y
#IO & Wait Time:                  2552s      42.54m     0.71h    0.03d  0.000 y
#Average job time:                  60s       1.00m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              88s       1.47m     0.02h    0.00d
#Submission to last job:           381s       6.35m     0.11h    0.00d
    cd ../
    R=0
    for out in outputs/*; do
       numR=`grep "R=" $out | wc -l`
       awk 'BEGIN{FS = "="};{if (/=/) {printf(">R=%d\n", $2 + "'"$R"'")} else print $1}' $out >> all.out
       R=$(($R + $numR))
    done    
    mkdir libSplits
    faSplit 400 all.out libSplits/all_
    mkdir run2 filtered
    cd run2/
    cat > filter1.sh << "EOF"
#!/bin/bash
input=`basename $1`
output=`basename $2`
cp $1 /scratch
pushd /scratch
/cluster/bin/scripts/filter-stage-1.prl < $input > $output
popd
cp /scratch/$output $2
rm /scratch/$input
rm /scratch/$output
EOF
    chmod +x filter1.sh
    cat > gsub << "EOF"
#LOOP
./filter1.sh {check in line+ ../libSplits/$(path1)} {check out line+ ../filtered/$(root1).lib}
#ENDLOOP 
EOF
    ls -1 ../libSplits/ > lib.lst
    gensub2 lib.lst single gsub spec
    para create spec
    para push
    para time
#384 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 383 of 384 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:       1149s      19.15m     0.32h    0.01d  0.000 y
#IO & Wait Time:                  1448s      24.14m     0.40h    0.02d  0.000 y
#Average job time:                   7s       0.11m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              66s       1.10m     0.02h    0.00d
#Submission to last job:           303s       5.05m     0.08h    0.00d
    # The crashed job actually worked.
    cd ../
    for f in filtered/*; do cat $f >> filtered.lib; done
    cd ../
    mkdir -p rmsk.repeatScout/{run,out}
    ssh kkstore02
    cd /cluster/data/strPur1
    cp -r unmaskedSplits/ /san/sanVol1/scratch/strPur1
    exit
    cd rmsk.repeatScout/run/
    cat > rmsk.sh << "EOF"
#!/bin/bash

thelib=/san/sanVol1/scratch/strPur1/repeatScout/filtered.lib
fa=`basename $1`

pushd /scratch
oldDir=`dirs +1`
cp $oldDir/$1 .
/cluster/bluearc/RepeatMasker/RepeatMasker -lib $thelib $fa
mkdir -p $oldDir/`dirname $2`
cp $fa.out $oldDir/$2
rm $fa*
popd
EOF
    chmod +x rmsk.sh
    find ../../unmaskedSplits/ -name "*.fa" > fa.lst
    cat > gsub << "EOF"
#LOOP
./rmsk.sh {check in line+ $(path1)} {check out exists ../out/$(lastDirs1=3)/$(file1).out} 
#ENDLOOP
EOF
    ~/kent/src/parasol/bin/gensub2 fa.lst single gsub spec
    para create spec 
    para try
    para push
    para time
#Completed: 187983 of 187983 jobs
#CPU time in finished jobs:    8288262s  138137.70m  2302.29h   95.93d  0.263 y
#IO & Wait Time:                757852s   12630.87m   210.51h    8.77d  0.024 y
#Average job time:                  48s       0.80m     0.01h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            4578s      76.30m     1.27h    0.05d
#Submission to last job:         50036s     833.93m    13.90h    0.58d
    ssh hgwdev
    cd /santest/scratch/strPur1/rmsk.repeatScout/  
    find out/ -name "*.fa.out" -type f -size +80c -exec tail +4 '{}' ';' > initial.noheader.out
    head -n 3 out/0/0/0/94000.fa.out > rmsk.header
    cat rmsk.header initial.noheader.out > initial.out
    # 100
    cat ../repeatScout/filtered.lib | filter-stage-2.prl --cat=initial.out --thresh=100 > 100.lib
    grep '^>' 100.lib | sed 's/^>\(R=[[:digit:]]\+\).*/\1/' > 100.lst
    tabGrep 100.lst 10 initial.noheader.out > 100.noheader.out
    cat rmsk.header 100.noheader.out > 100.out
    liftUp 100.lifted.out ../unmaskedSplits/splits.lft warn 100.out
    hgLoadOut -nosplit -table=repeatScout100 strPur1 100.lifted.out
    # 300
    cat ../repeatScout/filtered.lib | filter-stage-2.prl --cat=initial.out --thresh=300 > 300.lib
    grep '^>' 300.lib | sed 's/^>\(R=[[:digit:]]\+\).*/\1/' > 300.lst
    tabGrep 300.lst 10 initial.noheader.out > 300.noheader.out
    cat rmsk.header 300.noheader.out > 300.out
    liftUp 300.lifted.out ../unmaskedSplits/splits.lft warn 300.out
    # 1000
    cat ../repeatScout/filtered.lib | filter-stage-2.prl --cat=initial.out --thresh=1000 > 1000.lib
    grep '^>' 1000.lib | sed 's/^>\(R=[[:digit:]]\+\).*/\1/' > 1000.lst
    tabGrep 1000.lst 10 initial.noheader.out > 1000.noheader.out
    cat rmsk.header 1000.noheader.out > 1000.out
    liftUp 1000.lifted.out ../unmaskedSplits/splits.lft warn 1000.out
    # Yeah ok finally a reduction in the amount of output.  It's probably better to filter on score 
    # versus copy #, but that has a bias towards really long repeats, and I'm not sure that's what
    # is desirable in terms of guessing what's biologically correct.

# MAKE SOFT-MASKED FA/2BIT WITH REPEATSCOUT (DONE 9/9/2005 Andy)
    ssh eieio
    cd /cluster/data/strPur1
    mkdir repeatScout/
    cd repeatScout/
    cp /san/sanVol1/scratch/strPur1/rmsk.repeatScout/100.lifted.out .
    cd ../maskedFa/
    maskOutFa -soft ../downloads/urchinScaffolds.fa ../bed/simpleRepeat/trfMask.bed urchin.softMasked.fa 
    maskOutFa -softAdd urchin.softMasked.fa ../repeatScout/100.lifted.out urchin.softWithRepeatScout.fa
    faSize urchin.softWithRepeatScout.fa 
#1096072416 bases (260724172 N's 835348244 real 595793115 upper 239555129 lower) in 187943 sequences in 1 files
#Total size: mean 5831.9 sd 25954.4 min 100 (Scaffold188639) max 962155 (Scaffold1) median 909
#N count: mean 1387.3 sd 13142.5
#U count: mean 3170.1 sd 12856.6
#L count: mean 1274.6 sd 3702.0    
    # That's like 40% masked... OK that's too much.

# PILER DENOVO REPEAT FINDING (STALLED 8/4/2005 Andy)
    ssh hgwdev
    cd /cluster/data/strPur1
    mkdir -p piler/src
    cd piler/src/
    wget http://www.drive5.com/piler/piler_source.tar.gz
    wget http://www.drive5.com/muscle/downloads3.52/muscle3.52_src.tar.gz
    wget http://www.drive5.com/pals/pals_source.tar.gz
    tar xfz muscle3.52_src.tar.gz
    tar xfz pals_source.tar.gz
    tar xfz piler_source.tar.gz
    mkdir pals piler 
    mv pals*.gz pals
    mv piler*.gz piler
    rm muscle*.gz
    cd muscle3.52_src/
    sed 's/pentiumpro/opteron/g' Makefile > Makefile.opteron
    mv Makefile Makefile.default
    mv Makefile.opteron Makefile
    make
    cp muscle /cluster/bin/x86_64
    cd ../pals/
    tar xfz pals_source.tar.gz
    rm pals_source.tar.gz
    sed 's/pentiumpro/opteron/g' Makefile > Makefile.opteron
    mv Makefile Makefile.default
    mv Makefile.opteron Makefile
    make
    cp pals /cluster/bin/x86_64
    cd ../piler/
    tar xfz piler_source.tar.gz
    rm piler_source.tar.gz
    sed 's/pentiumpro/opteron/g' Makefile > Makefile.opteron
    mv Makefile Makefile.default
    mv Makefile.opteron Makefile
    make
    cp piler /cluster/bin/x86_64
    # Split up genome into 20 chunks on SAN.
    ssh kkstore02
    cd /cluster/data/strPur1/piler
    mkdir splits
    cd splits/
    faSplit sequence ../../downloads/urchinScaffolds.fa 20 urch_    
    ssh pk
    cd /cluster/data/strPur1/piler
    mkdir -p /san/sanvol1/scratch/strPur1/piler/splits
    cp splits/* /san/sanvol1/scratch/strPur1/piler/splits
    rm -rf splits/
    ln -s /san/sanvol1/scratch/strPur1/piler/splits splits
    mkdir -p /san/sanvol1/scratch/strPur1/piler/palsSelfOut  
    ln -s /san/sanvol1/scratch/strPur1/piler/palsSelfOut
    mkdir -p /san/sanvol1/scratch/strPur1/piler/palsSelfRun
    ln -s /san/sanvol1/scratch/strPur1/piler/palsSelfRun
    cd palsSelfRun/
    ls -1 ../splits/ > fa.lst
cat > doPalsSelf.sh << "EOF"
#!/bin/sh
tmp=/scratch/`basename $2` > /cluster/bin/x86_64/pals -self $1 -out $tmp > cp $tmp $2
rm $tmp
EOF
    chmod +x doPalsSelf.sh 
    #<<;
    cat > gsub << "EOF"
#LOOP
./doPalsSelf.sh {check in line+ ../splits/$(path1)} {check out line+ ../palsSelfOut/pals_$(root1).gff}
#ENDLOOP
EOF
    # <<;
    gensub2 fa.lst single gsub spec
    para create spec
    para push
    para time
#Completed: 19 of 20 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:      12863s     214.39m     3.57h    0.15d  0.000 y
#IO & Wait Time:                   161s       2.68m     0.04h    0.00d  0.000 y
#Average job time:                 685s      11.42m     0.19h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1308s      21.80m     0.36h    0.02d
#Submission to last job:          1356s      22.60m     0.38h    0.02d
    # The crashing one is a PROBLEM.  Andy needs to e-mail Bob Edgar about this.

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 12/12/2005 Andy)
    ssh hgwdev
    hgsql -N hgcentraltest -e 'insert into blatServers values("strPur1", "blat19", "17782", 1, 0);
          insert into blatServers values("strPur1", "blat19", "17783", 0, 1);'

# HUMAN PROTEINS TBLASTN (DONE 12/14/2005 Andy)
    # Split up sea urchin
    ssh kkstore02
    cd urch/
    twoBitToFa strPur1.2bit urchin.fa
    mkdir splits
    faSplit sequence splits.fa 60 splits/urch_
    mkdir /san/sanVol1/scratch/strPur1/tblastn.{hg17KG,splits}
    cp splits/* /san/sanVol1/scratch/strPur1/tblastn.splits
    
    # Create blast databases
    ssh kolossus
    cd /san/sanVol1/scratch/strPur1/tblastn.hg17KG
    awk '{printf("0\t%s\t%s\t%s\t%s\n", $1, $2, $1, $2)}' /cluster/data/strPur1/chrom.sizes > garbage.lft
    ln -s ../../andy/blast
    mkdir splits
    cd splits/
    ln -s ../../tblastn.splits/* .
    for fa in urch*.fa; do
       split=${fa%.fa}; echo $split
       ../blast/bin/formatdb -i $fa -pF -n $split
    done
    rm *.fa formatdb.log 

    # protein setup
    mkdir humanKgExons
    cd humanKgExons/
    splitFile /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl 38 kgExons_
    for exonPsl in *; do pslxToFa $exonPsl $exonPsl.fa; rm $exonPsl; done

    # blast run
    ssh pk
    cd san/strPur1/tblastn.hg17KG/
    ls -1S humanKgExons/*.fa > exons.lst
    ls -1S splits/urch*.nsq > targets.lst
    cp /cluster/data/hg17/bed/blat.hg17KG/protein.lft .
    cp /san/sanVol1/scratch/andy/tblastn{Exons,Gsub} .
    mkdir blastOut/
    gensub2 targets.lst exons.lst tblastnGsub spec
    para create spec
    para try
    para push
    para time
#Completed: 59040 of 59040 jobs
#CPU time in finished jobs:    6544747s  109079.12m  1817.99h   75.75d  0.208 y
#IO & Wait Time:                186223s    3103.72m    51.73h    2.16d  0.006 y
#Average job time:                 114s       1.90m     0.03h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             300s       5.00m     0.08h    0.00d
#Submission to last job:         20123s     335.38m     5.59h    0.23d
    
    # Chaining part
    cat << "EOF" > tblastnChain
#!/bin/bash
dir=$1
echo $dir
cat blastOut/${dir}/*.psl | simpleChain -prot -outPsl /dev/stdin simpleChain/c.${dir}.psl 
awk "(\$13 - \$12)/\$11 > 0.6 {print}" simpleChain/c.${dir}.psl > simpleChain/c60.${dir}.psl
sort -rn simpleChain/c60.${dir}.psl | pslUniq stdin simpleChain/u.${dir}.psl
awk "((\$1 / \$11) ) > 0.60 { print   }" simpleChain/c60.${dir}.psl > simpleChain/m60.${dir}.psl
EOF
    chmod +x tblastnChain
    for d in `cat dirs.lst`; do 
        ./tblastnChain $d
    done        
    cd simpleChain/
    sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > ../blastHg17KG.psl

    # Loading
    ssh hgwdev
    cd /cluster/data/strPur1/bed/tblastn.hg17KG
    hgLoadPsl strPur1 blastHg17KG.psl

# HUMAN BLASTZ (DONE 2/9/2006 Andy)
    ssh hgwdev
    cd san/strPur1/
    mkdir blastz.hg18.2006-02-09
    ln -s blastz.hg18.2006-02-09 blastz.hg18
    cd blastz.hg18/
    cp /cluster/data/strPur1/{strPur1.2bit,chrom.sizes} .
    cat << "_EOF_" > DEF
# Sea urchin vs. Human

export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/h
ome/angie/schwartzbin:/parasol/bin

BASE=/san/sanVol1/scratch/strPur1/blastz.hg18.2006-02-09

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=$BASE/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - Human hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes

# QUERY - Sea urchin
SEQ2_DIR=$BASE/strPur1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=$BASE/chrom.sizes
SEQ2_LIMIT=5000

TMPDIR=/scratch/tmp
_EOF_
   screen
   doBlastzChainNet.pl -bigClusterHub pk -smallClusterHub pk DEF >& run.log &
   screen -d
   # monitor and continue where it breaks
   ./cleanUp.csh
   cd ../
   cp -r blastz.apiMel2.2005-10-22/ /cluster/data/triCas2/bed/

# CIONA BLASTZ (DONE 2/9/2006 Andy)
    ssh hgwdev
    cd san/strPur1/
    mkdir blastz.ci2.2006-02-10
    ln -s blastz.ci2.2006-02-10 blastz.ci2
    cd blastz.hg18/
    cp /cluster/data/strPur1/{strPur1.2bit,chrom.sizes} .
    cat << "_EOF_" > DEF
# Sea urchin vs. Human

export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/h
ome/angie/schwartzbin:/parasol/bin

BASE=/san/sanVol1/scratch/strPur1/blastz.ci2.2006-02-10

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=$BASE/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - Human hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes

# QUERY - Sea urchin
SEQ2_DIR=$BASE/strPur1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=$BASE/chrom.sizes
SEQ2_LIMIT=5000

TMPDIR=/scratch/tmp
_EOF_
   screen
   doBlastzChainNet.pl -bigClusterHub pk -smallClusterHub pk DEF >& run.log &
   screen -d
   # monitor and continue where it breaks
   ./cleanUp.csh
   cd ../
   cp -r blastz.apiMel2.2005-10-22/ /cluster/data/triCas2/bed/


# REFSEQ MRNA (TRIED 2/8/2006)
    # I already have genbank.conf configured.
    ssh `fileServer /cluster/data/genbank/`
    cd /cluster/data/genbank
    nice ./bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -verbose=1 -initial strPur1 &
  
# GENBANK EST
    ssh kkstore02
    cd /cluster/data/genbank
    screen
    nice ./bin/gbAlignStep -srcDb=genbank -verbose=1 -initial strPur1 &

# QUALITY SCORES (DONE 2/13/06 Andy)
    ssh kkstore02
    mkdir /cluster/data/strPur1/bed/quality
    cd /cluster/data/strPur1/bed/quality
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/Spur20041123-contigs.qual.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/Spur20041123-reptigs.qual.gz
    sed 's/AAG.*|//' ../../downloads/urchin.agp > urchin.agp
    zcat Spur20041123-*.gz > all.qual
    qaToQac all.qual all.qac
    qacAgpLift urchin.agp all.qac lifted.qac
    qacToWig -fixed lifted.qac stdout \
      | wigEncode stdin strPur1.{wig,wib}
    ssh hgwdev
    cd /cluster/data/strPur1/bed/quality
    mkdir -p /gbdb/strPur1/wib
    ln -s `pwd`/strPur1.wib /gbdb/strPur1/wib
    hgLoadWiggle strPur1 quality strPur1.wig

# GENSCAN PREDICTIONS AGAIN (DONE 2/27/2006 Andy)
    # 3 tables are messed up.  liftUp didn't give the split 
    hgsql strPur1 -e "select * from genscan" | tail +2 | sort -k2,2 -k4,4n > genscan.txt
    hgsql strPur1 -e "select * from genscanPep" | tail +2  > genscanPep.txt
    echo > genscanClean.py << "_EOF_"
import sys, string

# Deal with first table
genscanTable = open(sys.argv[1])
genscanTableNew = sys.argv[1] + ".new"
output = open(genscanTableNew, 'w')
lines = genscanTable.readlines()
counter = 1
scaff = ""
hash = {}
for line in lines:
    words = string.split(line.strip(), '\t', 2)
    if (words[1] != scaff):
	counter = 1
	scaff = words[1]
    newname = scaff + ".%d" %(counter)
    hash[words[0]] = newname
    words[0] = newname
    counter = counter + 1
    output.write(string.join(words,'\t') + '\n')
output.close()
genscanTable.close()
print("Done with main table")
# Deal with Pep table
pepTable = open(sys.argv[2])
pepTableNew = sys.argv[2] + ".new"
lines = pepTable.readlines()
output = open(pepTableNew, 'w')
for line in lines:
    words = string.split(line.strip(), '\t')
    words[0] = hash[words[0]]
    output.write(string.join(words, '\t') + '\n')
output.close()
pepTable.close()
print("All done")
_EOF_
    python genscanClean.py genscan.txt genscanPep.txt
    hgsqldump -d strPur1 genscanPep > genscanPep.sql
    hgsql strPur1 -e 'drop table genscanPep'
    hgLoadSqlTab strPur1 genscanPep genscanPep.sql genscanPep.txt.new 
    ldHgGene -predTab strPur1 genscan genscan.txt.new 
    rm genscan*{new,txt,sql}

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-26 )
    ssh kkstore02
    bash # if not using bash shell already

    mkdir /cluster/data/strPur1/blastDb
    cd /cluster/data/strPur1
    faToTwoBit strPur1.2bit temp.fa
    faSplit sequence temp.fa 500 blastDb/
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    mkdir -p /san/sanvol1/scratch/strPur1/blastDb
    cd /cluster/data/strPur1/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/strPur1/blastDb
    done

    mkdir -p /cluster/data/strPur1/bed/tblastn.hg18KG
    cd /cluster/data/strPur1/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/strPur1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 494 query.lst

   # we want around 50000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(50000/494) = 362.862760

   mkdir -p /cluster/bluearc/strPur1/bed/tblastn.hg18KG/kgfa
   split -l 360 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/strPur1/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/strPur1/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/strPur1/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit # back to bash
    
    ssh pk
    cd /cluster/data/strPur1/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 50882 of 50882 jobs
# CPU time in finished jobs:    8964073s  149401.22m  2490.02h  103.75d  0.284 y
# IO & Wait Time:               1545368s   25756.13m   429.27h   17.89d  0.049 y
# Average job time:                 207s       3.44m     0.06h    0.00d
# Longest finished job:             983s      16.38m     0.27h    0.01d
# Submission to last job:        133787s    2229.78m    37.16h    1.55d

    ssh kkstore04
    cd /cluster/data/strPur1/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun

    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    exit
    chmod +x chainOne
    ls -1dS /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/strPur1/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para try, check, push, check etc.

# Completed: 103 of 103 jobs
# CPU time in finished jobs:       4863s      81.06m     1.35h    0.06d  0.000 y
# IO & Wait Time:                 88639s    1477.31m    24.62h    1.03d  0.003 y
# Average job time:                 908s      15.13m     0.25h    0.01d
# Longest finished job:            1737s      28.95m     0.48h    0.02d
# Submission to last job:          3936s      65.60m     1.09h    0.05d

    ssh kkstore04
    cd /cluster/data/strPur1/bed/tblastn.hg18KG/blastOut
    bash # if using another shell
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/strPur1/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
    # this is ok.
    # load table 
    ssh hgwdev
    cd /cluster/data/strPur1/bed/tblastn.hg18KG
    hgLoadPsl strPur1 blastHg18KG.psl
    # check coverage
    featureBits strPur1 refGene:cds  blastHg18KG -enrichment
# refGene:cds 0.065%, blastHg18KG 0.898%, both 0.024%, cover 37.31%, enrich 41.56x

    featureBits strPur1 blastHg18KG blastHg17KG  -enrichment
# blastHg18KG 0.898%, blastHg17KG 0.842%, both 0.749%, cover 83.46%, enrich 99.18x
    
    ssh kkstore04
    rm -rf /cluster/data/strPur1/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################

##########################################################################
# CREATE LIFTOVER FROM strPur1 TO strPur2
# STARTED 2008-03-31 kord

# strPur1 -> /cluster/store5/strPur1
# kkstore02-10:/export/cluster/store5 1.5T  1.1T  308G  78% /cluster/store5

ssh kkstore02
mkdir /cluster/data/strPur1/bed/blat.strPur2
cd /cluster/data/strPur1/bed/blat.strPur2

nice time doSameSpeciesLiftOver.pl strPur1 strPur2 \
    -bigClusterHub pk \
    -ooc /cluster/bluearc/strPur1/11.ooc \
    -buildDir /cluster/data/strPur1/bed/blat.strPur2 >do.log 2>&1 &


ssh pk
cd /cluster/data/strPur1/bed/blat.strPur2/run.blat
parasol list batches
#user     run   wait   done crash pri max batch
#kord      396  28560   2904     0  10  -1 /cluster/store5/strPur1/bed/blat.strPur2/run.blat/

# *** All done!
# *** Steps were performed in /cluster/data/strPur1/bed/blat.strPur2
# *** Test installation (/gbdb, goldenPath, hgLiftover operation) on hgwdev.
#
# 1.71user 0.69system 5:50:08elapsed 0%CPU (0avgtext+0avgdata 0maxresident)k
# 0inputs+0outputs (9major+29957minor)pagefaults 0swaps

# remove the symbolic link to liftOver chains and copy over the file
rm ../liftOver/strPur1ToStrPur2.over.chain.gz
cp -p strPur1ToStrPur2.over.chain.gz ../liftOver/

# a link in /usr/local/apache/htdocs/goldenPath/strPur2/liftOver
# has already been made to this file and md5sum.txt needs to be updated
ssh hgwdev 
cd /usr/local/apache/htdocs/goldenPath/strPur1/liftOver
md5sum *.gz > md5sum.txt

