# for emacs: -*- mode: sh; -*-

# Brugia malayi
# TIGR and the Sanger Institute
# "Draft genome of the filarial nematode parasite Brugia malayi.",
# Science, 2007 Sep 21; 317(5845):1756-60
# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2613796/
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAQA01

##############################################################################
## Fetch sequence (DONE - 2010-09-20 - Hiram)

    mkdir -p /hive/data/genomes/bruMal1/ws210
    cd /hive/data/genomes/bruMal1/ws210

    wget --no-parent --timestamping -m -nH --cut-dirs=5 \
	ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/genomes/b_malayi/

    hgFakeAgp -minScaffoldGap=100000 -minContigGap=1 \
	sequences/dna/b_malayi.WS210.dna.fa.gz ucsc.fake.bruMal1.agp

##############################################################################
## Initial browser build (DONE - 2010-09-20 - Hiram)
    cd /hive/data/genomes/bruMal1

    cat << '_EOF_' > bruMal1.config.ra
# Config parameters for makeGenomeDb.pl:
db bruMal1
clade worm
genomeCladePriority 80
scientificName Brugia malayi
commonName B. malayi
assemblyDate Sep. 2007
assemblyLabel TIGR and the Sanger Institute
assemblyShortLabel TIGR 1.0
orderKey 890
mitoAcc NC_004298.1
fastaFiles /hive/data/genomes/bruMal1/ws210/sequences/dna/b_malayi.WS210.dna.fa.gz
agpFiles /hive/data/genomes/bruMal1/ws210/ucsc.fake.bruMal1.agp
# qualFiles none
dbDbSpeciesDir worm
taxId 6279
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	-stop=agp bruMal1.config.ra > agp.log 2>&1
    time makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	-continue=db bruMal1.config.ra > db.log 2>&1
    #	real    0m48.923s

#########################################################################
# REPEATMASKER (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/bruMal1/bed/repeatMasker
    cd /hive/data/genomes/bruMal1/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \
	-buildDir=`pwd` bruMal1 > do.log 2>&1 &
    #	real    74m4.437s

    #	from the do.log:
    #	RepeatMasker version development-$Id:
    #	RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp
    #	CC   RELEASE 20090604;                                            *

    cat faSize.rmsk.txt 
    #	95828100 bases (6592564 N's 89235536 real
    #	79152210 upper 10083326 lower) in 27211 sequences in 1 files
    #	%10.52 masked total, %11.30 masked real

#########################################################################
# SIMPLE REPEATS (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/bruMal1/bed/simpleRepeat
    cd /hive/data/genomes/bruMal1/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -smallClusterHub=memk \
	-buildDir=`pwd` bruMal1 > do.log 2>&1 &
    #	real    96m47.195s
    #	something went haywire with the featureBits measurement
    #	too many sequences in chromInfo makes featureBits take a long time.

#########################################################################
# run window masker (DONE - 2010-09-22 - Hiram)
    mkdir /hive/data/genomes/bruMal1/bed/windowMasker
    cd /hive/data/genomes/bruMal1/bed/windowMasker
    time doWindowMasker.pl -verbose=2 -workhorse=kolossus \
	-buildDir=`pwd` bruMal1 > do.log 2>&1 &
    #	real    1m59.029s

    twoBitToFa bruMal1.wmsk.sdust.2bit stdout | faSize stdin
    #	95828100 bases (6592564 N's 89235536 real 57075076 upper
    #	32160460 lower) in 27211 sequences in 1 files
    #	%33.56 masked total, %36.04 masked real

    #	load this initial data to get ready to clean it
    hgLoadBed bruMal1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 650400 elements of size 3
    featureBits bruMal1 windowmaskerSdust
    #	38704468 bases of 91681645 (42.216%) in intersection

    #	eliminate the gaps from the masking
    time featureBits bruMal1 -not gap -bed=notGap.bed
    #	89235536 bases of 89235536 (100.000%) in intersection
    #	real    0m14.056s
    time nice -n +19 featureBits bruMal1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	32160460 bases of 89235536 (36.040%) in intersection
    #	real    8m44.312s

    #	reload track to get it clean
    hgLoadBed bruMal1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 659259 elements of size 3
    time featureBits bruMal1 windowmaskerSdust \
	> fb.bruMal1.cleanWMask.txt 2>&1 &
    #	real    0m17.374s
    cat fb.bruMal1.cleanWMask.txt 
    #	32160460 bases of 89235536 (36.040%) in intersection

    cd /hive/data/genomes/bruMal1
    #	mask the sequence with this clean mask
    zcat bed/windowMasker/cleanWMask.bed.gz \
	| twoBitMask bruMal1.unmasked.2bit stdin \
	    -type=.bed bruMal1.2bit
    twoBitToFa bruMal1.2bit stdout | faSize stdin \
        > bruMal1.faSize.txt
    cat bruMal1.faSize.txt
    #	95828100 bases (6592564 N's 89235536 real 57075076 upper
    #	32160460 lower) in 27211 sequences in 1 files
    #	%33.56 masked total, %36.04 masked real

    ln -s `pwd`/bruMal1.2bit /gbdb/bruMal1/bruMal1.2bit

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-22 - Hiram)
    # numerator is bruMal1 gapless bases "real" as reported by faSize 
    # denominator is hg19 gapless bases as reported by featureBits,
    #	featureBits hg19 gap
    # 1024 is threshold used for human -repMatch:
    calc \( 89235536 / 2897310462 \) \* 1024
    #	( 89235536 / 2897310462 ) * 1024 = 31.538625
    # 31 is way too small, use 100 to keep the number of overused
    #	11-mers to a smaller number

    cd /hive/data/genomes/bruMal1
    blat bruMal1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=bruMal1.11.ooc -repMatch=100
    #	Wrote 7849 overused 11-mers to bruMal1.11.ooc

    mkdir /hive/data/staging/data/bruMal1
    cp -p bruMal1.2bit chrom.sizes bruMal1.11.ooc \
	/hive/data/staging/data/bruMal1

########################################################################
# SWAP LASTZ (DONE - 2010-09-23 - Hiram)
    # original alignment
    cd /hive/data/genomes/ce9/bed/blastzBruMal1.2010-09-22
    cat fb.ce9.chainBruMal1Link.txt 
    #	4804043 bases of 100286004 (4.790%) in intersection

    #	running swap
    mkdir /hive/data/genomes/bruMal1/bed/blastz.ce9.swap
    cd /hive/data/genomes/bruMal1/bed/blastz.ce9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce9/bed/blastzBruMal1.2010-09-22/DEF \
	-qRepeats=windowmaskerSdust -bigClusterHub=pk \
	-workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 &
    #	real    19m54.633s

    cat fb.bruMal1.chainCe9Link.txt
    #	4869447 bases of 89235536 (5.457%) in intersection

########################################################################
# GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add bruMal1 just before caeJap2

# bruMal1 (Brugia malayi)
bruMal1.serverGenome = /hive/data/genomes/bruMal1/bruMal1.2bit
bruMal1.clusterGenome = /scratch/data/bruMal1/bruMal1.2bit
bruMal1.ooc = /scratch/data/bruMal1/bruMal1.11.ooc
bruMal1.lift = no
bruMal1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
bruMal1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
bruMal1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
bruMal1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
bruMal1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
bruMal1.refseq.mrna.native.load = yes
bruMal1.refseq.mrna.xeno.load  = yes
bruMal1.refseq.mrna.xeno.loadDesc = yes
bruMal1.genbank.mrna.xeno.load = yes
bruMal1.genbank.est.native.load = yes
bruMal1.genbank.est.native.loadDesc = no
bruMal1.downloadDir = bruMal1
bruMal1.perChromTables = no

    git commit -m "Added bruMal1 B. malayi WS210" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial bruMal1 &
    #	logFile: var/build/logs/2010.09.27-13:31:44.bruMal1.initalign.log
    #	real    698m21.115s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bruMal1
    #	logFile: var/dbload/hgwdev/logs/2010.09.28-14:10:17.dbload.log
    #	real    21m50.206s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add bruMal1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added bruMal1" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("bruMal1", "blat5", "17786", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("bruMal1", "blat5", "17787", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# reset default position to one result for ZC101.2e protein blat
#	This doesn't blat very well and matches to many contigs
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="Bmal_supercontig12776:1-8071"
	where name="bruMal1";' hgcentraltest

############################################################################
# ELEGANS (ce9) PROTEINS TRACK (DONE -  Hiram - 2010-10-07)
    #	this breaking up was already done when ce4 genes were done
    #	reworking this with new blast binaries
    cd /hive/data/genomes/bruMal1
    mkdir blastDb
    twoBitToFa bruMal1.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	27235 pieces of 27235 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	echo $i
	/scratch/data/blast-2.2.11/bin/formatdb -i $i -p F
    done
    rm *.fa

    ## create the query protein set
    mkdir -p /hive/data/genomes/bruMal1/bed/tblastn.ce9SG
    cd /hive/data/genomes/bruMal1/bed/tblastn.ce9SG
    echo  /hive/data/genomes/bruMal1/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 27235 query.lst

   # we want around 200000 jobs
    calc `wc -l /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl  | cut -d" " -f1`/\(200000/`wc -l query.lst | cut -d" " -f1`\)
    #	28103/(200000/27235) = 3826.926025

   mkdir sgfa
   split -l 3800 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir  blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   # blastSome: cluster job script, one tblastn run per
   # (blast database piece, protein batch) pair generated from the template.
   #   $1 - blast database path prefix (a line of query.lst); the files
   #        $1.* are copied to the scratch directory ${SCR} before the run
   #   $2 - protein fasta query file (a line of sg.lst)
   #   $3 - output psl path (blastOut/<batch>/q.<piece>.psl per the template)
   # Steps performed by the script:
   #   1. run blastall -p tblastn with successively smaller e-value cutoffs
   #      (0.01 down to 1E-11), stopping at the first run that exits 0
   #   2. convert the blast report to psl with blastToPsl
   #   3. liftUp the target side using blastDb.lft, then the query side
   #      (-pslQ) using the ce9 protein.lft, writing $3.tmp
   #   4. rename $3.tmp to $3 only when pslCheck -prot accepts it
   #   5. exit 1 (after removing scratch files) when no blast run succeeded
   #      or blastToPsl failed
   # NOTE(review): when pslCheck fails the script still reaches "exit 0"
   # and leaves $3.tmp plus the scratch files in place; presumably the
   # template's "check out exists" clause is what flags that job as
   # failed - confirm before reusing this script elsewhere.
   # NOTE(review): $f.4 is removed but never created in this script.
   cat << '_EOF_' > blastSome
#!/bin/sh
DB=bruMal1
BLASTMAT=/scratch/data/blast-2.2.11/data
SCR="/scratch/tmp/${DB}"
g=`basename $2`
D=`basename $1`
export BLASTMAT DB SCR g D
mkdir -p ${SCR}
cp -p $1.* ${SCR}
f=${SCR}/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \
	-e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/x86_64/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 \
	    /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \
	    /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.*
            rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.*
rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh swarm
    cd /hive/data/genomes/bruMal1/bed/tblastn.ce9SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
    para try, check, push, check etc.
# Completed: 217880 of 217880 jobs
# CPU time in finished jobs:   11964555s  199409.26m  3323.49h  138.48d  0.379 y
# IO & Wait Time:               1644883s   27414.71m   456.91h   19.04d  0.052 y
# Average job time:                  62s       1.04m     0.02h    0.00d
# Longest finished job:             453s       7.55m     0.13h    0.01d
# Submission to last job:         68937s    1148.95m    19.15h    0.80d

XXX - ready - Fri Oct  8 10:16:48 PDT 2010
    # do the cluster run for chaining
    ssh swarm
    mkdir /hive/data/genomes/bruMal1/bed/tblastn.ce9SG/chainRun
    cd /hive/data/genomes/bruMal1/bed/tblastn.ce9SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # chainOne: csh cluster job script, one job per blastOut/sg?? directory.
    #   $1 - path to one blastOut batch directory (a line of chain.lst)
    # The script changes into that directory, takes the last path component
    # of $1 as the batch name ($1:t), concatenates every q.*.psl found there
    # and feeds them to simpleChain (-prot -outPsl -maxGap=50000), writing
    # the result to blastOut/c.<batch>.psl.
    # csh is started with -fe, so the job stops at the first failing command.
    cat << '_EOF_' > chainOne
#!/bin/csh -fe
cd $1
set b = $1:t
cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \
/hive/data/genomes/bruMal1/bed/tblastn.ce9SG/blastOut/c.$b.psl
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /hive/data/genomes/bruMal1/bed/tblastn.ce9SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /hive/data/genomes/bruMal1/bed/tblastn.ce9SG/chainRun
    para create jobList
    para -maxJob=30 push
    para try, check, push, check etc.
# Completed: 8 of 8 jobs
# CPU time in finished jobs:        184s       3.06m     0.05h    0.00d  0.000 y
# IO & Wait Time:                  8836s     147.27m     2.45h    0.10d  0.000 y
# Average job time:                1128s      18.79m     0.31h    0.01d
# Longest finished job:            1132s      18.87m     0.31h    0.01d
# Submission to last job:          1141s      19.02m     0.32h    0.01d

XXX - ready
    # Filter the chained psl of every batch and merge the survivors into
    # ../blastCe9SG.psl.
    # NOTE(review): the column meanings below assume the standard PSL layout
    # (1=matches, 11=qSize, 12=qStart, 13=qEnd, 14=tName, 16=tStart,
    # 17=tEnd) - confirm against the PSL format description.
    cd /hive/data/genomes/bruMal1/bed/tblastn.ce9SG/blastOut
    for i in sg??
    do
       # c60: keep chains whose (col13 - col12) / col11 exceeds 0.6,
       # i.e. presumably the aligned query span covers > 60% of the protein
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       # u: reverse numeric sort (largest leading column first), then
       # pslUniq - presumably keeps one alignment per query; verify
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       # m60: from c60, keep chains whose col1 / col11 exceeds 0.60,
       # i.e. presumably matching residues are > 60% of the protein length
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       # progress indicator: print the batch name just processed
       echo $i
    done
    # combine the u.* and m60* sets, order by columns 14, 16 and 17
    # (the last two numerically) and drop adjacent identical lines with uniq
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> ../blastCe9SG.psl
    cd ..
    pslCheck blastCe9SG.psl
    #	checked: 15489 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /hive/data/genomes/bruMal1/bed/tblastn.ce9SG
    hgLoadPsl bruMal1 blastCe9SG.psl

    # check coverage
    featureBits bruMal1 blastCe9SG 
    #	4424694 bases of 89235536 (4.958%) in intersection
    featureBits priPac2 blastCe9SG
    #	5436779 bases of 133634773 (4.068%) in intersection
    featureBits haeCon1 blastCe9SG
    #	4990746 bases of 278844984 (1.790%) in intersection
    featureBits melHap1 blastCe9SG
    #	4376245 bases of 53017507 (8.254%) in intersection
    featureBits melInc1 blastCe9SG
    #	3882043 bases of 82095019 (4.729%) in intersection
    featureBits caeJap3 blastCe9SG
    #	12894398 bases of 154057934 (8.370%) in intersection
    featureBits caePb2 blastCe9SG
    #	23730009 bases of 170473138 (13.920%) in intersection
    featureBits caeRem3 blastCe9SG
    #	20302540 bases of 138406388 (14.669%) in intersection
    featureBits cb3 blastCe9SG 
    #	18431207 bases of 108433446 (16.998%) in intersection
    featureBits ce9 sangerGene
    #	28134889 bases of 100281426 (28.056%) in intersection

    rm -rf blastOut

#########################################################################
# verify all.joiner has everything (DONE - 2010-10-25 - Hiram)
    # edit all.joiner until all these commands are successful
    cd $HOME/kent/src/hg/makeDb/schema
    joinerCheck -tableCoverage -database=bruMal1 ./all.joiner
    joinerCheck -keys -database=bruMal1 ./all.joiner
    joinerCheck -times -database=bruMal1 ./all.joiner
    joinerCheck -all -database=bruMal1 ./all.joiner
    #	the -all command will complain about other databases on hgwdev
    #	that are not specified in all.joiner.  There are a lot of test
    #	databases on hgwdev

#########################################################################
# construct downloads files (DONE - 2010-10-25 - Hiram)
    cd /hive/data/genomes/bruMal1
    makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 bruMal1 \
	 > downloads.log 2>&1

#########################################################################
## Creating pushQ (DONE - 2010-10-25 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bruMal1/pushQ
    cd /hive/data/genomes/bruMal1/pushQ
    makePushQSql.pl bruMal1 > bruMal1.sql 2> errorLog.out

    ## check the errorLog.out for anything that needs to be fixed
    ## copy bruMal1.sql to hgwbeta:/tmp
    ## then on hgwbeta:
    scp -p bruMal1.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < bruMal1.sql

#######################################################################
