# for emacs: -*- mode: sh; -*-

# Caenorhabditis japonica
#	Washington University School of Medicine GSC and Sanger Institute
#	WUSTL version 7.0.1 Sep 2010
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABLE03

##############################################################################
## Fetch sequence (DONE - 2010-09-20 - Hiram)

    wget --timestamping -O wgs.ABLE.1.fsa_nt.gz \
	ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABLE.1.fsa_nt.gz
    wget --timestamping -O wgs.ABLE.1.qscore.gz \
	ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABLE.1.qscore.gz
    wget --timestamping -O wgs.ABLE.1.gbff.gz \
	ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABLE.1.gbff.gz

    zcat wgs.ABLE.1.qscore.gz | sed -e 's#^>gb|#>#; s#.1|.*##' \
    | gzip -c > ABLE000.ucsc.qa.gz

    zcat wgs.ABLE.1.fsa_nt.gz | sed -e 's#^>gi.*ABLE#>ABLE#; s#.1|.*##' \
    | gzip -c > ABLE000.ucsc.fa.gz

    cat << '_EOF_' > replaceNames.pl
#!/bin/env perl

use strict;
use warnings;

my %ableToContig;
open (FH, "<accToContig.list") or die "can not read accToContig.list";
while (my $line = <FH>) {
    chomp $line;
    my ($ableName, $contigName) = split('\s+', $line);
    $ableName =~ s/.1$//;
    $ableToContig{$ableName} = $contigName;
}
close (FH);

open (FH, "<ABLE000.ucsc.agp") or die "can not read ABLE000.ucsc.agp";
while (my $line = <FH>) {
    chomp $line;
    my ($acc, $start, $end, $id, $type, $ctg, $ctgStart, $ctgEnd, $strand) =
        split('\s+', $line);
    $acc =~ s/.1$//;
    printf "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
        $acc, $start, $end, $id, $type, $ableToContig{$acc}, $ctgStart, $ctgEnd, $strand;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x replaceNames.pl

    ./replaceNames.pl > contig.ucsc.agp

##############################################################################
## Initial browser build (DONE - 2010-09-20 - Hiram)
    cd /hive/data/genomes/caeJap3

    cat << '_EOF_' > caeJap3.config.ra
# Config parameters for makeGenomeDb.pl:
db caeJap3
clade worm
genomeCladePriority 70
scientificName Caenorhabditis japonica
commonName C. japonica
assemblyDate Sep. 2010
assemblyLabel Washington University School of Medicine GSC C. japonica 7.0.1
assemblyShortLabel C. japonica 7.0.1
orderKey 880
mitoAcc none
fastaFiles /hive/data/genomes/caeJap3/able0.3/ABLE000.ucsc.fa.gz
agpFiles /hive/data/genomes/caeJap3/able0.3/contig.ucsc.agp
qualFiles /hive/data/genomes/caeJap3/able0.3/ABLE000.ucsc.qac
dbDbSpeciesDir worm
taxId 281687
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	caeJap3.config.ra > makeGenomeDb.log 2>&1
    #	real    2m45.252s

##############################################################################
# REPEATMASKER (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/caeJap3/bed/repeatMasker
    cd /hive/data/genomes/caeJap3/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \
	-buildDir=`pwd` caeJap3 > do.log 2>&1 &
    #	real    58m59.433s

    #	from the do.log:
    #	RepeatMasker version development-$Id:
    #	RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp
    #	CC   RELEASE 20090604;                                            *

#########################################################################
# SIMPLE REPEATS (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/caeJap3/bed/simpleRepeat
    cd /hive/data/genomes/caeJap3/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \
	-smallClusterHub=memk -buildDir=`pwd` caeJap3 > do.log 2>&1 &
    #	real    30m8.251s

#########################################################################
# run window masker (DONE - 2010-09-21 - Hiram)
    mkdir /hive/data/genomes/caeJap3/bed/windowMasker
    cd /hive/data/genomes/caeJap3/bed/windowMasker
    time doWindowMasker.pl -verbose=2 -workhorse=kolossus \
	-buildDir=`pwd` caeJap3 > do.log 2>&1 &
    #	real    4m14.628s

    twoBitToFa caeJap3.wmsk.sdust.2bit stdout | faSize stdin
    #	154057934 bases (0 N's 154057934 real 82158768 upper 71899166 lower)
    #	in 35931 sequences in 1 files
    #	%46.67 masked total, %46.67 masked real

    hgLoadBed caeJap3 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 1023323 elements of size 3
    featureBits caeJap3 windowmaskerSdust
    #	71899166 bases of 154057934 (46.670%) in intersection

    #	this is the mask for the genome
    cd /hive/data/genomes/caeJap3
    zcat bed/windowMasker/windowmasker.sdust.bed.gz \
	| twoBitMask caeJap3.unmasked.2bit stdin \
	    -type=.bed caeJap3.2bit
    twoBitToFa caeJap3.2bit stdout | faSize stdin
    #	154057934 bases (0 N's 154057934 real 82158768 upper 71899166 lower)
    #	in 35931 sequences in 1 files
    #	%46.67 masked total, %46.67 masked real

    ln -s `pwd`/caeJap3.2bit /gbdb/caeJap3/caeJap3.2bit

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-21 - Hiram)
    # numerator is caeJap3 gapless bases "real" as reported by faSize 
    # denominator is hg19 gapless bases as reported by featureBits,
    #	featureBits hg19 gap
    # 1024 is threshold used for human -repMatch:
    calc \( 154057934 / 2897310462 \) \* 1024
    #	( 154057934 / 2897310462 ) * 1024 = 54.448885
    # 54 is way too small, use 100 to keep the number of overused
    #	11-mers to a smaller number

    cd /hive/data/genomes/caeJap3
    blat caeJap3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=caeJap3.11.ooc -repMatch=100
    #	Wrote 21362 overused 11-mers to caeJap3.11.ooc

    mkdir /hive/data/staging/data/caeJap3
    cp -p caeJap3.2bit chrom.sizes caeJap3.11.ooc \
	/hive/data/staging/data/caeJap3

#########################################################################
# SWAP LASTZ ce9 (DONE - 2010-09-23 - Hiram)
    # original alignment
    cd /hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20
    cat fb.ce9.chainCaeJap1Link.txt
    #	26876526 bases of 100286004 (26.800%) in intersection

    #	swap, this is also in caeJap1.txt
    mkdir /hive/data/genomes/caeJap1/bed/blastz.ce9.swap
    cd /hive/data/genomes/caeJap1/bed/blastz.ce9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20/DEF \
	-workhorse=hgwdev -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 &
    #	real    1m18.062s
    cat fb.caeJap1.chainCe9Link.txt
    #	26200691 bases of 129347181 (20.256%) in intersection

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add caeJap3 just before caeJap2

# caeJap3 (C. japonica)
caeJap3.serverGenome = /hive/data/genomes/caeJap3/caeJap3.2bit
caeJap3.clusterGenome = /scratch/data/caeJap3/caeJap3.2bit
caeJap3.ooc = /scratch/data/caeJap3/caeJap3.11.ooc
caeJap3.lift = no
caeJap3.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
caeJap3.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
caeJap3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
caeJap3.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
caeJap3.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
caeJap3.refseq.mrna.native.load = yes
caeJap3.refseq.mrna.xeno.load  = yes
caeJap3.refseq.mrna.xeno.loadDesc = yes
caeJap3.genbank.mrna.xeno.load = yes
caeJap3.genbank.est.native.load = yes
caeJap3.genbank.est.native.loadDesc = no
caeJap3.downloadDir = caeJap3
caeJap3.perChromTables = no

    git commit -m "Added caeJap3 C. japonica WUGSC 7.0.1" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial caeJap3 &
    #	logFile: var/build/logs/2010.09.27-13:22:49.caeJap3.initalign.log
    #	real    548m31.324s
    # this had a temporary failure:
# Cannot allocate memory
# can't fork
# command failed: gbAlignInstall  -noMigrate -workdir=work/initial.caeJap3/align -orgCats=native genbank.179.0 daily.0827 mrna caeJap3 at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272.

    # continuing
    time ./etc/align-genbank -finish -workdir work/initial.caeJap3/align
    #	logFile: var/build/logs/2010.09.28-10:08:30.align.log
    #	real    5m47.281s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caeJap3
    #	logFile: var/dbload/hgwdev/logs/2010.09.28-10:27:34.dbload.log
    #	real    17m14.683s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add caeJap3 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added caeJap3 - C. elegans WS210" \
	etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caeJap3", "blat9", "17790", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caeJap3", "blat9", "17791", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# reset default position to ZC101.2e protein blat result
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="ABLE03026714:2543-32231"
	where name="caeJap3";' hgcentraltest
    # there isn't much there on this small contig, better to use a larger
    #	contig with many gene definitions:
    hgsql -e 'update dbDb set defaultPos="ABLE03030821:1-151,170"
	where name="caeJap3";' hgcentraltest

############################################################################
# ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-07 - Hiram)
    cd /hive/data/genomes/caeJap3
    mkdir blastDb
    twoBitToFa caeJap3.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	35931 pieces of 35931 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/scratch/data/blast-2.2.11/bin/formatdb -i $i -p F
    done
    rm *.fa

    ## create the query protein set
    mkdir -p /hive/data/genomes/caeJap3/bed/tblastn.ce9SG
    cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG
    echo  /hive/data/genomes/caeJap3/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 35931 query.lst

   # we want around 200000 jobs
   calc `wc /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)

    #	28103/(200000/35931) = 5048.844465

   mkdir sgfa
   split -l 5000 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
DB=caeJap3
BLASTMAT=/scratch/data/blast-2.2.11/data
SCR="/scratch/tmp/${DB}"
g=`basename $2`
D=`basename $1`
export BLASTMAT DB SCR g D
mkdir -p ${SCR}
cp -p $1.* ${SCR}
f=${SCR}/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \
	-e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/x86_64/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 \
	    /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \
	    /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.*
            rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.*
rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh swarm
    cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
    para try, check, push, check etc.
# Completed: 215586 of 215586 jobs
# CPU time in finished jobs:   15523183s  258719.71m  4312.00h  179.67d  0.492 y
# IO & Wait Time:               1986539s   33108.99m   551.82h   22.99d  0.063 y
# Average job time:                  81s       1.35m     0.02h    0.00d
# Longest finished job:             185s       3.08m     0.05h    0.00d
# Submission to last job:         83070s    1384.50m    23.07h    0.96d

    # do the cluster run for chaining
    ssh swarm
    mkdir /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/chainRun
    cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
#!/bin/csh -fe
cd $1
set b = $1:t
cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \
/hive/data/genomes/caeJap3/bed/tblastn.ce9SG/blastOut/c.$b.psl
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/chainRun
    para create jobList
    para try, check, push, check etc.
# Completed: 6 of 6 jobs
# CPU time in finished jobs:        336s       5.60m     0.09h    0.00d  0.000 y
# IO & Wait Time:                  6832s     113.87m     1.90h    0.08d  0.000 y
# Average job time:                1195s      19.91m     0.33h    0.01d
# Longest finished job:            1198s      19.97m     0.33h    0.01d
# Submission to last job:          1204s      20.07m     0.33h    0.01d

    cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> ../blastCe9SG.psl
    cd ..
    pslCheck blastCe9SG.psl
    #	checked: 31430 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG
    hgLoadPsl caeJap3 blastCe9SG.psl

    # check coverage
    featureBits caeJap3 blastCe9SG
    #	12894398 bases of 154057934 (8.370%) in intersection
    featureBits cb3 blastCe9SG
    #	18490367 bases of 108433446 (17.052%) in intersection
    featureBits caeRem3 blastCe9SG
    #	20302540 bases of 138406388 (14.669%) in intersection
    featureBits caePb2 blastCe9SG
    #	23730009 bases of 170473138 (13.920%) in intersection
    featureBits melHap1 blastCe9SG
    #	4376245 bases of 53017507 (8.254%) in intersection
    featureBits melInc1 blastCe9SG
    #	3882043 bases of 82095019 (4.729%) in intersection
    featureBits priPac2 blastCe9SG
    #	5436779 bases of 133634773 (4.068%) in intersection
    featureBits bruMal1 blastCe9SG 
    #	4424694 bases of 89235536 (4.958%) in intersection
    featureBits haeCon1 blastCe9SG
    #	4990746 bases of 278844984 (1.790%) in intersection
    featureBits ce9 sangerGene
    #	28689552 bases of 100286004 (28.608%) in intersection

    rm -rf blastOut

#########################################################################
# set this as the defaultDb (DONE - 2010-10-20 - Hiram)
    hgsql -e 'update defaultDb set name="caeJap3" where name="caeJap1";' \
	hgcentraltest

#########################################################################
# liftOver sangerGene WS210 to this caeJap3 (DONE - 2010-10-15 - Hiram)
    mkdir /hive/data/genomes/caeJap3/bed/ws210Gene
    cd /hive/data/genomes/caeJap3/bed/ws210Gene
    ln -s /hive/data/genomes/caeJap2a/bed/ws210Gene/caeJap2a.ws210Gene.gp .
    ln -s \
/hive/data/genomes/caeJap2a/bed/liftOver/caeJap2aToCaeJap3.over.chain.gz .

    export DB=caeJap3
    liftOver -genePred caeJap2a.ws210Gene.gp caeJap2aToCaeJap3.over.chain.gz \
	${DB}.ws210Gene.gp ws210gene.unmapped.gp

    genePredCheck -db=${DB} ${DB}.ws210Gene.gp
    #	checked: 16233 failed: 0

    hgLoadGenePred ${DB} ws210Gene ${DB}.ws210Gene.gp

#########################################################################
# verify all.joiner has everything (DONE - 2010-10-21 - Hiram)
    # edit all.joiner until all these commands are successful
    cd $HOME/kent/src/hg/makeDb/schema
    joinerCheck -tableCoverage -database=caeJap3 ./all.joiner
    joinerCheck -keys -database=caeJap3 ./all.joiner
    joinerCheck -times -database=caeJap3 ./all.joiner
    joinerCheck -all -database=caeJap3 ./all.joiner
    #	the -all command will complain about other databases on hgwdev
    #	that are not specified in all.joiner.  There are a lot of test
    #	databases on hgwdev

#########################################################################
# construct downloads files (DONE - 2010-10-21 - Hiram)
    cd /hive/data/genomes/caeJap3
    makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 caeJap3 \
	 > downloads.log 2>&1

#########################################################################
## Creating pushQ (DONE - 2010-10-21 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/caeJap3/pushQ
    cd /hive/data/genomes/caeJap3/pushQ
    makePushQSql.pl caeJap3 > caeJap3.sql 2> errorLog.out

    ## check the errorLog.out for anything that needs to be fixed
    ## copy caeJap3.sql to hgwbeta:/tmp
    ## then on hgwbeta:
    hgsql qapushq < caeJap3.sql

#######################################################################
