# for emacs: -*- mode: sh; -*-


# Drosophila pseudoobscura -- 
# 
# Baylor HGSC's Drosophila Genome Project Freeze 1 assembly (Aug. 2003)
# http://www.hgsc.bcm.tmc.edu/projects/drosophila/
#
# Note: this is the same assembly as dp1!  But now the browser is scaffold-
# based, not chrUn-based which everybody hated.  


# DOWNLOAD SEQUENCE (DONE 8/2/04 angie)
    ssh kksilo
    mkdir /cluster/store8/dp2
    cd /cluster/data
    ln -s /cluster/store8/dp2 dp2
    cd /cluster/data/dp2
    mkdir jkStuff bed
    mkdir downloads
    cd downloads
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/conditions_for_use
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/fasta/freeze1_readme
    # Freeze 1 is 759 scaffolds, 271 of which are assigned to chromosomes 
    # in the .txt file:
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/fasta/D_pseudo_freeze1_scaffolds_8_28_03.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/fasta/D_pseudo_freeze1_scaffold_to_chromosome_assignments_8_28_03.txt
    # Unpack scaffolds into separate files in scaffolds/ dir:
    cd /cluster/data/dp2
    mkdir scaffolds
    zcat downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa.gz \
    | faSplit byname stdin scaffolds/


# FIX UP SCAFFOLDS WITH FORMATTING PROBLEMS (DONE 8/2/04 angie)
    # ***ASSEMBLY-SPECIFIC*** SHOULD NOT BE NECESSARY IN THE FUTURE!
    # Some scaffolds ended up getting split by Baylor at the last minute,
    # and ".fa" was left appended to their fasta record names which caused 
    # faSplit to put them in ".fa.fa" files:
    ls -1 scaffolds/*.fa.fa
#scaffolds/Contig3286_Contig7811A.fa.fa
#scaffolds/Contig3286_Contig7811B.fa.fa
#scaffolds/Contig4971_Contig7717A.fa.fa
#scaffolds/Contig4971_Contig7717B.fa.fa
#scaffolds/Contig6811_Contig7852A.fa.fa
#scaffolds/Contig6811_Contig7852B.fa.fa
#scaffolds/Contig7891_Contig7492A.fa.fa
#scaffolds/Contig7891_Contig7492B.fa.fa
    foreach f (scaffolds/*.fa.fa)
      perl -wpe 's/^(>Contig\d+_Contig\w+).fa/$1/' $f \
      > $f:r
      head -2 $f:r
      faCmp $f $f:r
    end
    rm scaffolds/*.fa.fa
    # Since dp1 is the same assembly, double-check against it to make sure 
    # we did the right thing in both places.  
    foreach f (scaffolds/*.fa)
      faCmp $f /cluster/data/dp1/$f | grep -v ' are the same$'
    end


# PARTITION SCAFFOLDS FOR PROCESSING (DONE 8/2/04 angie)
    # Chop up scaffolds into ~500k chunks (mainly to limit size of 
    # RepeatMasker input sequence).  Cat together split .lft's into 
    # jkStuff/liftAll.lft which will be used to lift to scaffold 
    # coords after processing.  
    ssh kksilo
    cd /cluster/data/dp2
    mkdir scaffoldsSplit
    foreach f (scaffolds/*.fa)
      faSplit size $f 500000 scaffoldsSplit/$f:t:r_ \
        -lift=scaffoldsSplit/$f:t:r.lft
    end
    cat scaffoldsSplit/C*.lft > jkStuff/liftAll.lft
    # Create a big fasta file containing all split scaffolds, ordered 
    # by decreasing size.  Then use faSplit again -- we end up with 
    # big scaffold chunks in their own files, but small scaffolds 
    # grouped together.   
    mkdir chunks
    cat `ls -1S scaffoldsSplit/*.fa` \
    | faSplit about stdin 300000 chunks/chunk_


# CREATING DATABASE (DONE 8/2/04 angie)
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  514G  1.2T  31% /var/lib/mysql
    hgsql '' -e 'create database dp2'


# RUN REPEAT MASKER (DONE 8/2/04 angie)
    # January ("March") '04 version of RepeatMasker and libs.
    # make the run directory, output directory, and job list
    cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe

cd $1
/bin/mkdir -p /tmp/dp2/$2
/bin/cp ../chunks/$2 /tmp/dp2/$2/
pushd /tmp/dp2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/dp2/$2/$2.out ./
/bin/rm -fr /tmp/dp2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/dp2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/dp2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDrosophila
    mkdir RMRun RMOut
    cp /dev/null RMRun/RMJobs
    foreach f ( chunks/*.fa )
      set chunk = $f:t
      echo ../jkStuff/RMDrosophila \
           /cluster/data/dp2/RMOut $chunk \
           '{'check in line+ /cluster/data/dp2/chunks/$chunk'}' \
         '{'check out line+ /cluster/data/dp2/RMOut/$chunk.out'}' \
      >> RMRun/RMJobs
    end

    # do the run
    ssh kk
    cd /cluster/data/dp2/RMRun
    para create RMJobs
    para try, check, push, check,...
#Completed: 304 of 304 jobs
#Average job time:                5720s      95.33m     1.59h    0.07d
#Longest job:                     7412s     123.53m     2.06h    0.09d
#Submission to last job:          7423s     123.72m     2.06h    0.09d

    # Lift up the split-scaffold .out's to scaffold .out's
    ssh kksilo
    cd /cluster/data/dp2
    foreach f (RMOut/*.fa.out)
      liftUp $f:r:r.scaf.out jkStuff/liftAll.lft warn $f > /dev/null
    end
    # Make a consolidated scaffold .out file too:
    head -3 RMOut/chunk_00.fa.out > RMOut/scaffolds.fa.out
    foreach f (RMOut/*.scaf.out)
      tail +4 $f >> RMOut/scaffolds.fa.out 
    end
    # Load the .out files into the database with:
    ssh hgwdev
    hgLoadOut dp2 /cluster/data/dp2/RMOut/scaffolds.fa.out
    # hgLoadOut made a "scaffolds_rmsk" table even with -table=rmsk, 
    # but we want a non-split with no prefix table:
    hgsql dp2 -e 'rename table scaffolds_rmsk to rmsk'
    # Fix up the indices too:
    hgsql dp2 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(9), bin); \
                  create index genoStart on rmsk (genoName(9), genoStart); \
                  create index genoEnd   on rmsk (genoName(9), genoEnd);'


# SIMPLE REPEATS (TRF) (DONE 8/3/04 angie)
    ssh kksilo
    mkdir -p /cluster/data/dp2/bed/simpleRepeat
    cd /cluster/data/dp2/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach f (/cluster/data/dp2/chunks/*.fa)
      set fout = $f:t:r.bed
      echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
      >> jobs.csh
    end
    csh -fe jobs.csh \
    |& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \
    > jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    # Lift chunk results to one file for all scaffolds:
    liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn trf/*.bed > /dev/null

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed dp2 simpleRepeat \
      /cluster/data/dp2/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 8/3/04 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/dp2/bed/simpleRepeat
    mkdir trfMask
    foreach f (trf/*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    liftUp scaffoldsTrfMask.bed ../../jkStuff/liftAll.lft warn trfMask/*.bed


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 8/3/04 angie)
    ssh kksilo
    cd /cluster/data/dp2
    echo repeat- and trf-masking chunks
    foreach chunk (chunks/chunk*.fa)
      set trfMask=bed/simpleRepeat/trfMask/$chunk:t:r.bed
      maskOutFa -soft $chunk RMOut/$chunk:t.out $chunk
      maskOutFa -softAdd $chunk $trfMask $chunk
    end
    echo repeat- and trf-masking scaffolds
    # maskOutFa dies if the lift file describes more seqs than are in 
    # the fasta, so make a separate mask file for each scaffold.
    set rmMask=RMOut/scaffolds.fa.out
    set trfMask=bed/simpleRepeat/scaffoldsTrfMask.bed
    # use head to grab the RepeatMasker output header, then grep contents 
    # for scaffold.
    foreach f (scaffolds/*.fa)
      head -3 $rmMask > tmpMask.out
      grep $f:t:r $rmMask >> tmpMask.out
      maskOutFa -soft $f tmpMask.out $f
      grep $f:t:r $trfMask > tmpMask.bed
      maskOutFa -softAdd $f tmpMask.bed $f
    end
    rm tmpMask.{bed,out}


# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 8/2/04 angie)
    # Translate to nib
    ssh kksilo
    cd /cluster/data/dp2
    mkdir nib
    foreach f (scaffolds/*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end
    # Make a .2bit file for blat:
    faToTwoBit scaffolds/*.fa dp2.2bit

    # Make symbolic links from /gbdb/dp2/nib to the real nibs and 2bit.
    ssh hgwdev
    mkdir -p /gbdb/dp2/nib
    ln -s /cluster/data/dp2/nib/*.nib /gbdb/dp2/nib
    ln -s /cluster/data/dp2/dp2.2bit /gbdb/dp2/nib

    # Load /gbdb/dp2/nib paths into database and save size info.
    # Make index on chromInfo.name non-unique (in its first N chars) 
    # to accommodate 22-char scaffold names like 
    # Contig7891_Contig7492A, Contig7891_Contig7492B
    # Also, index first 9 chars instead of first 16 -- the 6th to 9th 
    # chars are almost unique (except for those A and B scaffolds).  
    ssh hgwdev
    cd /cluster/data/dp2
    hgNibSeq -preMadeNib -index='INDEX(chrom(9))' dp2 /gbdb/dp2/nib \
      /gbdb/dp2/nib/*.nib
#135845121 total bases
    hgsql dp2 -N -e 'select chrom,size from chromInfo' > chrom.sizes
    wc -l chrom.sizes
#    759 chrom.sizes


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 8/3/04 angie)
    # Copy all the data from the table "grp" 
    # in the existing database "rn1" to the new database
    ssh hgwdev
    hgsql dp2 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 8/3/04 angie)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
             defaultPos, active, orderKey, genome, scientificName, \
             htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("dp2", "Aug. 2003", "/gbdb/dp2/nib", "D. pseudoobscura", \
             "Contig1006_Contig943:3363975-3378814", 1, 56, \
             "D. pseudoobscura", \
             "Drosophila pseudoobscura", "/gbdb/dp2/html/description.html", \
             0, 0, "Baylor HGSC Freeze 1");' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'update defaultDb set name = "dp2" where genome = "D. pseudoobscura"' \
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit trackDb/makefile to add dp2 to the DBS variable.
    mkdir drosophila/dp2
    # Create a simple drosophila/dp2/description.html file.
    cvs add drosophila/dp2
    cvs add drosophila/dp2/description.html
    make update

    # go public on genome-test
    cvs commit makefile
    cvs commit drosophila/dp2
    mkdir /gbdb/dp2/html
    # in a clean, updated tree's kent/src/hg/makeDb/trackDb:
    make alpha


# PRODUCING GENSCAN PREDICTIONS (DONE 8/3/04 angie)
    # Run on small cluster -- genscan needs big mem.
    ssh kksilo
    mkdir /cluster/data/dp2/bed/genscan
    cd /cluster/data/dp2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Make hard-masked scaffolds
    mkdir /cluster/data/dp2/scaffoldsMasked
    foreach f (/cluster/data/dp2/scaffolds/*.fa)
      maskOutFa $f hard /cluster/data/dp2/scaffoldsMasked/$f:t.masked
    end
    # Generate a list file, scaffolds.list, of all the hard-masked scaffolds 
    # that *do not* consist of all-N's (which would cause genscan to blow up)
    cp /dev/null scaffolds.list
    foreach f ( `ls -1S /cluster/data/dp2/scaffoldsMasked/*.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> scaffolds.list
    end
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 scaffolds.list single gsub jobList
    ssh kki
    cd /cluster/data/dp2/bed/genscan
    para create jobList
    para try, check, push, check, ...
#Completed: 759 of 759 jobs
#Average job time:                   7s       0.12m     0.00h    0.00d
#Longest job:                      127s       2.12m     0.04h    0.00d
#Submission to last job:           666s      11.10m     0.18h    0.01d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    
    # Concatenate scaffold-level results:
    ssh kksilo
    cd /cluster/data/dp2/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat subopt/*.bed > genscanSubopt.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/dp2/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf dp2 genscan genscan.gtf
    hgPepPred dp2 generic genscanPep genscan.pep
    hgLoadBed dp2 genscanSubopt genscanSubopt.bed

# MYTOUCH FIX - jen - 2006-01-24
  sudo mytouch dp2 genscanPep 0501071300.00

# PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 8/3/04 angie)
    ssh kkr1u00
    mkdir /iscratch/i/dp2
    cp -pR /cluster/data/dp2/nib /iscratch/i/dp2/
    iSync


# MAKE DOWNLOADABLE FILES (DONE 8/16/04 angie)
    ssh kksilo
    mkdir /cluster/data/dp2/zips
    cd /cluster/data/dp2
    zip -j zips/scaffoldOut.zip RMOut/scaffolds.fa.out
    zip -j zips/scaffoldFa.zip scaffolds/*.fa
    zip -j zips/scaffoldFaMasked.zip scaffoldsMasked/*.fa.masked
    zip -j zips/scaffoldTrf.zip bed/simpleRepeat/scaffoldsTrfMask.bed
    foreach f (zips/*.zip)
      echo $f
      unzip -t $f | tail -1
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dp2
    cd /usr/local/apache/htdocs/goldenPath/dp2
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cd bigZips
    cp -p /cluster/data/dp2/zips/*.zip .
    md5sum *.zip > md5sum.txt


# SWAP DM1-DP2 BLASTZ (DONE 8/3/04 angie)
    ssh kksilo
    mkdir -p /cluster/data/dp2/bed/blastz.dm1.swap.2004-08-03/axtScaffold
    ln -s blastz.dm1.swap.2004-08-03 /cluster/data/dp2/bed/blastz.dm1
    cd /cluster/data/dp2/bed/blastz.dm1
    set aliDir = /cluster/data/dm1/bed/blastz.dp2
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtScaffold
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtScaffold/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtScaffold
#292M    /cluster/data/dm1/bed/blastz.dp2/axtChrom
#294M    unsorted
#294M    axtScaffold
    rm -r unsorted


# CHAIN MELANOGASTER BLASTZ (DONE 8/4/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dp2/bed/blastz.dm1
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/dp2/bed/blastz.dm1/axtScaffold/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -verbose=0 $1 \
  /iscratch/i/dp2/nib \
  /iscratch/i/dm1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 671 of 671 jobs
#Average job time:                   4s       0.07m     0.00h    0.00d
#Longest job:                       10s       0.17m     0.00h    0.00d
#Submission to last job:           669s      11.15m     0.19h    0.01d

    # now on the fileserver, sort chains
    ssh kksilo
    cd /cluster/data/dp2/bed/blastz.dm1/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dp2/bed/blastz.dm1/axtChain/chain
    cat *.chain | hgLoadChain dp2 chainDm1 stdin


# NET MELANOGASTER BLASTZ (DONE 8/4/04 angie)
    ssh kksilo
    cd /cluster/data/dp2/bed/blastz.dm1/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dp2/bed/blastz.dm1/axtChain
    netClass -noAr noClass.net dp2 dm1 melanogaster.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dp2/bed/blastz.dm1/axtChain
    rm noClass.net
    netFilter -syn melanogaster.net > melanogasterSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dp2/bed/blastz.dm1/axtChain
    netFilter -minGap=10 melanogaster.net |  hgLoadNet dp2 netDm1 stdin
    netFilter -minGap=10 melanogasterSyn.net \
    | hgLoadNet dp2 netSyntenyDm1 stdin


# MAKE AXTNET (DONE 8/17/04 angie)
    ssh kksilo
    cd /cluster/data/dp2/bed/blastz.dm1/axtChain
    netSplit melanogaster.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/dp2/nib \
        /cluster/data/dm1/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end


# MAKE VSDM1 DOWNLOADABLES (DONE 8/17/04 angie)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dp2/vsDm1
    cd /usr/local/apache/htdocs/goldenPath/dp2/vsDm1
    gzip -c \
      /cluster/data/dp2/bed/blastz.dm1/axtChain/all.chain \
      > melanogaster.chain.gz
    gzip -c \
      /cluster/data/dp2/bed/blastz.dm1/axtChain/melanogaster.net \
      > melanogaster.net.gz
    mkdir axtNet
    foreach f (/cluster/data/dp2/bed/blastz.dm1/axtNet/*.axt)
      gzip -c $f > axtNet/$f:t.gz
    end
    # Make a README.txt which explains the files & formats.
    md5sum *.gz */*.gz > md5sum.txt


# LIFTOVER BDGP GENES (TODO 8/?/03 angie)
    ssh hgwdev
    mkdir /cluster/data/dp2/bed/bdgpAnnotations
    cd /cluster/data/dp2/bed/bdgpAnnotations
    echo 'select * from bdgpGene' | hgsql -N dm1 > dm1.bdgpGene.tab
    liftOver -minMatch=0.5 -minBlocks=0.5 -fudgeThick -genePred \
      dm1.bdgpGene.tab \
      /cluster/data/dm1/bed/blastz.dp2/axtChain/all.chain \
      dp2.bdgpLiftGene.tab dp2.unmapped.tab
    wc -l dm1.bdgpGene.tab dp2.*.tab
    grep ^# dp2.unmapped.tab | awk '{print $1 " " $2 " " $3;}' \
    | sort | uniq -c | sort -gr
    ldHgGene -predTab dp2 bdgpLiftGene dp2.bdgpLiftGene.tab
    hgsql dp2 -e 'create table bdgpLiftGenePep select * from dm1.bdgpGenePep'


# MAKE 11.OOC FILE FOR BLAT (DONE 8/4/04 angie)
    # Use -repMatch=75 (based on size -- for human we use 1024, and 
    # mosquito size is ~5% of human => 50 -- but I worry it doesn't scale 
    # linearly so be conservative and require more matches to be "overused")
    ssh kkr1u00
    mkdir /cluster/data/dp2/bed/ooc
    cd /cluster/data/dp2/bed/ooc
    ls -1 /cluster/data/dp2/nib/*.nib > nib.lst
    mkdir /cluster/bluearc/dp2
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/dp2/11.ooc -repMatch=75
#Wrote 6348 overused 11-mers to /cluster/bluearc/dp2/11.ooc
    cp -p /cluster/bluearc/dp2/*.ooc /iscratch/i/dp2/
    iSync


# AUTO UPDATE GENBANK MRNA RUN  (DONE 8/4/04 angie)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf and add these lines (note scaffold-browser settings):
# dp2 (D. pseudoobscura)
dp2.genome = /iscratch/i/dp2/nib/*.nib
dp2.lift = no
dp2.refseq.mrna.native.load = no
dp2.refseq.mrna.xeno.load = yes
dp2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
dp2.genbank.mrna.xeno.load = yes
dp2.genbank.est.xeno.load = no
dp2.downloadDir = dp2
dp2.perChromTables = no

    cvs ci etc/genbank.conf
    # Since D. pseudoobscura is a new species for us, edit src/lib/gbGenome.c.  
    # Pick some other browser species, & monkey-see monkey-do.  
    cvs diff src/lib/gbGenome.c
    make
    cvs ci src/lib/gbGenome.c
    # Edit src/align/gbBlat to add /iscratch/i/dp2/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs ci src/align/gbBlat

    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, (xeno) RefSeq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dp2 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dp2
    featureBits dp2 xenoRefGene
#14616774 bases of 135845121 (10.760%) in intersection
    # Clean up:
    rm -rf work/initial.dp2

    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dp2 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dp2
    featureBits dp2 mrna
#SELECT ,,,,,,,,,, FROM mrna WHERE  < 3647374 AND  > 0
    # oops, try this instead:
    featureBits dp2 all_mrna
#17931 bases of 135845121 (0.013%) in intersection
    # Wow.  that is so paltry, perhaps we shouldn't even bother having a 
    # mrna track??
    featureBits dp2 xenoMrna
#15763946 bases of 135845121 (11.604%) in intersection
    # Clean up:
    rm -rf work/initial.dp2

    # -initial for ESTs
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial dp2 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 dp2 &
    # Doh!  wow, surprised that there are really no ESTs:
#eieio 2004.08.04-19:18:53 dp2.initalign: nothing to align
    # Clean up:
    rm -rf work/initial.dp2


# MAKE GCPERCENT (DONE 8/4/04 angie)
    ssh hgwdev
    mkdir /cluster/data/dp2/bed/gcPercent
    cd /cluster/data/dp2/bed/gcPercent
    # create and load gcPercent table
    hgsql dp2  < ~/src/hg/lib/gcPercent.sql
    hgGcPercent dp2 ../../nib


# EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie)
    ssh kksilo
    mkdir /cluster/data/dp2/bed/fakeAgp
    cd /cluster/data/dp2/bed/fakeAgp
    cat ../../scaffolds/*.fa \
    | faGapSizes stdin \
        -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000
    # More blocks of N's than we would expect are exactly 10bp long.  That is 
    # probably the min gap size for the assembly:
    cat ../../scaffolds/*.fa \
    | hgFakeAgp -minContigGap=10 stdin fake.agp
    ssh hgwdev
    hgLoadGap -unsplit dp2 /cluster/data/dp2/bed/fakeAgp/fake.agp


# TWINSCAN (FROM BAYLOR) (DONE 8/4/04 angie)
    ssh kksilo
    mkdir /cluster/data/dp2/bed/twinscan
    cd /cluster/data/dp2/bed/twinscan
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/gene_predictions/freeze01_twinscan.pep.fa
    wget -r ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/gene_predictions/twinscan
    cat ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/gene_predictions/twinscan/*/*.gff \
    | sed -e 's/\.fon\.ace_linear//g' > twinscan.gtf
    # Doh, freeze01_twinscan.pep.fa has names that don't match with the names 
    # in the GTF.... and the pep names have coords that don't quite jive with 
    # the coords in the GTF.  Looks like a version difference -- discard.
    rm freeze01_twinscan.pep.fa
    ssh hgwdev
    cd /cluster/data/dp2/bed/twinscan
    ldHgGene -gtf -genePredExt dp2 twinscan twinscan.gtf
    featureBits -enrichment dp2 xenoRefGene twinscan
#xenoRefGene 10.760%, twinscan 15.319%, both 8.474%, cover 78.76%, enrich 5.14x
    # Compare with genscan:
    featureBits -enrichment dp2 xenoRefGene genscan
#xenoRefGene 10.760%, genscan 18.141%, both 9.393%, cover 87.30%, enrich 4.81x


#TODO : ask somebody at Baylor about AGP/gap info for the scaffolds, and the 
#twinscan coord differences.  


# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 8/26/04 angie)
    ssh hgwdev
    echo 'insert into blatServers values("dp2", "blat3", "17778", 1, 0); \
          insert into blatServers values("dp2", "blat3", "17779", 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest


# MAKE Drosophila Proteins track (DONE 2004-08-24 braney)
    ssh kksilo
    mkdir -p /cluster/data/dp2/blastDb
    cd /cluster/data/dp2/blastDb
    for i in ../scaffolds/*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/dp2/blastDb
    cp /cluster/data/dp2/blastDb/* /iscratch/i/dp2/blastDb
    (~kent/bin/iSync) 2>&1 > sync.out
    
    mkdir -p /cluster/data/dp2/bed/tblastn.dm1FB
    cd /cluster/data/dp2/bed/tblastn.dm1FB
    ls -1S /iscratch/i/dp2/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
    exit

    # back to kksilo
    cd /cluster/data/dp2/bed/tblastn.dm1FB
    mkdir fbfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
    # 18735/(150000/759) = 94.799100
    split -l 95 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2 
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp 
exit 1
'_EOF_'

    gensub2 bug.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/dp2/bed/tblastn.dm1FB
    para create blastSpec
    para try, push
#  Completed: 150282 of 150282 jobs
#  CPU time in finished jobs:    2977330s   49622.16m   827.04h   34.46d  0.094 y
#  IO & Wait Time:                999926s   16665.44m   277.76h   11.57d  0.032 y
#  Average job time:                  26s       0.44m     0.01h    0.00d
#  Longest job:                      815s      13.58m     0.23h    0.01d
#  Submission to last job:          8894s     148.23m     2.47h    0.10d


    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.list
    gensub2 chain.lst single chainGsub chainSpec
    para create chainSpec

    # should run this on the mini-cluster or with my shove script
    # so you can limit the number of jobs starting to 3 or 4
    para try, push...

    exit
    # back to kksilo
    cd /cluster/data/dp2/bed/tblastn.dm1FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done
    cat u.*.psl m60* |  sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../blastDm1FB.psl

    ssh hgwdev
    cd /cluster/data/dp2/bed/tblastn.dm1FB

    hgLoadPsl dp2 blastDm1FB.psl
    echo "drop index tName on blastDm1FB ;" | hgsql dp2
    echo "drop index tName_2 on blastDm1FB ;" | hgsql dp2
    echo "drop index tName_3 on blastDm1FB ;" | hgsql dp2
    echo "create index tName on blastDm1FB (tName(9), bin) ;" | hgsql dp2
    echo "create index tName_2 on blastDm1FB  (tName(9), tStart);" | hgsql dp2
    echo "create index tName_3 on blastDm1FB  (tName(9), tEnd);" | hgsql dp2

# End tblastn


# GENEMAPPER PREDICTIONS FROM UCB (DONE 1/24/06 angie)
    ssh hgwdev
    mkdir /cluster/data/dp2/bed/geneMapper
    cd /cluster/data/dp2/bed/geneMapper
    wget http://bio.math.berkeley.edu/genemapper/GFF/rel0.2/DroPse_1.gff
    # Don't use -genePredExt... there are no start/stop_codon items, so 
    # all get marked "incmpl", and name2 always gets the same value as name.
    # Get rid of custom track header lines:
    egrep -v '^(track|browser) ' DroPse_1.gff \
    | ldHgGene -gtf dp2 geneMapper stdin


