# for emacs: -*- mode: sh; -*-

# Drosophila virilis -- Agencourt "CAF1" via Eisen's 12-fly site

# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT


#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/21/06 angie)
    ssh kkstore05
    mkdir /cluster/store12/droVir3
    ln -s /cluster/store12/droVir3 /cluster/data/droVir3
    mkdir /cluster/data/droVir3/downloads
    cd /cluster/data/droVir3/downloads
    wget http://rana.lbl.gov/drosophila/caf1/dvir_caf1.tar.gz
    tar xvzf dvir_caf1.tar.gz
    cd dvir
    faSize scaffolds.bases
#206026697 bases (16820834 N's 189205863 real 189205863 upper 0 lower) in 13530 sequences in 1 files
#Total size: mean 15227.4 sd 412436.0 min 43 (scaffold_474) max 25233164 (scaffold_13049) median 1215
#N count: mean 1243.2 sd 14416.5
#U count: mean 13984.2 sd 405263.9
#L count: mean 0.0 sd 0.0


#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/21/06 angie)
    ssh kkstore05
    cd /cluster/data/droVir3
    cat > droVir3.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db droVir3
clade insect
scientificName Drosophila virilis
assemblyDate Feb. 2006
assemblyLabel Agencourt CAF1
orderKey 57
mitoAcc none
fastaFiles /cluster/data/droVir3/downloads/dvir/scaffolds.bases
agpFiles /cluster/data/droVir3/downloads/dvir/assembly.agp
dbDbSpeciesDir drosophila
EOF

    # Stop at db step so we can use featureBits, but don't do dbDb and trackDb
    # because we're not building an actual browser for now.
    makeGenomeDb.pl droVir3.config.ra -stop=db \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log


#########################################################################
# REPEATMASKER (DONE 9/21/06 angie)
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    doRepeatMasker.pl droVir3 -verbose 3 -debug
    # Run it for real and tail the log:
    doRepeatMasker.pl droVir3 -species drosophila -verbose 3 \
      >& /cluster/data/droVir3/bed/RepeatMasker.2006-09-21/do.log &
    tail -f /cluster/data/droVir3/bed/RepeatMasker.2006-09-21/do.log
    # RepeatMasker and lib version from do.log:
#    March 20 2006 (open-3-1-5) version of RepeatMasker
#CC   RELEASE 20060315;                                            *
    # Compare coverage to previous assembly:
    featureBits -chrom=scaffold_13049 droVir3 rmsk
#2148620 bases of 24930185 (8.619%) in intersection
    featureBits -chrom=scaffold_13049 droVir2 rmsk
#2157147 bases of 24956889 (8.643%) in intersection


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/21/06 angie)
    ssh kolossus
    nice tcsh
    mkdir /cluster/data/droVir3/bed/simpleRepeat
    cd /cluster/data/droVir3/bed/simpleRepeat
    twoBitToFa ../../droVir3.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log & tail -f trf.log
    # ~100 minutes (longer than D. mel, must be because of the scaffolds)

    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed droVir3 simpleRepeat \
      /cluster/data/droVir3/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Compare coverage to previous assembly:
    featureBits -chrom=scaffold_13049 droVir3 simpleRepeat
#935315 bases of 24930185 (3.752%) in intersection
    featureBits -chrom=scaffold_13049 droVir2 simpleRepeat
#933182 bases of 24956889 (3.739%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/21/06 angie)
    ssh kolossus
    cd /cluster/data/droVir3
    time twoBitMask droVir3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droVir3.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.301u 0.301s 0:01.91 31.4%     0+0k 0+0io 0pf+0w

    # Because this is a no-browser build (just masking for alignment)
    # I did not make the usual /gbdb/$db/$db.2bit link.


###########################################################################
# WINDOWMASKER EXPERIMENT (DONE 10/23/06 angie)
    # The droVir3-droGri2 blastz run  was just destroyed by mega-output,
    # even with -chainFilterMinScore=10000 and M=50 (trying M=20 but not
    # too hopeful)... so let's try a de-novo masker before alignment:
    ssh kolossus
    mkdir /cluster/data/droVir3/bed/windowmasker.2006-10-23
    cd /cluster/data/droVir3/bed/windowmasker.2006-10-23
    twoBitToFa /cluster/data/droVir3/droVir3.2bit tmp.fa
    # First, collect counts:
    time /cluster/bin/x86_64/windowmasker -mk_counts true -input tmp.fa \
      -output wm.counts
#116.091u 3.329s 2:01.89 97.9%   0+0k 0+0io 115pf+0w
    # Then use those counts to mask sequence:
    time /cluster/bin/x86_64/windowmasker -ustat wm.counts -input tmp.fa \
      -output wm.intervals 
#194.555u 1.303s 3:17.49 99.1%   0+0k 0+0io 0pf+0w
    perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
               if (/^(\d+) - (\d+)/) { \
                 $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
               }' wm.intervals > windowmasker.bed
    # Quick coverage:
    awk '{print $3 - $2;}' windowmasker.bed | total
#66922828
    awk '{print $2;}' ../../chrom.sizes  | total
#206026697
    calc 66922828 / 206026697
#66922828 / 206026697 = 0.324826
    # Make a masked .2bit:
    twoBitMask ../../droVir3.2bit windowmasker.bed ../../droVir3.WM.2bit

    # Now try with -sdust to additionally mask low-complexity sequence:
    time /cluster/bin/x86_64/windowmasker -ustat wm.counts -sdust true \
       -input tmp.fa -output wm.sdust.intervals
#545.704u 1.433s 9:09.64 99.5%   0+0k 0+0io 0pf+0w
    perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
               if (/^(\d+) - (\d+)/) { \
                 $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
               }' wm.sdust.intervals > windowmasker.sdust.bed
    awk '{print $3 - $2;}' windowmasker.sdust.bed | total
#86036805
    calc 86036805 / 206026697
#86036805 / 206026697 = 0.417600
    # Make a masked .2bit (even if we don't end up needing it):
    twoBitMask ../../droVir3.2bit windowmasker.sdust.bed \
      ../../droVir3.WMSDust.2bit
    rm tmp.fa


###########################################################################
# BLASTZ/CHAIN/NET DROGRI2 (DONE 10/23/06 angie)
    # Using WindowMasker'd sequence since prior runs died due to massive 
    # output (netChainSubset out of memory), even with chainMinScore=10000
    # or chainMinScore=8000 + M=20.
    ssh kkstore05
    mkdir /cluster/data/droVir3/bed/blastz.droGri2.2006-10-23
    cd /cluster/data/droVir3/bed/blastz.droGri2.2006-10-23
    cat << '_EOF_' > DEF
# D. virilis vs. D. grimshawi

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. virilis
SEQ1_DIR=/san/sanvol1/scratch/droVir3/droVir3.WM.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/droVir3/chrom.sizes
SEQ1_LIMIT=50

# QUERY - D. grimshawi
SEQ2_DIR=/san/sanvol1/scratch/droGri2/droGri2.WM.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/san/sanvol1/scratch/droGri2/chrom.sizes
SEQ2_LIMIT=200

BASE=/cluster/data/droVir3/bed/blastz.droGri2.2006-10-23
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub=pk \
      -blastzOutRoot /san/sanvol1/scratch/droVir3droGri2 >& do.log &
    tail -f do.log
    ln -s blastz.droGri2.2006-10-23 /cluster/data/droVir3/bed/blastz.droGri2


#########################################################################
# SWAP DM3 CHAIN/NET (DONE 4/6/09 angie)
    mkdir /hive/data/genomes/droVir3/bed/blastz.dm3.swap
    cd /hive/data/genomes/droVir3/bed/blastz.dm3.swap
    doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \
      -workhorse kolossus \
      /hive/data/genomes/dm3/bed/blastz.droVir3/DEF.hive >& do.log &
    tail -f do.log
    ln -s blastz.dm3.swap /hive/data/genomes/droVir3/bed/blastz.dm3


#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2011-01-17 -Chin)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # worm size is ~6.5% of human judging by gapless ce4 vs. hg18 genome 
    # size from featureBits. So we would use 64, but that yields a very
    # high number of tiles to ignore, especially for a small more compact 
    # genome.  Bump that up a bit to be more conservative.
    cd /hive/data/genomes/droVir3
    blat droVir3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/droVir3.11.ooc -repMatch=100
    # Wrote 14348 overused 11-mers to jkStuff/droVir3.11.ooc
    #   Wrote 8514 overused 11-mers to jkStuff/11.ooc

    mkdir /hive/data/staging/data/droVir3
    cd  /hive/data/genomes/droVir3
    cp -p droVir3.2bit jkStuff/droVir3.11.ooc chrom.sizes /hive/data/staging/data/droVir3

    #   request rsync to cluster nodes


############################################################################
# LIFTOVER TO droVir2 (DONE - 2011-01-17 - Chin )

    mkdir /hive/data/genomes/droVir3/bed/blat.droVir2.2011-01-15
    cd /hive/data/genomes/droVir3/bed/blat.droVir2.2011-01-15
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl \
        -buildDir=`pwd` \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/droVir3/jkStuff/droVir3.11.ooc -debug droVir3 droVir2
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl \
        -buildDir=`pwd` \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/droVir3/jkStuff/droVir3.11.ooc droVir3 droVir2 > do.log 2>&1
    # real    29m40.917s
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/droVir3/liftOver
    md5sum *.gz > md5sum.txt

#########################################################################
