# for emacs: -*- mode: sh; -*-

# elephant (Loxodonta africana)
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
#   Fetch the Broad assembly files (AGP, bases, quals) into
#   /cluster/data/loxAfr2/broad, checksum them, and build the quality
#   file that the makeGenomeDb.pl config refers to as qualFiles.
    ssh kolossus

    # The real directory lives under /hive; the symlink target is the
    # existing directory /cluster/data, so the link is created inside it
    # and the build is reachable as /cluster/data/loxAfr2 (the path used
    # by every later step).
    mkdir /hive/data/genomes/loxAfr2
    ln -s /hive/data/genomes/loxAfr2 /cluster/data
    mkdir /cluster/data/loxAfr2/broad
    cd /cluster/data/loxAfr2/broad

    # --timestamping makes a repeated wget skip files that are not newer
    # on the server than the local copy.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/assembly.quals.gz 
    # Record checksums of everything matching ass*.
    # NOTE(review): if this is re-run, the glob also matches the existing
    # assembly.md5sum itself.
    md5sum ass* > assembly.md5sum

    # Pipe the qaToQac output (written to stdout) into qacAgpLift together
    # with the AGP; the result, loxAfr2.qual.qac, is the file named as
    # qualFiles in loxAfr2.config.ra.
    # NOTE(review): presumably this lifts per-contig quality scores to
    # scaffold coordinates -- confirm against the tools' usage messages.
    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin loxAfr2.qual.qac

    # Attempt to fetch the assembly statistics file as well.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/BasicStats.out

# no BasicStats.out
#   (presumably: the server had no such file for this assembly, so the
#   scaffold count is derived from the AGP below instead)

   # Count scaffolds: column 1 of the AGP is collapsed to one line per run
   # of identical names, then the lines are counted.  The -c flag only
   # prefixes each line with its repeat count and does not change the
   # number of lines.
   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 225378


#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07)
#   Write the makeGenomeDb.pl configuration for loxAfr2 and launch the
#   database build in the background, logging to makeGenomeDb.out.
    ssh hgwdev
    # CAUTION: destructive -- removes any existing loxAfr2 database before
    # the build starts.
    echo "drop database loxAfr2" | hgsql ""
    ssh kolossus
    cd /cluster/data/loxAfr2
# Everything between the _EOF_ markers is written verbatim to
# loxAfr2.config.ra.  Lines beginning with '#' inside the here-document
# are file content, not shell comments (e.g. the "#mitoAcc AJ222767" line
# is kept in the file next to the active "mitoAcc none" setting).
# The fastaFiles/agpFiles/qualFiles paths point at the files prepared in
# /cluster/data/loxAfr2/broad by the download step.
cat << _EOF_ >loxAfr2.config.ra
# Config parameters for makeGenomeDb.pl:
db loxAfr2
clade mammal
genomeCladePriority 35
scientificName Loxodonta africana
commonName Elephant
assemblyDate Jul. 2008
assemblyLabel Broad Institute loxAfr2 
orderKey 346.1
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/loxAfr2/broad/assembly.bases.gz
agpFiles /cluster/data/loxAfr2/broad/assembly.agp
qualFiles /cluster/data/loxAfr2/broad/loxAfr2.qual.qac
dbDbSpeciesDir elephant
_EOF_

    # Run inside screen so the job survives a dropped login; stdout and
    # stderr both go to makeGenomeDb.out.
    screen
    makeGenomeDb.pl -verbose=2 loxAfr2.config.ra > makeGenomeDb.out 2>&1 &

    # when done
    # Summarize the scaffold sizes (column 2 of chrom.sizes).  The recorded
    # output follows; count (225378) agrees with the scaffold count taken
    # from the AGP in the download step.
    cut -f 2 chrom.sizes | ave stdin

# Q1 2133.000000
# median 5499.000000
# Q3 13596.000000
# average 18504.090248
# min 568.000000
# max 644881.000000
# count 225378
# total 4170414852.000000
# standard deviation 35140.147724

#########################################################################
# REPEATMASKER (DONE braney 2008-10-07)
#   Run doRepeatMasker.pl in two phases: the initial launch, then a
#   restart with "-continue cat" once the cluster jobs had completed.
#   The "para time" figures and the featureBits line are pasted output
#   kept as a record of the run.
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/loxAfr2/bed/repeatMasker
    cd /cluster/data/loxAfr2/bed/repeatMasker
    # First phase; all output is captured in do.log.
    doRepeatMasker.pl -buildDir=/cluster/data/loxAfr2/bed/repeatMasker \
        loxAfr2 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    # Check the cluster batch from the pk host; recorded output follows.
    ssh pk
    para time

# Completed: 9212 of 9212 jobs
# CPU time in finished jobs:   22535275s  375587.92m  6259.80h  260.82d  0.715 y
# IO & Wait Time:                 75555s    1259.25m    20.99h    0.87d  0.002 y
# Average job time:                2454s      40.91m     0.68h    0.03d
# Longest finished job:            7588s     126.47m     2.11h    0.09d
# Submission to last job:         80995s    1349.92m    22.50h    0.94d

    # Second phase, logged separately in do2.log.
    # NOTE(review): presumably "-continue cat" resumes the pipeline at its
    # "cat" step and "-workhorse kolossus" selects the host doing the heavy
    # work -- confirm against the doRepeatMasker.pl usage message.
    cd /cluster/data/loxAfr2/bed/repeatMasker
    doRepeatMasker.pl -workhorse kolossus -buildDir=/cluster/data/loxAfr2/bed/repeatMasker \
        loxAfr2 -continue cat > do2.log 2>&1 &

    ssh hgwdev
    cd /cluster/data/loxAfr2/bed/repeatMasker

    # Measure coverage of the rmsk table at lowest CPU priority; the result
    # is written to fb.loxAfr2.rmsk.txt and copied below for reference.
    time nice -n +19 featureBits loxAfr2 rmsk > fb.loxAfr2.rmsk.txt 2>&1 &
# 1088787467 bases of 2444975542 (44.532%) in intersection

#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-10-07)
#   Run doSimpleRepeat.pl (initial launch, then "-continue filter"), add
#   the resulting trfMask.bed to the RepeatMasker-masked 2bit to produce
#   the final loxAfr2.2bit, compare masking before and after, and link
#   the final file into /gbdb.
    ssh kolossus
    screen # use a screen to manage this job
    mkdir /cluster/data/loxAfr2/bed/simpleRepeat
    cd /cluster/data/loxAfr2/bed/simpleRepeat
    # First phase; all output is captured in do.log.
    doSimpleRepeat.pl -buildDir=/cluster/data/loxAfr2/bed/simpleRepeat \
	loxAfr2 > do.log 2>&1 &

    # Check the cluster batch from the pk host; recorded output follows.
    ssh pk
    para time
# Completed: 84 of 84 jobs
# CPU time in finished jobs:      43572s     726.19m    12.10h    0.50d  0.001 y
# IO & Wait Time:                  6346s     105.77m     1.76h    0.07d  0.000 y
# Average job time:                 594s       9.90m     0.17h    0.01d
# Longest finished job:            5352s      89.20m     1.49h    0.06d
# Submission to last job:          5409s      90.15m     1.50h    0.06d

  # Second phase, logged separately in do2.log.
  # NOTE(review): presumably resumes the pipeline at its "filter" step --
  # confirm against the doSimpleRepeat.pl usage message.
  doSimpleRepeat.pl -buildDir=/cluster/data/loxAfr2/bed/simpleRepeat \
        loxAfr2 -continue filter > do2.log 2>&1 &

    # Coverage of the simpleRepeat table; recorded output follows.
    featureBits loxAfr2 simpleRepeat
# 41337206 bases of 2444975542 (1.691%) in intersection

    #	after RM run is done, add this mask:
    #	input is the RepeatMasker-masked loxAfr2.rmsk.2bit, the TRF mask is
    #	added to it, and the combined result is written to loxAfr2.2bit
    cd /cluster/data/loxAfr2
    twoBitMask loxAfr2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed loxAfr2.2bit

    # Masking statistics of the final (RepeatMasker + TRF) sequence.
    twoBitToFa loxAfr2.2bit stdout | faSize stdin
# 4170414852 bases (1725439310 N's 2444975542 real 1366083755 upper 1078891787
# lower) in 225378 sequences in 1 files
# Total size: mean 18504.1 sd 35140.2 min 568 (scaffold_218703) max 644881
# (scaffold_0) median 5499
# N count: mean 7655.8 sd 14579.3
# U count: mean 6061.3 sd 13409.2
# L count: mean 4787.0 sd 9801.0
# %25.87 masked total, %44.13 masked real

    # Same statistics for the RepeatMasker-only sequence, for comparison:
    # the lower-case count here is 578011 bases smaller than above
    # (1078891787 - 1078313776), i.e. the amount added by the TRF mask.
    twoBitToFa loxAfr2.rmsk.2bit stdout | faSize stdin

# 4170414852 bases (1725439310 N's 2444975542 real 1366661766 upper 1078313776
# lower) in 225378 sequences in 1 files
# Total size: mean 18504.1 sd 35140.2 min 568 (scaffold_218703) max 644881
# (scaffold_0) median 5499
# N count: mean 7655.8 sd 14579.3
# U count: mean 6063.9 sd 13412.5
# L count: mean 4784.5 sd 9797.5
# %25.86 masked total, %44.10 masked real

    ssh hgwdev
    # Link to it from /gbdb
    # (rm -f first so a stale link or file from an earlier build does not
    # make ln fail)
    rm -f  /gbdb/loxAfr2/loxAfr2.2bit
    ln -s /cluster/data/loxAfr2/loxAfr2.2bit /gbdb/loxAfr2/loxAfr2.2bit

#########################################################################
# prepare for kluster runs (DONE - 2008-10-22 - Hiram)
#   Build the overused 11-mer (.ooc) file for blat with a repMatch value
#   scaled to this genome's size, then copy the files needed by cluster
#   jobs into the staging area.
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	loxAfr2: 2444975542
    # thus: 1024 * 2444975542/2881421696 = 868
    #	(about 868.9 before truncation)
    #	rounding up to 900 for a bit more conservative masking
    cd /hive/data/genomes/loxAfr2
    # Both the query and the output are /dev/null: the run exists only to
    # write loxAfr2.11.ooc via -makeOoc.
    time blat loxAfr2.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=loxAfr2.11.ooc -repMatch=900
    #	Wrote 38872 overused 11-mers to loxAfr2.11.ooc
    #	real    2m46.741s

    #	and staging data for push to kluster nodes
    #	(cp -p keeps the original timestamps and modes on the copies)
    mkdir /hive/data/staging/data/loxAfr2
    cp -p loxAfr2.2bit chrom.sizes loxAfr2.11.ooc \
	/hive/data/staging/data/loxAfr2
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-22 - Hiram)
#   Set the sourceName column of the loxAfr2 row in dbDb (database
#   hgcentraltest) so that it carries the NCBI project number and the
#   AAGU02000000 accession.
#   The SQL is one single-quoted shell argument that spans two lines, so
#   the double quotes inside it reach SQL unchanged; no shell comment can
#   be placed between the two lines.
    hgsql -e 'update dbDb set
sourceName="Broad Institute loxAfr2 (NCBI project 12569, AAGU02000000)" where name="loxAfr2";' hgcentraltest
###########################################################################
