# for emacs: -*- mode: sh; -*-

# sloth ( Choloepus hoffmanni )
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    # The commands in this section were run on kkstore05.
    ssh kkstore05
    # Create the assembly directory on store12 and link it into /cluster/data.
    # The link target /cluster/data is given as a directory, so the link is
    # created as /cluster/data/choHof1 (the path used by the commands below).
    mkdir /cluster/store12/choHof1
    ln -s /cluster/store12/choHof1 /cluster/data
    mkdir /cluster/data/choHof1/broad
    cd /cluster/data/choHof1/broad

    # Fetch the AGP, bases and quality files of the Broad ChoHof1.0 assembly.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/sloth/ChoHof1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/sloth/ChoHof1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/sloth/ChoHof1.0/assembly.quals.gz 
    # Record checksums of the downloaded files.
    # NOTE(review): on a re-run the ass* glob would also match
    # assembly.md5sum itself.
    md5sum ass* > assembly.md5sum

    # Build choHof1.qual.qac from the quality file and the AGP; this is the
    # file named by qualFiles in choHof1.config.ra.
    # (Presumably converts the quality scores to qac and lifts them to
    # scaffold coordinates via the AGP -- confirm against the tool usage.)
    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin choHof1.qual.qac

   # Count runs of identical values in column 1 of the AGP (the per-run
   # counts from uniq -c are not used; wc -l only counts the lines).
   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 481259

#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07)
    # The commands in this section were run on kolossus.
    ssh kolossus
    cd /cluster/data/choHof1
    # Write the makeGenomeDb.pl configuration file.  Everything up to the
    # _EOF_ line is file content, including the lines that start with '#'.
    # fastaFiles/agpFiles/qualFiles point at the files prepared in the
    # broad/ directory in the DOWNLOAD SEQUENCE section.
cat << _EOF_ >choHof1.config.ra
# Config parameters for makeGenomeDb.pl:
db choHof1
clade mammal
genomeCladePriority 35
scientificName Choloepus hoffmanni
commonName Sloth
assemblyDate Jul. 2008
assemblyLabel Broad Institute choHof1 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/choHof1/broad/assembly.bases.gz
agpFiles /cluster/data/choHof1/broad/assembly.agp
qualFiles /cluster/data/choHof1/broad/choHof1.qual.qac
dbDbSpeciesDir sloth
_EOF_

    # Run the build in the background; stdout and stderr both go to
    # makeGenomeDb.out.
    makeGenomeDb.pl -workhorse kolossus choHof1.config.ra > makeGenomeDb.out 2>&1 &

    # Summary statistics over column 2 of chrom.sizes; the recorded output
    # follows.  The count matches the 481259 scaffolds counted from the AGP.
    # (chrom.sizes is presumably written by the makeGenomeDb.pl run above, so
    # this is run after that job has finished -- confirm.)
    cut -f 2 chrom.sizes | ave stdin
# Q1 1235.000000
# median 2269.000000
# Q3 6207.000000
# average 5109.364438
# min 250.000000
# max 197267.000000
# count 481259
# total 2458927620.000000
# standard deviation 6671.452585

#########################################################################
## Repeat Masker (DONE - 2008-10-16 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/choHof1/bed/repeatMasker
    cd /hive/data/genomes/choHof1/bed/repeatMasker
    # Launch doRepeatMasker.pl in the background with the current directory
    # as its build directory, hgwdev as workhorse and swarm as the big
    # cluster hub; stdout and stderr are captured in do.log.
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` choHof1 > do.log 2>&1 &
    #	real    907m8.655s
    # Size and masking summary of choHof1.rmsk.2bit, saved in
    # faSize.rmsk.txt; the recorded output follows.
    # NOTE(review): the SIMPLE REPEATS TRF section reads choHof1.rmsk.2bit
    # from /hive/data/genomes/choHof1 -- confirm which directory this
    # command was run in.
    twoBitToFa choHof1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 2458927620 bases (398507935 N's 2060419685 real 1325807439 upper 734612246
# lower) in 481259 sequences in 1 files
# %29.88 masked total, %35.65 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-15 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/choHof1/bed/simpleRepeat
    cd /hive/data/genomes/choHof1/bed/simpleRepeat
    # Launch doSimpleRepeat.pl in the background with this directory as its
    # build directory; stdout and stderr are captured in do.log.
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/hive/data/genomes/choHof1/bed/simpleRepeat choHof1 \
	> do.log 2>&1 &
    #	real    276m12.206s
    # Coverage summary left in the build directory; recorded output follows.
    cat fb.simpleRepeat
    #	32246798 bases of 2060419685 (1.565%) in intersection

    #	after RM run is done, add this mask:
    # Replace choHof1.2bit with one built from the RepeatMasker-masked
    # choHof1.rmsk.2bit plus the TRF mask in bed/simpleRepeat/trfMask.bed.
    cd /hive/data/genomes/choHof1
    rm choHof1.2bit
    twoBitMask choHof1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed choHof1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    # Size and masking summary of the final choHof1.2bit; the lower-case
    # count rises from 734612246 (rmsk only) to 735002711 after adding TRF.
    twoBitToFa choHof1.2bit stdout | faSize stdin > choHof1.2bit.faSize.txt
# 2458927620 bases (398507935 N's 2060419685 real 1325416974 upper 735002711
# lower) in 481259 sequences in 1 files
# %29.89 masked total, %35.67 masked real

    #	link to gbdb
    # `pwd` is /hive/data/genomes/choHof1 here (from the cd above).
    # (Presumably /gbdb/choHof1 is an existing directory, so the link is
    # created inside it -- confirm.)
    ln -s `pwd`/choHof1.2bit /gbdb/choHof1

###########################################################################
# prepare for kluster runs (DONE - 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	choHof1: 2060419685
    # thus: 1024 * 2060419685/2881421696 = 732
    #	rounding up to 800 for a bit more conservative masking
    cd /hive/data/genomes/choHof1
    # Write the overused 11-mers of choHof1.2bit to choHof1.11.ooc using the
    # repMatch value derived above; no query or output file is needed, hence
    # the two /dev/null arguments.
    time blat choHof1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=choHof1.11.ooc -repMatch=800
    #	Wrote 34983 overused 11-mers to choHof1.11.ooc
    #	real    2m5.136s

    #	and staging data for push to kluster nodes
    # cp -p keeps the mode, ownership and timestamps of the copied files.
    mkdir /hive/data/staging/data/choHof1
    cp -p choHof1.2bit chrom.sizes choHof1.11.ooc \
	/hive/data/staging/data/choHof1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/
