# for emacs: -*- mode: sh; -*-

# Oreochromis niloticus  - Nile Tilapia
# ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Oreochromis_niloticus
# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AERX01
#

#############################################################################
# Download sequence from NCBI  (DONE - 2011-04-20 - Hiram)
mkdir -p /hive/data/genomes/oreNil1/genbank
cd /hive/data/genomes/oreNil1/genbank

wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Oreochromis_niloticus/Orenil1.0/*"

    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz

# 927725912 bases (111657865 N's 816068047 real 816068047 upper 0 lower)
#	in 5900 sequences in 1 files

    zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| grep "^#" > ucsc.agp
    # remove the .1 or .2 accession versions from the scaffold/contig names
    # our database tables can not function with periods in the scaffold names
    zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| grep -v "^#" | sed -e "s/\.[12]//" >> ucsc.agp
    gzip ucsc.agp
    # extract the scaffold/contig name and remove the .1 or .2 accession version
    zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
	| sed -e "s/^>.*GL83/>GL83/; s/^>.*AERX01/>AERX01/; s/\.[12].*//" \
	    > ucsc.fa
    gzip ucsc.fa
    # verify nothing lost in the translation, should be same numbers as above
    faSize ucsc.fa.gz

# 927725912 bases (111657865 N's 816068047 real 816068047 upper 0 lower) in 5900 sequences in 1 files
    # verify the names are OK
    zcat ucsc.fa.gz | grep "^>" | sort | head
    zcat ucsc.fa.gz | grep "^>" | sort | tail

#############################################################################
#  Initial database build (DONE - 2011-09-08 - Hiram)

    cd /hive/data/genomes/oreNil1
    cat << '_EOF_' > oreNil1.config.ra
# Config parameters for makeGenomeDb.pl:
db oreNil1
clade vertebrate
genomeCladePriority 85
scientificName Oreochromis niloticus
commonName Nile tilapia
assemblyDate Jan. 2011
assemblyLabel Broad Institute of MIT and Harvard Orenil1.0 (GCA_000188235.1)
assemblyShortLabel Nile tilapia
orderKey 468
mitoAcc NC_013663
fastaFiles /cluster/data/oreNil1/genbank/ucsc.fa.gz
agpFiles /cluster/data/oreNil1/genbank/ucsc.agp
dbDbSpeciesDir oreNil
taxId 8128
'_EOF_'
    # << happy emacs

    # verify sequence and agp are OK
    time makeGenomeDb.pl -stop=agp oreNil1.config.ra > agp.log 2>&1
    #	real    1m26.563s

    time makeGenomeDb.pl -continue=db oreNil1.config.ra > db.log 2>&1
    #	about 9 minutes

##########################################################################
# running repeat masker (DONE - 2011-11-30 - Hiram)
    mkdir /hive/data/genomes/oreNil1/bed/repeatMasker
    cd /hive/data/genomes/oreNil1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk oreNil1 > do.log 2>&1 &
    #	real    73m53.508s

XXX - running - Wed Nov 30 09:27:18 PST 2011
    #	real    35m31.331s
    cat faSize.rmsk.txt
    #	927742539 bases (111657865 N's 816084674 real 786293778 upper
    #		29790896 lower) in 5901 sequences in 1 files
    #	%3.21 masked total, %3.65 masked real

    egrep -i "versi|relea" do.log
#    April 26 2011 (open-3-3-0) version of RepeatMasker
# CC   RELEASE 20110920; 
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    featureBits -countGaps oreNil1 rmsk
    #	30136368 bases of 927742539 (3.248%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

    #	rmsk is masking N's sometimes:
    featureBits oreNil1 -not gap -bed=notGap.bed
    #	816084674 bases of 816084674 (100.000%) in intersection

    featureBits oreNil1  rmsk notGap.bed -bed=stdout \
	| gzip -c > cleanRmsk.bed.gz
    #	29790896 bases of 816084674 (3.650%) in intersection

    #	rmsk bases: 30136368 minus non-N bases 29790896 == N's masked: 345472

##########################################################################
# running simple repeat (DONE - 2011-11-30 - Hiram)
    mkdir /hive/data/genomes/oreNil1/bed/simpleRepeat
    cd /hive/data/genomes/oreNil1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	oreNil1 > do.log 2>&1 &
    #	real    11m55.138s
    cat fb.simpleRepeat 
    #	7954018 bases of 816084674 (0.975%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/oreNil1
    twoBitMask oreNil1.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed oreNil1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa oreNil1.2bit stdout | faSize stdin > faSize.oreNil1.2bit.txt
    cat faSize.oreNil1.2bit.txt
    #	1511735326 bases (153400444 N's 1358334882 real 1024824487 upper
    #	333510395 lower) in 19550 sequences in 1 files
    #	%22.06 masked total, %24.55 masked real

    rm /gbdb/oreNil1/oreNil1.2bit
    ln -s `pwd`/oreNil1.2bit /gbdb/oreNil1/oreNil1.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2011-11-30 - Hiram)
    mkdir /hive/data/genomes/oreNil1/bed/gap
    cd /hive/data/genomes/oreNil1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../oreNil1.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m9.711s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits oreNil1 -not gap -bed=notGap.bed
    #	816084674 bases of 816084674 (100.000%) in intersection
    featureBits oreNil1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	0 bases of 816084674 (0.000%) in intersection

    # No new gaps to mark.  To see the complete sequence here if there are
    #	some, look in xenTro3.txt

##########################################################################
## WINDOWMASKER (DONE - 2011-11-30 - Hiram)
    mkdir /hive/data/genomes/oreNil1/bed/windowMasker
    cd /hive/data/genomes/oreNil1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev oreNil1 > do.log 2>&1 &
    #	real    55m3.404s

    # Masking statistics
    twoBitToFa oreNil1.wmsk.2bit stdout | faSize stdin
    #	927742539 bases (111657865 N's 816084674 real 612807423 upper
    #	203277251 lower) in 5901 sequences in 1 files
    #	%21.91 masked total, %24.91 masked real

    twoBitToFa oreNil1.wmsk.sdust.2bit stdout | faSize stdin
    #	927742539 bases (111657865 N's 816084674 real 608110777 upper
    #	207973897 lower) in 5901 sequences in 1 files
    #	%22.42 masked total, %25.48 masked real

    hgLoadBed oreNil1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 5156979 elements of size 3

    featureBits -countGaps oreNil1 windowmaskerSdust
    #	319631762 bases of 927742539 (34.453%) in intersection

    #	eliminate the gaps from the masking
    featureBits oreNil1 -not gap -bed=notGap.bed
    #	816084674 bases of 816084674 (100.000%) in intersection
    time nice -n +19 featureBits oreNil1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	 207973897 bases of 816084674 (25.484%) in intersection
    #	real    1m44.710s
    #	reload track to get it clean
    hgLoadBed oreNil1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 5181457 elements of size 4
    featureBits -countGaps oreNil1 windowmaskerSdust
    #	207973897 bases of 927742539 (22.417%) in intersection
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../oreNil1.unmasked.2bit stdin \
	    -type=.bed oreNil1.cleanWMSdust.2bit
    twoBitToFa oreNil1.cleanWMSdust.2bit stdout | faSize stdin \
        > oreNil1.cleanWMSdust.faSize.txt
    cat oreNil1.cleanWMSdust.faSize.txt
    #	927742539 bases (111657865 N's 816084674 real 608110777 upper
    #	207973897 lower) in 5901 sequences in 1 files
    #	%22.42 masked total, %25.48 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps oreNil1 rmsk windowmaskerSdust
    #	21441295 bases of 927742539 (2.311%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2011-11-30 - Hiram)
    #	since rmsk masks so very little of the genome, use the clean WM mask
    #	on the genome sequence
    cd /hive/data/genomes/oreNil1
    twoBitMask -add bed/windowMasker/oreNil1.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed oreNil1.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa oreNil1.2bit stdout | faSize stdin > faSize.oreNil1.txt
    cat faSize.oreNil1.txt
    #	927742539 bases (111657865 N's 816084674 real 607935881 upper
    #	208148793 lower) in 5901 sequences in 1 files
    #	%22.44 masked total, %25.51 masked real

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/oreNil1/oreNil1.2bit
    ln -s `pwd`/oreNil1.2bit /gbdb/oreNil1/oreNil1.2bit

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2011-11-30 - Hiram)
    # Use -repMatch=650, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 816084674 / 2897316137 \) \* 1024
    #	( 816084674 / 2897316137 ) * 1024 = 288.429245
    # round up to 300

    cd /hive/data/genomes/oreNil1
    blat oreNil1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/oreNil1.11.ooc -repMatch=300
    #	Wrote 16716 overused 11-mers to jkStuff/oreNil1.11.ooc

    #	copy all of this stuff to the klusters:
    # there are no non-bridged gaps
    hgsql -N -e "select bridge from gap;" oreNil1 | sort | uniq -c
    #	71854 yes
    #	If there were:
    #	cd /hive/data/genomes/oreNil1/jkStuff
    #	gapToLift oreNil1 nonBridged.lift -bedFile=nonBridged.bed
    cd /hive/data/genomes/oreNil1
    mkdir /hive/data/staging/data/oreNil1
    cp -p jkStuff/oreNil1.11.ooc chrom.sizes \
	oreNil1.2bit /hive/data/staging/data/oreNil1
    # request rsync copy from cluster admin

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2011-12-02 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("oreNil1", "blat1", "17816", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("oreNil1", "blat1", "17817", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# set default position on the RHOA gene sequence (via blat)
#	(DONE - 2011-12-02 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="GL831146:690296-694012" where name="oreNil1";' \
	hgcentraltest

############################################################################
