# for emacs: -*- mode: sh; -*-

# Ornithorhynchus anatinus -- WUSTL version 5.0 (Dec. 2006)


#########################################################################
# DOWNLOAD SEQUENCE (DONE 12/15/06 angie - REDONE 2/5/07 angie)
# Fetch the WUSTL platypus assembly (fasta + agp) into the build area.
# NOTE: commands throughout this makeDoc are csh/tcsh ("set var = ...",
# ">&", "foreach/end") and are a record of what was run on the UCSC
# cluster hosts -- not a script to be executed directly.
    ssh kkstore05
    mkdir /cluster/store12/ornAna1
    ln -s /cluster/store12/ornAna1 /cluster/data/ornAna1
    mkdir /cluster/data/ornAna1/downloads
    cd /cluster/data/ornAna1/downloads
    set baseUrl = http://genome.wustl.edu/pub/user/lhillier/private/platypus/
    wget --timestamping $baseUrl/platypus.tar.gz
    wget --timestamping $baseUrl/sctgs_hit_by_ESTs.not_yet_submitted.gz
    gunzip sctgs_hit_by_ESTs.not_yet_submitted.gz
    tar xvzf platypus.tar.gz
    # Sanity-check the sequence; "#"-prefixed lines below are captured output.
    faSize *.fa
#2016937414 bases (174725237 N's 1842212177 real 1842212177 upper 0 lower) in 20 sequences in 20 files
#Total size: mean 100846870.7 sd 348736694.0 min 1399469 (chr17) max 1579857390 (chrUn) median 16302927
#N count: mean 8736261.8 sd 32604763.8
#U count: mean 92110608.8 sd 316138018.7
#L count: mean 0.0 sd 0.0
    # Wow, chrUn is 1.5G even with 100bp contig gaps... that won't cause 
    # the browser to die (though it did provoke an integer-overflow -> 
    # infinite loop in faSize!) but it seems like it would be nicer to 
    # have sequence for just the super- and ultra-contigs for this assembly.
    # OK, LaDeana made new chrUn files that contain ultracontigs...
    wget --timestamping $baseUrl/chrUn.agp.gz
    wget --timestamping $baseUrl/chrUn.fa.gz
    mv chrUn.fa chrUn.fa.orig
    mv chrUn.agp chrUn.agp.orig
    gunzip chrUn.fa.gz chrUn.agp.gz
    # checkAgpAndFa found a slight hiccup: chrUn.fa had an empty Ultra673 
    # which was not in any .agp file.  Removed Ultra673 from chrUn.fa -- 
    # LaDeana confirmed that's OK.

    # RELOADING 2/5/07 to pick up slight changes to chr5 and chrUn so we 
    # will be completely in sync with GenBank
    cd /cluster/data/ornAna1
    mv downloads downloads.bak
    mkdir downloads
    cd downloads
    wget --timestamping $baseUrl/platypus.070113.tar.gz
    tar xvzf platypus.070113.tar.gz
    faSize *.fa
#Invalid fasta format: sequence size == 0 for element Ultra673
#1996794193 bases (154575690 N's 1842218503 real 1842218503 upper 0 lower) in 201523 sequences in 20 files
#Total size: mean 9908.5 sd 331992.5 min 0 (Ultra673) max 59581953 (chr3) median 1069
#N count: mean 767.0 sd 20573.1
#U count: mean 9141.5 sd 311758.8
#L count: mean 0.0 sd 0.0
    # I edited out the empty Ultra673 (LaDeana OK'd).


#########################################################################
# MAKE GENOME DB (DONE 12/19/06 angie - REDONE 2/5/07 angie)
# Create the ornAna1 browser database from the downloaded fasta/agp,
# driven by a .ra config file for makeGenomeDb.pl.
    ssh kkstore05
    cd /cluster/data/ornAna1
    cat > ornAna1.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db ornAna1
clade vertebrate
genomeCladePriority 335
commonName Platypus
scientificName Ornithorhynchus anatinus
assemblyDate Dec. 2006
assemblyLabel WUSTL version 6.0?
orderKey 365
mitoAcc 5836058
fastaFiles /cluster/data/ornAna1/downloads/chr*.fa
agpFiles /cluster/data/ornAna1/downloads/chr*.agp
dbDbSpeciesDir platypus
EOF
    # NOTE(review): assemblyLabel above says "WUSTL version 6.0?" while the
    # top of this file says "WUSTL version 5.0" -- verify which is correct.

    ~/kent/src/hg/utils/automation/makeGenomeDb.pl ornAna1.config.ra \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log
    # Followed instructions printed out at the end of the script for 
    # editing the template description files and checked them in.


#########################################################################
# RELOAD GOLD/GAP, REBUILD DOWNLOAD AFTER AGP TWEAK (DONE 1/8/07 angie - OBSOLETED BY TOTAL REBUILD)
# LaDeana sent a manual tweak to chr5.agp:  change "p" to "-" for
# this one entry:
# chr5    647409    657392    203    W    Contig2532.3    1    9984    p
# So reload the agp, re-compress for download.
    ssh kkstore05
    cd /cluster/data/ornAna1
    # Make the necessary edits (hand-edit both copies of the agp):
    $EDITOR downloads/chr5.agp
    $EDITOR ornAna1.agp
    # Reload the table
    ssh -x hgwdev hgGoldGapGl -noGl ornAna1 /cluster/data/ornAna1/ornAna1.agp
    # Regenerate the download file and update md5sum.txt:
    cd /cluster/data/ornAna1/goldenPath/bigZips
    gzip -c /cluster/data/ornAna1/ornAna1.agp > ornAna1.agp.gz
    md5sum ornAna1.agp.gz
#e8539f496b2343c5e802b2362e61c9a3  ornAna1.agp.gz
    # Paste the new checksum into md5sum.txt by hand:
    $EDITOR md5sum.txt


#########################################################################
# REPEATMASKER (DONE 12/20/06 angie - REDONE 2/5/07)
# REPLACED 7/20/07 by beta lib re-run -- see REPEATMASKER BETA TEST below.
# Standard RepeatMasker run via the doRepeatMasker.pl automation script.
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    doRepeatMasker.pl ornAna1 -verbose 3 -debug
    # Run it for real and tail the log:
    doRepeatMasker.pl ornAna1 -verbose 3 \
      >& /cluster/data/ornAna1/bed/RepeatMasker.2007-02-05/do.log &
    tail -f /cluster/data/ornAna1/bed/RepeatMasker.2007-02-05/do.log
    # RepeatMasker and lib version from do.log:
#grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker
#    October 6 2006 (open-3-1-6) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC   RELEASE 20061006;                                            *
    # Coverage:
    featureBits ornAna1 rmsk
#633511004 bases of 1842236818 (34.388%) in intersection


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 12/20/06 angie - REDONE 2/6/07)
#                      (dropped chromEnd index 07-10-07 kuhn)
    # The first time around I just piped twoBitToFa|trfBig but due to the
    # large number of scaffolds, that took 12.5 hours.  This time, make it
    # a small cluster job.
    ssh kki
    mkdir /cluster/data/ornAna1/bed/simpleRepeat
    cd /cluster/data/ornAna1/bed/simpleRepeat
    # no splitting -- choose a chunk size equal to the largest sequence:
    set chunkSize = `awk '{print $2;}' ../../chrom.sizes | sort -nr | head -1`
    ~/kent/src/hg/utils/automation/simplePartition.pl ../../ornAna1.unmasked.2bit $chunkSize chunks
    # Parasol job template: one doTrfBig.csh run per chunk .lst file.
    cat > gsub <<'_EOF_'
#LOOP
./doTrfBig.csh {check in exists $(path1)} {check out exists $(path1).bed}
#ENDLOOP
'_EOF_'
    # << emacs
    # Per-chunk worker: extract the chunk's sequences and run trfBig on them.
    # The sed presumably strips a leading "<2bitFile>:" prefix from each .lst
    # line, leaving bare sequence names for twoBitToFa -seqList (TODO confirm
    # against simplePartition.pl output format).
    cat > doTrfBig.csh <<'_EOF_'
#!/bin/csh -ef

sed -e 's/^.*://' $1 \
| twoBitToFa ../../ornAna1.unmasked.2bit -seqList=stdin stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=$2 -tempDir=/tmp
'_EOF_'
    # << emacs
    chmod a+x doTrfBig.csh
    ls -1S chunks/*/*.lst > chunks.lst
    gensub2 chunks.lst single gsub jobList
    para make jobList
    para time
#Completed: 36 of 36 jobs
#CPU time in finished jobs:      13498s     224.96m     3.75h    0.16d  0.000 y
#IO & Wait Time:                   749s      12.49m     0.21h    0.01d  0.000 y
#Average job time:                 396s       6.60m     0.11h    0.00d
#Longest finished job:            1017s      16.95m     0.28h    0.01d
#Submission to last job:          1017s      16.95m     0.28h    0.01d
    cat chunks/*/*.bed > simpleRepeat.bed

    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed ornAna1 simpleRepeat \
      /cluster/data/ornAna1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits ornAna1 simpleRepeat
#38774128 bases of 1842236818 (2.105%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 12/20/06 angie - REDONE 2/6/07 - REDONE 7/24/07)
# new assembly in Feb, new RM lib in July
# Combine the RepeatMasker-masked 2bit with the filtered TRF mask to make
# the final ornAna1.2bit, then link it where the browser expects it.
    ssh kolossus
    cd /cluster/data/ornAna1
    time twoBitMask ornAna1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ornAna1.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#5.490u 3.275s 0:18.91 46.3%     0+0k 0+0io 4pf+0w
    # Link to it from /gbdb:
    ssh hgwdev
    ln -s /cluster/data/ornAna1/ornAna1.2bit /gbdb/ornAna1/ornAna1.2bit


#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 12/21/06 angie - REDONE 2/6/07 - REDONE 7/24/07)
# new assembly in Feb, new RM lib in July
# Build the goldenPath download files with makeDownloads.pl, preserving
# any hand-edited README.txt files across the rerun.
    cd /cluster/data/ornAna1

    # If rerunning, save the edited README.txt files aside...
    mkdir tmpReadmes
    foreach d (bigZips database)
      mkdir -p tmpReadmes/$d
      cp -p goldenPath/$d/README.txt tmpReadmes/$d/README.txt
    end
    # Replace the stale .out link with one pointing at the 7/17 beta RM run:
    rm -f ornAna1.fa.out

    ln -s bed/RepeatMasker.2007-07-17/ornAna1.fa.out .
    makeDownloads.pl ornAna1 -verbose=2 \
      >& jkStuff/downloads.log & tail -f jkStuff/downloads.log
    # Edit README.txt files when done:
    # If rerunning, use info in tmpReadmes/ files but update where appropriate.
# *** Edit each README.txt to resolve any notes marked with "***":
#     /cluster/data/ornAna1/goldenPath/database/README.txt
#     /cluster/data/ornAna1/goldenPath/bigZips/README.txt
    rm -rf tmpReadmes


#########################################################################
# WINDOWMASKER FOR COMPARISON (IN PROGRESS 1/11/07 angie)
# Run WindowMasker (k-mer-frequency masking, no repeat library) and compare
# its masking coverage with RepeatMasker's.
    cd /cluster/data/ornAna1
    ~/kent/src/hg/utils/automation/doWindowMasker.pl -debug ornAna1
    ~/kent/src/hg/utils/automation/doWindowMasker.pl ornAna1 \
      >& bed/WindowMasker.2006-12-22/do.log &
    tail -f bed/WindowMasker.2006-12-22/do.log
    # Needs -workhorse kolossus so it can connect to the NCBI server...
    #     ~/kent/src/hg/utils/automation/doWindowMasker.pl ornAna1 \
    # -buildDir must be the build directory itself (the original command
    # mistakenly pointed it at the do.log file inside it):
    ~/kent/src/hg/utils/automation/doWindowMasker.pl ornAna1 \
      -buildDir /cluster/data/ornAna1/bed/WindowMasker.2006-12-22 \
      -workhorse kolossus -continue mask \
      >& bed/WindowMasker.2006-12-22/do.log &
    tail -f bed/WindowMasker.2006-12-22/do.log
    cd bed/WindowMasker.2006-12-22
    # Compare coverage to rmsk which was 633510998 / 1842236818 = 34.388%
    # NOTE(review): the REPEATMASKER section above recorded 633511004 bases
    # for rmsk -- a 6-base discrepancy with the figure on the previous line;
    # likely different featureBits runs.  The percentage is 34.388% either way.
    zcat windowmasker.bed.gz | awk '{print $3 - $2}' | total
#844961729
    # 844961729 / 1842236818 = 45.866%
    zcat windowmasker.sdust.bed.gz | awk '{print $3 - $2}' | total
#1011460283
    # 1011460283 / 1842236818 = 54.904%
    # The coverage is higher, but do we really want to mask 55% of the 
    # assembly before alignment??
#TODO - evaluate!!


#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 12/22/06 angie - REDONE 2/6/07)
# (probably frivolous to redo because the assembly changes were so small, 
# but redo anyway so it's reproducible later.)
    # Use -repMatch=608 
    # To determine -repMatch for new species, first scale 1024 by size 
    # relative to human (e.g. divide ornAna1 / hg18 gapless bases from 
    # featureBits).  That gives ~607, rounding up to 608.
    ssh kolossus
    blat /cluster/data/ornAna1/ornAna1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/ornAna1/11.ooc -repMatch=608
#Wrote 35954 overused 11-mers to /cluster/bluearc/ornAna1/11.ooc


#########################################################################
# GENBANK AUTO UPDATE (DONE 1/5/07 angie - REDONE 2/6/07)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add ornAna1 just after dm2...
    # (the unindented stanza below is the genbank.conf text that was added)

# ornAna1 (Platypus -- O. anatinus)
ornAna1.serverGenome = /cluster/data/ornAna1/ornAna1.2bit
ornAna1.clusterGenome = /iscratch/i/ornAna1/ornAna1.2bit
ornAna1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
ornAna1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
ornAna1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
ornAna1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
ornAna1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
ornAna1.ooc = /cluster/bluearc/ornAna1/11.ooc
ornAna1.lift = no
ornAna1.genbank.mrna.xeno.load = yes
ornAna1.downloadDir = ornAna1
ornAna1.perChromTables = no

    cvs ci -m "Added ornAna1." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Edit src/lib/gbGenome.c to add new species.
    cvs ci -m "Added O. anatinus (platypus)." src/lib/gbGenome.c
    make install-server

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial ornAna1 &
    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad ornAna1 &

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add ornAna1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added ornAna1." etc/align.dbs etc/hgwdev.dbs
    make etc-update


#########################################################################
# SWAP HG18 CHAIN/NET (DONE 4/9/07 angie)
# Each SWAP section below reuses an existing blastz run done on the other
# species' side, swapping target/query so ornAna1 becomes the target.
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/blastz.hg18.swap
    cd /cluster/data/ornAna1/bed/blastz.hg18.swap
    doBlastzChainNet.pl -swap \
      /cluster/data/hg18/bed/blastz.ornAna1.2007-02-21/DEF >& do.log &
    tail -f do.log
    ln -s blastz.hg18.swap /cluster/data/ornAna1/bed/blastz.hg18


#########################################################################
# SWAP MM8 CHAIN/NET (DONE 3/2/07 angie)
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/blastz.mm8.swap
    cd /cluster/data/ornAna1/bed/blastz.mm8.swap
    doBlastzChainNet.pl -swap \
      /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27/DEF >& do.log &
    tail -f do.log
    ln -s blastz.mm8.swap /cluster/data/ornAna1/bed/blastz.mm8


#########################################################################
# SWAP GALGAL3 CHAIN/NET (DONE 4/9/07 angie)
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/blastz.galGal3.swap
    cd /cluster/data/ornAna1/bed/blastz.galGal3.swap
    doBlastzChainNet.pl -swap \
      /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06/DEF >& do.log &
    tail -f do.log
    ln -s blastz.galGal3.swap /cluster/data/ornAna1/bed/blastz.galGal3


#########################################################################
# SWAP MONDOM4 CHAIN/NET (DONE 4/3/07 angie)
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/blastz.monDom4.swap
    cd /cluster/data/ornAna1/bed/blastz.monDom4.swap
    doBlastzChainNet.pl -swap -workhorse=kkr3u00 \
      /cluster/data/monDom4/bed/blastz.ornAna1/DEF >& do.log &
    tail -f do.log
    ln -s blastz.monDom4.swap /cluster/data/ornAna1/bed/blastz.monDom4


#######################################################################
# BLASTZ LIZARD ANOCAR1 (DONE 4/18/07 angie)
# Direct blastz run (not a swap): ornAna1 target vs. anoCar1 query.
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/blastz.anoCar1.2007-04-03
    cd /cluster/data/ornAna1/bed/blastz.anoCar1.2007-04-03
    cat << '_EOF_' > DEF
# lizard vs. platypus
# (comment fixed; DEF originally said "chicken vs. platypus", a stale
# copy-paste from the galGal3 run)

# Use "human-fish" params.
# If that causes us to get swamped, we can increase L.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Platypus ornAna1
SEQ1_DIR=/san/sanvol1/scratch/ornAna1/ornAna1.2bit
SEQ1_LEN=/san/sanvol1/scratch/ornAna1/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LIMIT=100
SEQ1_LAP=10000

# QUERY: Lizard anoCar1
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/cluster/data/ornAna1/bed/blastz.anoCar1.2007-04-03
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk -smallClusterHub pk \
      >& do.log & tail -f do.log
    ln -s blastz.anoCar1.2007-04-03 /cluster/data/ornAna1/bed/blastz.anoCar1


#########################################################################
# UPDATE DEFAULT POSITION (DONE 4/30/07 angie)
# This is a fairly well-conserved region of chrX5 with homology to
# chicken chrZ and an intronEst (platypus GenBank seqs are rare!)...
# also has an aligned human protein.
# Sets the browser's default view position in the central dbDb table.
    ssh hgwdev
    hgsql hgcentraltest -e \
      'update dbDb set defaultPos = "chrX5:26250001-26320000" where \
       name = "ornAna1";'


#########################################################################
# REPEATMASKER BETA TEST (DONE 4/19/07 angie -- REDONE 7/20/07)
    # 7/17/07: Trying out development lib version.
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    ~/kent/src/hg/utils/automation/doRepeatMasker.pl ornAna1 -verbose 3 -debug
    # Run it for real and tail the log:
    cd /cluster/data/ornAna1/bed/RepeatMasker.2007-07-17
    ~/kent/src/hg/utils/automation/doRepeatMasker.pl ornAna1 -verbose 3 \
      -stop mask \
      >& do.log & tail -f do.log
    # RepeatMasker and lib version from do.log:
#grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker.20070717-dev/RepeatMasker
#    May 17 2007 (open-3-1-8) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker.20070717-dev/Libraries/RepeatMaskerLib.embl
#CC   RELEASE 20070717;                                            *
    # Now load the new files as table rmskBeta for comparison:
    ssh hgwdev
    cd /cluster/data/ornAna1/bed/RepeatMasker.2007-07-17
    hgLoadOut -table=rmskBeta -nosplit ornAna1 ornAna1.fa.out
    # Coverage:
    featureBits ornAna1 rmskBeta
#882976774 bases of 1842236818 (47.930%) in intersection
    # wow!  big increase from the initial rmsk:
#633511004 bases of 1842236818 (34.388%) in intersection
    hgLoadBed ornAna1 nestedRepeats ornAna1.nestedRepeats.bed \
      -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql
    # Robert Hubley and Arian looked at it & approved for release.
    # Continue the doRepeatMasker run so that the rmsk table is reloaded.
    ~/kent/src/hg/utils/automation/doRepeatMasker.pl ornAna1 -verbose 3 \
      -continue load \
      -buildDir /cluster/data/ornAna1/bed/RepeatMasker.2007-07-17 \
      >& do.log & tail -f do.log
    # Once rmsk is reloaded, the comparison table is no longer needed:
    hgsql ornAna1 -e 'drop table rmskBeta'


#########################################################################
# ENSEMBL GENES (DONE 4/26/07 angie)
    ssh hgwdev
    mkdir /cluster/data/ornAna1/bed/ensembl
    cd /cluster/data/ornAna1/bed/ensembl
    # Go to www.ensembl.org and click around their evolving interface 
    # to get the following types of data:
    # 1. a tab-separated file that relates Ensembl gene, transcript and
    #    protein IDs.  Save as ensGtp.txt.gz  
    # 2. peptide fasta with only transcript IDs in header -> ensPep.fa.gz

    # They have started dumping GTF files so we can download that directly:
    wget 'ftp://ftp.ensembl.org/pub/current_ornithorhynchus_anatinus/data/gtf/*'

    # Add "chr" to chrom names in the gene data gtf file to make 
    # it compatible with our software.
    gunzip -c Ornithorhynchus_anatinus.OANA5.44.gtf.gz \
    | perl -wpe 's/^([\dX]+)/chr$1/; s/^MT/chrM/;' \
    > ensGene.gtf
    ldHgGene -gtf -genePredExt ornAna1 ensGene \
      /cluster/data/ornAna1/bed/ensembl/ensGene.gtf
    nice featureBits ornAna1 ensGene
#22058521 bases of 1842236818 (1.197%) in intersection

    # strip header line:
    # (note: "tail +2" is the old-style spelling of "tail -n +2")
    gunzip -c ensGtp.txt.gz | tail +2 > ensGtp.txt
    hgLoadSqlTab ornAna1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt

    gunzip ensPep.fa.gz
    hgPepPred ornAna1 generic ensPep ensPep.fa


#########################################################################
# MULTIZ6WAY (DONE 4/25/07 angie)
# (header originally said MULTIZ7WAY -- this is a 6-species alignment,
# consistent with everything below)
# ((ornAna1 (monDom4 (hg18 mm8))) (galGal3 anoCar1))
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24

    # Prune the hg18 28way tree to just these 6:
    /cluster/bin/phast/tree_doctor \
      --prune-all-but=ornAna1,monDom4,hg18,mm8,galGal3,anoCar1 \
      /cluster/data/hg18/bed/multiz28way.2007-03-20/cons/28way.mod \
    | awk '$1 == "TREE:" {print $2;}' \
      > 6way.nh
    # *carefully* edit 6way.nh to put ornAna1 first.
    # create species list and stripped down tree for autoMZ
    sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 6way.nh \
      > tree-commas.nh
    sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
    sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst

    # Largest ornAna1 seq size is ~60M, so no need to split sequences (whew):
    awk '{print $2;}' ../../chrom.sizes | sort -nr | head -1
#59581953
    wc -l ../../chrom.sizes
#201523 ../../chrom.sizes

    # Split MAFs by sequence onto cluster-friendly server.
    # Since there are 200k scaffolds, and each other species may have 
    # alignments to a different subset, use the new -useHashedName both to 
    # reduce the number of split files and to ensure consistent split 
    # file names across all other species.
    mkdir /san/sanvol1/scratch/ornAna1/mafNet
    foreach s (monDom4 hg18 mm8 galGal3 anoCar1)
      echo $s
      time nice mafSplit -byTarget -useHashedName=10 \
        dummyArg /san/sanvol1/scratch/ornAna1/mafNet/$s/split \
        /cluster/data/ornAna1/bed/blastz.$s/mafNet/*
    end

    ssh pk
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24
    mkdir maf run
    cd run

    # stash binaries 
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

    # Per-split-file worker script: gather each species' pairwise maf for
    # this split (or an empty placeholder maf when a species has none),
    # then run autoMZ on local /scratch and copy the result back.
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
    set db = ornAna1
    set c = $1
    set mafOut = $2
    set run = `pwd`
    set tmp = /scratch/tmp/$db/multiz.$c
    set pairs = /san/sanvol1/scratch/$db/mafNet
    rm -fr $tmp
    mkdir -p $tmp
    cp ../{tree.nh,species.lst} $tmp
    pushd $tmp
    foreach s (`cat species.lst`)
        if ($s == $db) then
            continue
        endif
        set clusterMaf = $pairs/$s/$c.maf
        set localMaf = $db.$s.sing.maf
        if (-e $clusterMaf) then
            cp $clusterMaf $localMaf
        else
            echo "##maf version=1 scoring=autoMZ" > $localMaf
        endif
    end
    set path = ($run/penn $path); rehash
    $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $mafOut
    rm -fr $tmp
'EOF'
    # << emacs
    chmod +x autoMultiz.csh

cat  << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/ornAna1/bed/multiz6way.2007-04-24/maf/$(root1).maf}
#ENDLOOP
'EOF'
    # << emacs

    # List the hashed split-file basenames (split000..split1023, matching
    # -useHashedName=10 => 2^10 files) created by mafSplit above:
    perl -we 'for ($i=0; $i<1024; $i++) { printf "split%03d\n", $i; }' \
      > split.lst

    gensub2 split.lst single spec jobList
    para make jobList
    para time
#Completed: 1024 of 1024 jobs
#CPU time in finished jobs:      10728s     178.80m     2.98h    0.12d  0.000 y
#IO & Wait Time:                  3278s      54.63m     0.91h    0.04d  0.000 y
#Average job time:                  14s       0.23m     0.00h    0.00d
#Longest finished job:             330s       5.50m     0.09h    0.00d
#Submission to last job:           479s       7.98m     0.13h    0.01d


#########################################################################
# ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE 4/25/2007 angie)
# Add iRows (gap/N annotation) to the multiz mafs, consolidate to one
# file, then load the multiz6way and multiz6waySummary tables.
    ssh kolossus
    mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno
    mkdir maf run
    cd run
    rm -f sizes nBeds
    # Build (if needed) and collect each species' N-gap bed for mafAddIRows:
    foreach db (`cat /cluster/data/ornAna1/bed/multiz6way.2007-04-24/species.lst`)
      if (! -e /cluster/data/$db/$db.N.bed) then
        twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
      endif
      ln -s  /cluster/data/$db/$db.N.bed $db.bed
      echo $db.bed  >> nBeds
    end

    ssh kki
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno/run
    cp /dev/null jobList
    foreach f (/cluster/data/ornAna1/bed/multiz6way.2007-04-24/maf/*)
      set maf = $f:t
      echo mafAddIRows -nBeds=nBeds $f \
        /iscratch/i/ornAna1/ornAna1.2bit ../maf/$maf >> jobList
    end
    para make jobList
    para time
#Completed: 1024 of 1024 jobs
#CPU time in finished jobs:       2571s      42.85m     0.71h    0.03d  0.000 y
#IO & Wait Time:                  2745s      45.75m     0.76h    0.03d  0.000 y
#Average job time:                   5s       0.09m     0.00h    0.00d
#Longest finished job:              42s       0.70m     0.01h    0.00d
#Submission to last job:           447s       7.45m     0.12h    0.01d

    # Consolidate multi-level maf to monolithic file
    # No need to sort chunks by position because the maf chunk size is greater 
    # than the largest scaffold size.  That may not be true in other 
    # assemblies.
    ssh kkstore05
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno
    cat maf/*.maf >> ornAna1.maf
 
   mafFilter -overlap -reject=rf ornAna1.maf > ornAna1.mf.maf
#rejected minRow: 66
#total rejected: 66 blocks
    # mafFilter rejected 66 single-row (ornAna1 only) blocks due to 
    # its default minRow of 2.  That's reasonable, so replace the original 
    # with the filtered version (and load the db tables w/filtered).
    gzip -c ornAna1.maf > ornAna1.preFilter.maf.gz
    mv ornAna1.mf.maf ornAna1.maf

    # Load annotated maf
    ssh hgwdev
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno
    mkdir -p /gbdb/ornAna1/multiz6way/anno
    ln -s /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno/ornAna1.maf \
      /gbdb/ornAna1/multiz6way/anno/
    time nice hgLoadMaf -pathPrefix=/gbdb/ornAna1/multiz6way/anno ornAna1 multiz6way
#Loaded 2665247 mafs in 1 files from /gbdb/ornAna1/multiz6way/anno
#60.126u 5.767s 1:22.72 79.6%    0+0k 0+0io 0pf+0w

    # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
    # machine and then load on hgwdev:
    ssh kolossus
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno
    time nice hgLoadMafSummary ornAna1 -minSize=30000 -mergeGap=1500 \
             -maxSize=200000 -test multiz6waySummary ornAna1.maf
#Created 1130326 summary blocks from 5153951 components and 2665247 mafs from ornAna1.maf
#158.600u 10.810s 2:50.16 99.5%  0+0k 0+0io 4pf+0w

    ssh hgwdev
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno
    sed -e 's/mafSummary/multiz6waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
      > /tmp/multiz6waySummary.sql
    time nice hgLoadSqlTab ornAna1 multiz6waySummary \
      /tmp/multiz6waySummary.sql multiz6waySummary.tab
#0.002u 0.000s 0:24.69 0.0%      0+0k 0+0io 4pf+0w
    rm *.tab
    ln -s multiz6way.2007-04-24 /cluster/data/ornAna1/bed/multiz6way

#########################################################################
# Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram)
    # edit src/hg/makeDb/genbank/genbank.conf to add:
    # (unindented lines below are the genbank.conf settings that were added)
ornAna1.upstreamGeneTbl = ensGene
ornAna1.upstreamMaf = multiz6way /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/species.lst

#########################################################################
# MULTIZ6WAY DOWNLOADABLES (DONE 4/26/2007 angie)
    ssh hgwdev
    mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads

    # upstream mafs... not sure how good these will be since I expect 
    # a lot of genes are fragmented on contigs, so upstream may not 
    # be upstream, but worth a try...
    # The awk below rewrites featureBits' bed output to 6 columns;
    # substr($4, 0, 9) apparently shortens the item name to the gene-id
    # prefix -- TODO confirm the intended name length (awk substr is
    # 1-based, so a 0 start yields one character fewer than expected).
cat > mafFrags.csh << 'EOF'
    date
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        nice featureBits ornAna1 ensGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        nice mafFrags ornAna1 multiz6way up.bed upstream$i.maf \
                -orgs=../species.lst
        rm up.bed
    end
    date
'EOF'
    # << emacs
    time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#152.641u 50.687s 5:40.89 59.6%  0+0k 0+0io 0pf+0w

    # Make a gzipped version of the monolithic annotated maf file:
    ssh kolossus
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24
    time nice gzip -c anno/ornAna1.maf > mafDownloads/ornAna1.maf.gz
#623.839u 5.221s 10:29.86 99.8%  0+0k 0+0io 1pf+0w

    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads
    time nice gzip up*.maf
#0.849u 0.094s 0:01.02 91.1%     0+0k 0+0io 0pf+0w
    time nice md5sum *.gz > md5sum.txt
#3.246u 0.574s 0:03.84 99.2%     0+0k 0+0io 0pf+0w

    # Stage the download files under the web tree:
    ssh hgwdev
    set dir = /usr/local/apache/htdocs/goldenPath/ornAna1/multiz6way
    mkdir $dir
    ln -s /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads/{*.gz,md5sum.txt} $dir
    cp /usr/local/apache/htdocs/goldenPath/xenTro2/multiz7way/README.txt $dir
    # edit README.txt


#########################################################################
# MULTIZ6WAY MAF FRAMES (DONE 4/26/2007 angie)
# this was redone 2011-05-12 see below - Hiram
    ssh hgwdev
    mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/frames
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/frames
    mkdir genes

    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred

    # using ensGene for ornAna1
    # no genes for monDom4, anoCar1
    # using knownGene for mm8 hg18
    # using refGene for galGal3
    # genePreds (name specific fields to get around bin, knownGene extras)
    foreach queryDb (ornAna1 mm8 hg18 galGal3)
      if ($queryDb == "ornAna1") then
        set geneTbl = ensGene
      else if ($queryDb == "galGal3") then
        set geneTbl = refGene
      else
        set geneTbl = knownGene
      endif
      hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$queryDb.tmp.gz
      mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
    end

    #------------------------------------------------------------------------
    # create frames
    ssh kolossus
    set multizDir = /cluster/data/ornAna1/bed/multiz6way.2007-04-24
    set mafDir = $multizDir/mafDownloads
    set geneDir = $multizDir/frames/genes
    cd $multizDir/frames
    genePredToMafFrames ornAna1 $mafDir/ornAna1.maf.gz ornAna1.mafFrames \
      ornAna1 genes/ornAna1.gp.gz \
      hg18 genes/hg18.gp.gz \
      mm8 genes/mm8.gp.gz \
      galGal3 genes/galGal3.gp.gz
    gzip ornAna1.mafFrames

    #------------------------------------------------------------------------
    # load the database
    ssh hgwdev
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/frames
    hgLoadMafFrames ornAna1 multiz6wayFrames ornAna1.mafFrames.gz


#########################################################################
# PHASTCONS (DONE 4/27/2007 angie)
# Using Kate's process from makeHg17.doc.
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 16way track.  Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model 
# from the (37-way) ENCODE alignments.

    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons
    # Prune 28-way model to just our 6 species:
    /cluster/bin/phast/tree_doctor \
      --prune-all-but=ornAna1,monDom4,hg18,mm8,galGal3,anoCar1 \
      /cluster/data/hg18/bed/multiz28way.2007-03-20/cons/28way.mod \
      > 6way.mod

    # Split MAF into windows and use to generate "sufficient statistics" 
    # (ss) files for phastCons input.
    # msa_split (and the .ss format) can't deal with more than one
    # target sequence in a .maf file.  So unfortunately we will need
    # to split the 1k files further into tens of thousands of
    # per-sequence files.
    # This could be a small cluster job, especially with the mafSplit in the 
    # loop, but there would still be a lot of I/O so just do it all on the
    # fileserver...

    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons
    cat > doSplitOnFileserver.csh << '_EOF_'
#!/bin/csh -fex
# Split the per-chunk multiz mafs into per-target-sequence mafs and convert
# each to phastCons "sufficient statistics" (.ss) windows under $WINDOWS.
set WINDOWS = /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss
# this would be /scratch/tmp but kkstore05 won't let me create it:
set tmpDir = /tmp
rm -fr $WINDOWS
mkdir -p $WINDOWS
date
foreach f (/cluster/data/ornAna1/bed/multiz6way.2007-04-24/maf/*.maf)
  # skip the maf files that have only comments -- those crash msa_split:
  if (`grep -v ^\# $f | head | wc -l`) then
    set c = $f:t:r
    mkdir $WINDOWS/$c
    rm -rf $tmpDir/$c
    mkdir $tmpDir/$c
    pushd $tmpDir/$c
    # msa_split (and the .ss format) handles only one target sequence per
    # maf, so first split this chunk by target sequence:
    mafSplit dummyArg -byTarget tmp $f
    foreach t (tmp*.maf)
      # target sequence name, taken from the first ornAna1 "s" line:
      set seq = `grep '^s ornAna1\.' $t | head -1 | perl -wpe 's/^s ornAna1.(\w+)\s.*$/$1/ || die;'`
      twoBitToFa /cluster/data/ornAna1/ornAna1.2bit -seq=$seq $seq.fa
      # 10Mb windows, no overlap (-w 10000000,0); -I/-B tune the split
      # points -- see msa_split usage for details:
      /cluster/bin/phast/$MACHTYPE/msa_split $t -i MAF \
         -M $seq.fa \
         -o SS -r $WINDOWS/$c/$seq -w 10000000,0 -I 1000 -B 5000
    end
    popd
    rm -r $tmpDir/$c
  endif
end
date
'_EOF_'
    # << emacs
    chmod a+x doSplitOnFileserver.csh
    nice ./doSplitOnFileserver.csh >& split.log & tail -f split.log
    # That took 2:45... should be a small cluster job next time.

    # check tree model on a single chunk, using params recommended by Adam,
    # (to verify branch lengths on 2X species -- though we aren't using any 
    #  of those here)
    ssh kolossus
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
        --tree "`cat ../tree-commas.nh`" \
        /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss/split873/chr3.30000001-40000000.ss \
        -o phyloFit.tree
    # The numbers in general are a lot lower... I'll chalk it up to 
    # bias of what can be aligned and go ahead with the human 28-way 
    # which was generated w/more data.

    # Run phastCons
    mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons
    cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
# Run phastCons on one .ss chunk, as a single cluster job.
# Args: 1 = subdirectory under ss/, 2 = chunk file basename,
#       3 = --expected-length, 4 = --target-coverage, 5 = --rho
set d = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
# work on node-local disk to keep the heavy I/O off the san:
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons
cp -p $san/ss/$d/$f.ss ../6way.mod $tmp
pushd $tmp > /dev/null
# strip two extensions (e.g. chr3.30000001-40000000.ss) to get the seq name:
set c = $f:r:r
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss 6way.mod \
  --rho $rho --expected-length $len --target-coverage $cov --quiet \
  --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$d $san/bed/$d
# NOTE(review): the sleep presumably lets the filesystem settle before the
# moves -- TODO confirm it is still needed
sleep 1
mv $tmp/$f.pp $san/pp/$d
mv $tmp/$f.bed $san/bed/$d
rm -fr $tmp
'EOF'
    # << emacs
    chmod a+x doPhast.csh

    /cluster/bin/phast/$MACHTYPE/phastCons --estimate-rho /tmp/estimatedRho.mod \
      --target-coverage 0.005 --expected-length 12 --no-post-probs \
        /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss/split873/chr3.30000001-40000000.ss \
      ../6way.mod
#(rho = 0.248574)
# for target-coverage 0.1: (rho = 0.229248)

    # Create gsub file
    cat > template << 'EOF'
#LOOP
doPhast.csh $(dir1) $(file1) 12 .005 .26
#ENDLOOP
'EOF'
    # << emacs

    # Create parasol batch and run it
    ssh pk
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons
    pushd /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss
    cp /dev/null /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons/in.list
    foreach d (*)
      ls -1S $d/*.ss | sed 's/.ss$//' \
        >> /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons/in.list
    end
    popd

    gensub2 in.list single template jobList
    para make jobList
    para time
#Completed: 35180 of 35180 jobs
#CPU time in finished jobs:       4947s      82.45m     1.37h    0.06d  0.000 y
#IO & Wait Time:                128549s    2142.48m    35.71h    1.49d  0.004 y
#Average job time:                   4s       0.06m     0.00h    0.00d
#Longest finished job:             148s       2.47m     0.04h    0.00d
#Submission to last job:           527s       8.78m     0.15h    0.01d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons
    #	The sed's and the sort get the file names in chrom,start order
    # (Hiram tricks -- split into columns on [.-/] with 
    #    identifying x,y,z, to allow column sorting and
    #    restoring the filename.  Warning: the sort column
    # will depend on how deep you are in the dir
    find ./bed -name "*.bed" \
    | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
    | sort -k7,7 -k9,9n \
    | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
    | xargs cat \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements6way.bed
    cp -p phastConsElements6way.bed /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons

    # Measure coverage.  If good, load elements into database and proceed with wiggle.
    # Try for somewhere in the neighborhood of 5% overall cov, and 70% CDS cov.
    # (Jim tried for 4% overall in xenTro1)
    ssh hgwdev
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons
    featureBits -chrom=chr1 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed
    featureBits -chrom=chr3 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed
    featureBits -chrom=chr17 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed
    featureBits -chrom=Ultra1 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed

    # FIRST ITERATION: 6way.mod, (len cov rho) = (12 .005 .26)
#ensGene:cds 0.762%, phastConsElements6way.bed 5.003%, both 0.604%, cover 79.35%, enrich 15.86x
#ensGene:cds 0.822%, phastConsElements6way.bed 4.681%, both 0.658%, cover 80.11%, enrich 17.11x
#ensGene:cds 2.775%, phastConsElements6way.bed 9.169%, both 1.663%, cover 59.92%, enrich 6.53x
#ensGene:cds 0.241%, phastConsElements6way.bed 3.012%, both 0.171%, cover 70.84%, enrich 23.52x
    # Close enough!  Let's see how it looks in the browser.
    mv phastConsElements6way.bed phastConsElements6way_12_005_26.bed

    # When happy:
    hgLoadBed ornAna1 phastConsElements6way phastConsElements6way_12_005_26.bed

    # Create merged posterior probability file and wiggle track data files
    ssh kolossus
    cd /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    time find ./pp -name "*.pp" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
        sort -k7,7 -k9,9n | \
        sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
        xargs cat | \
        nice wigEncode -noOverlap stdin phastCons6way.wig phastCons6way.wib
    cp -p phastCons6way.wi? \
      /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons
    ln -s `pwd`/phastCons6way.wib /gbdb/ornAna1/multiz6way/
    hgLoadWiggle -pathPrefix=/gbdb/ornAna1/multiz6way ornAna1 \
        phastCons6way phastCons6way.wig

    # Tally up tree distances.
    ssh hgwdev
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24
    tail -1 phastCons/phyloFit.tree.mod | sed -e 's/^TREE: //' \
      > 6way.phyloFit.nh

    /cluster/bin/phast/all_dists 6way.nh > 6way.distances.txt
    grep ornAna1 6way.distances.txt | sort -k3,3n | \
        awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
    cat distances.txt
#0.8975  monDom4
#0.9673  hg18
#1.0957  galGal3
#1.1659  mm8
#1.2146  anoCar1
    # the order in the browser display will be by tree topology,
    # not by distance.

    # Just for reference, these were the distances from phyloFit.tree.mod...
    # Quite different!
#0.5175  monDom4
#0.5186  galGal3
#0.5363  hg18
#0.5544  anoCar1
#0.6066  mm8

    # Make .gif for tree, check in to browser/images and install in
    # htdocs/images/phylo/... don't forget to request a push of that
    # file.  The treeImage setting in trackDb.ra is
    # phylo/ornAna1_6way.gif (relative to htdocs/images).  
    /cluster/bin/phast/tree_doctor \
      --rename="ornAna1 -> platypus ; hg18 -> human ; mm8 -> mouse ; monDom4 -> opossum; galGal3 -> chicken;  anoCar1 -> lizard;" \
      6way.nh > 6way_species.nh
    /data/apache/cgi-bin/phyloGif -phyloGif_tree=6way_species.nh \
      -phyloGif_width=260 -phyloGif_height=260 \
      > ornAna1_6way.gif
    # If you haven't already, check out the browser CVS tree in your ~/:
    # (cd; cvs co -d hgwdev:/projects/hg/cvsroot browser)
    cp ornAna1_6way.gif ~/browser/images/phylo/
    cd ~/browser/images/phylo
    cvs add ornAna1_6way.gif
    # note:  markd says it is best to use "cvs add -kb" for binary files - b0b
    cvs ci ornAna1_6way.gif
    cd ../..
    cvsup
    make alpha


#########################################################################
# PHASTCONS SCORES DOWNLOADABLES FOR 6WAY (DONE 5/1/07 angie)
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads
    set ppDir = /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/pp
    set dDir = `pwd`
    # Some maf's were split into .ss chunks, so make sure to cat them in order.
    cp /dev/null ornAna1.pp
    foreach d ($ppDir/split*)
      pushd $d >& /dev/null
      ls -1 *.pp \
      | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
      | sort -k1,1 -k3,3n \
      | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
      | xargs cat >> $dDir/ornAna1.pp
      popd >& /dev/null
    end
    nice gzip ornAna1.pp
    md5sum ornAna1.pp.gz > md5sum.txt

    ssh hgwdev
    cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads
    set dir = /usr/local/apache/htdocs/goldenPath/ornAna1/phastCons6way
    mkdir $dir
    ln -s /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads/{*.gz,md5sum.txt} $dir
    cp /usr/local/apache/htdocs/goldenPath/xenTro2/phastCons7way/README.txt $dir
    # edit README.txt
    # Clean up after phastCons run.
    ssh kkstore05
    rm /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/*.tab
    rm -r /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons


#########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-28)
    ssh kkstore05
    bash 

    mkdir /cluster/data/ornAna1/blastDb
    cd /cluster/data/ornAna1
    ls downloads/*.fa M/*.fa | grep -v chrUn > temp.lst
    cat `cat temp.lst` > temp.fa
    faSplit gap temp.fa 10000000 blastDb/x -lift=blastDb.lft
    faSplit sequence downloads/chrUn.fa 100 blastDb/un
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa *.log

    mkdir -p /san/sanvol1/scratch/ornAna1/blastDb
    cd /cluster/data/ornAna1/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/ornAna1/blastDb
    done

    mkdir -p /cluster/data/ornAna1/bed/tblastn.hg18KG
    cd /cluster/data/ornAna1/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/ornAna1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 154 query.lst

   # we want around 250000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(250000/154) = 22.623832

   mkdir -p /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/kgfa
   split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/ornAna1/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
# tblastn one protein-query chunk ($2) against one blast DB chunk ($1),
# producing a lifted, checked psl at $3.  Tries progressively stricter
# e-value cutoffs until blastall succeeds -- presumably so chunks that
# fail at loose settings still produce output; TODO confirm rationale.
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	# lift target coords from blastDb chunk space to chrom space, then
	# lift query coords through the protein lift file:
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/ornAna1/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3

        # only publish the result if it passes pslCheck:
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0 
    fi
fi 
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit # back to bash
    
    ssh pk
    cd /cluster/data/ornAna1/bed/tblastn.hg18KG
    para create blastSpec

    para try, push, etc.

# Completed: 94402 of 94402 jobs
# CPU time in finished jobs:   16325847s  272097.44m  4534.96h  188.96d  0.518 y
# IO & Wait Time:                603783s   10063.06m   167.72h    6.99d  0.019 y
# Average job time:                 179s       2.99m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1411s      23.52m     0.39h    0.02d
# Submission to last job:         75065s    1251.08m    20.85h    0.87d

    ssh kkstore05
    cd /cluster/data/ornAna1/bed/tblastn.hg18KG

    tcsh
    mkdir chainRun
    cd chainRun
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh pk
    cd /cluster/data/ornAna1/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.

# Completed: 613 of 613 jobs
# CPU time in finished jobs:       5247s      87.45m     1.46h    0.06d  0.000 y
# IO & Wait Time:                 61655s    1027.58m    17.13h    0.71d  0.002 y
# Average job time:                 109s       1.82m     0.03h    0.00d
# Longest finished job:             305s       5.08m     0.08h    0.00d
# Submission to last job:          2287s      38.12m     0.64h    0.03d

    ssh kkstore05
    cd /cluster/data/ornAna1/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/ornAna1/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/ornAna1/bed/tblastn.hg18KG
    hgLoadPsl ornAna1 blastHg18KG.psl

    # check coverage
    nice featureBits -noRandom  ornAna1 blastHg18KG
# 16607325 bases of 1842236818 (0.901%) in intersection

    nice featureBits -noRandom  -enrichment ornAna1 ensGene:cds  blastHg18KG
# ensGene:cds 1.193%, blastHg18KG 0.901%, both 0.763%, cover 63.91%, enrich 70.89x
    ssh kkstore05
    rm -rf /cluster/data/ornAna1/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut
#end tblastn


#########################################################################
# UPDATE ASSEMBLY VERSION STRING AND VERIFY SAME DATA (DONE 5/7/07 angie)
# WUSTL's platypus project website now calls the assembly rev Mar. 2007,
# 5.0.1 (as opposed to 5.0 or 6.0) so follow suit:
    ssh hgwdev
    hgsql hgcentraltest -e \
      'update dbDb set description = "Mar. 2007" where name = "ornAna1";'
    hgsql hgcentraltest -e \
      'update dbDb set sourceName = "WUSTL version 5.0.1" where name = "ornAna1";'
    # We got the data in Feb. not Mar., so I redownloaded files from 
    # http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/
    # and verified it's the same seq/agp.
    ssh kolossus
    mkdir /tmp/checkOrnAna1
    cd /tmp/checkOrnAna1
    wget http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/chromosomes.agp.gz
    wget http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/chromosomes.fa.gz
    gunzip chr*gz

    # Compare AGP:
    wc -l /cluster/data/ornAna1/ornAna1.agp 
#689178 /cluster/data/ornAna1/ornAna1.agp
    wc -l chr*.agp
#689178 chromosomes.agp
    sort -k 1,1 -k 2n,2n /cluster/data/ornAna1/ornAna1.agp > tmp1
    sort -k 1,1 -k 2n,2n chromosomes.agp > tmp2
    cmp tmp1 tmp2
    # No output -- good, no change!!

    # Compare FASTA:
    faSize chromosomes.fa
#871323289 bases (58126753 N's 813196536 real 813196535 upper 1 lower) in 125840 sequences in 1 files
#Total size: mean 6924.1 sd 381372.6 min 108 (Contig271463) max 59581953 (chr3) median 894
    # OK, that only has chroms and Contigs, not Ultras!
    wget http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/supercontigs.fa.gz
    gunzip su*.gz
#1996203893 bases (153985390 N's 1842218503 real 1842218503 upper 0 lower) in 205536 sequences in 1 files
    # The "1842218503 real" base count matches expected.

    # Neither downloaded fasta file has Ultra497... but it would be 
    # reconstructed from AGP to be the same as what we have.  
    # We can compare chr5's affected region:
    echo chr5 > list
    faSomeRecords chromosomes.fa list stdout | faFrag stdin 647408 657392 tmp1
    twoBitToFa /cluster/data/ornAna1/ornAna1.2bit \
      -seq=chr5 -start=647408 -end=657392 tmp2
    faCmp tmp1 tmp2
#tmp1 and tmp2 are the same

    # Whew!


#########################################################################
# WSSD AND SEGMENTAL DUPLICATIONS (DONE 8/3/07 angie - WSSD UPDATED 9/12/07)
    ssh kkstore05
    mkdir /cluster/data/ornAna1/bed/genomicSuperDups
    cd /cluster/data/ornAna1/bed/genomicSuperDups
    # Unpack file emailed from Ginger Cheng <ginger@gs.washington.edu>
    # in Evan Eichler's lab:
    tar xvzf platypusDUP_track.tar.gz

    ssh hgwdev
    cd /cluster/data/ornAna1/bed/genomicSuperDups
    # 9/12/07: Ginger sent new WSSD file, and said we can filter out 
    # single-base items.
    awk '($3 - $2) > 1 {print;}' SmallContig_wssdGE10K_nogap.tab \
    | sed -e 's/^chr_Ultra/Ultra/' \
    | hgLoadBed ornAna1 wssdCoverage stdin

    awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' genomicSuperDups \
    | hgLoadBed ornAna1 genomicSuperDups stdin \
      -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
    # 8/29/07 Gak!  Kayla found that the strand values were "+" and "_" -- fix:
    hgsql ornAna1 -e 'update genomicSuperDups set strand = "-" where strand = "_";'


###########################################################################
# Blastz Platypus ornAna1 (WORKING - 2007-11-20 - Hiram)
#	This is a second attempt after a blastz of Platypus query to
#	Marmoset target produced no results for some unknown reason.
    ssh kkstore05
    screen # use screen to control this job
    mkdir /cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20
    cd /cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20

    #	this chunking will produce 542,178 jobs
    #	This takes way too long, even on pk, 23 days
    cat << '_EOF_' > DEF
# Platypus vs Marmoset
BLASTZ_M=50
BLASTZ_K=2200
BLASTZ_L=6000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_Y=3400

# Target: Platypus ornAna1, largest chrom 59,581,953, 201,523 sequences
SEQ1_DIR=/cluster/bluearc/scratch/data/ornAna1/ornAna1.2bit
SEQ1_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LIMIT=300
SEQ1_LAP=10000

# QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=5000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-stop=blastz \
	-chainMinScore=5000 -chainLinearGap=loose \
	-bigClusterHub=pk > blastz.log 2>&1 &
    #	real    5494m4.009s
Completed: 541845 of 542178 jobs
Crashed: 333 jobs
CPU time in finished jobs:  575423244s 9590387.40m 159839.79h 6659.99d 18.247 y
IO & Wait Time:               8698684s  144978.06m  2416.30h  100.68d  0.276 y
Average job time:                1078s      17.97m     0.30h    0.01d
Longest finished job:            2887s      48.12m     0.80h    0.03d
Submission to last job:       1991620s   33193.67m   553.23h   23.05d
    #	the crashes are tracking errors
    #	continuing after verifying that all result files are present
    #	testing some changes to the script
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 `pwd`/DEF \
	-continue=cat \
	-chainMinScore=5000 -chainLinearGap=loose \
	-bigClusterHub=pk > cat.log 2>&1 &
    #	real    142m40.246s

    mkdir /cluster/data/calJac1/bed/blastz.ornAna1.swap
    cd /cluster/data/calJac1/bed/blastz.ornAna1.swap
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
	/cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-swap -bigClusterHub=pk > swap.log 2>&1 &
    #	real    40m3.380s
    cat fb.calJac1.chainOrnAna1Link.txt
    #	211526899 bases of 2929139385 (7.221%) in intersection
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
	/cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-continue=cleanup -swap -bigClusterHub=pk > cleanup.log 2>&1 &

###########################################################################

############################################################################
# SWAP ornAna1 -> equCab2 (DONE - 2008-04-29 - larrym)
    mkdir /cluster/data/ornAna1/bed/blastz.equCab2.swap
    cd /cluster/data/ornAna1/bed/blastz.equCab2.swap
    time /bin/nice -n 19 doBlastzChainNet.pl \
        /cluster/data/equCab2/bed/blastz.ornAna1.2008-04-24/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	    -swap -syntenicNet >& do.log &
    0.883u 0.754s 2:33:21.36 0.0%   0+0k 0+0io 16pf+0w

############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
# XENO REFGENE - added xeno refGene (2008-06-03 markd)

add to genbank.conf
    ornAna1.refseq.mrna.xeno.load  = yes
    ornAna1.refseq.mrna.xeno.loadDesc = yes

    ssh genbank
    cd /cluster/data/genbank
    ((./bin/gbAlignStep -initial -srcDb=refseq ornAna1)|&mail markd&)
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.

############################################################################
# QUALITY TRACK (DONE - 2008-11-24 - Hiram)
    mkdir /hive/data/genomes/ornAna1/downloads
    cd /hive/data/genomes/ornAna1/downloads
    qaToQac contigs.fa.qual.gz assembly.qac
    qacAgpLift ../ornAna1.agp assembly.qac scaffolds.qac
    mkdir /hive/data/genomes/ornAna1/bed/qual
    cd /hive/data/genomes/ornAna1/bed/qual
    #	the qac file was created by Rico during 28-way annotations
    qacToWig -fixed ../../downloads/scaffolds.qac stdout \
	| wigEncode stdin qual.wig qual.wib
    ln -s `pwd`/qual.wib /gbdb/ornAna1/wib
    hgLoadWiggle -pathPrefix=/gbdb/ornAna1/wib ornAna1 quality qual.wig

############################################################################
#  ornAna1 - Platypus - Ensembl Genes version 51  (DONE - 2008-12-03 - hiram)
    ssh kolossus
    cd /hive/data/genomes/ornAna1
    cat << '_EOF_' > ornAna1.ensGene.ra
# required db variable
db ornAna1
# optional nameTranslation, the sed command that will transform
#	Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9][0-9]*\)/chr\1/; s/^\(X[0-9]\)/chr\1/; s/^MT/chrM/"
# ignore genes that do not properly convert to a gene pred, and contig
#	names that are not in the UCSC assembly, 365 items
skipInvalid yes
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 ornAna1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/ornAna1/bed/ensGene.51
    featureBits ornAna1 ensGene
    # 24491979 bases of 1842236818 (1.329%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/ornAna1/bed/ensGene.51

############################################################################
# lastz swap from Horse EquCab2 (DONE - 2009-07-02 - Hiram)
    # original lastz run
    cd /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01
    cat fb.equCab2.chainOrnAna1Link.txt 
    #	132750149 bases of 2428790173 (5.466%) in intersection

    mkdir /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap
    cd /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap
    time doBlastzChainNet.pl \
	/hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01/DEF \
	-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
	-swap -workhorse=hgwdev \
	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #	real    115m47.611s
    cat fb.ornAna1.chainEquCab2Link.txt 
    #	132776204 bases of 1842236818 (7.207%) in intersection

############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# lastz Lizard swap anoCar2 (DONE - 2011-04-27 - Hiram)
    # the original alignment
    cd /cluster/data/anoCar2/bed/lastzOrnAna1.2011-04-26
    cat fb.anoCar2.chainOrnAna1Link.txt 
    #	77594222 bases of 1701353770 (4.561%) in intersection

    ## running the swap
    mkdir /hive/data/genomes/ornAna1/bed/blastz.anoCar2.swap
    cd /hive/data/genomes/ornAna1/bed/blastz.anoCar2.swap
    time nice -n +25 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/anoCar2/bed/lastzOrnAna1.2011-04-26/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
	-swap -bigClusterHub=swarm -tRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    40m51.778s
    cat fb.ornAna1.chainAnoCar2Link.txt 
    #	75402821 bases of 1842236818 (4.093%) in intersection

############################################################################
# rerun maf annotation since mafAddIRows can only work on one sequence
#	at a time
    mkdir /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/anno/mafSplit
    cd /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/anno/mafSplit
    #	need to split up the single maf file into individual
    #	per-scaffold maf files to run annotation on
    #	create bed files to list approximately 1000 scaffolds in
    #	a single list, approximately 202 lists
    cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl

# Partition ornAna1 chrom.sizes into BED files of ~1000 scaffolds each
# (file_0.bed, file_1.bed, ...), writing one whole-scaffold BED4 line
# per scaffold, to drive a cluster run of mafsInRegion.

use strict;
use warnings;

my $bedCount = 0;   # index of the bed file currently open for writing
my $i = 0;          # scaffolds seen so far

my $bedFile = sprintf("file_%d.bed", $bedCount);

open (BF,">$bedFile") or die "can not write to $bedFile $!";

open (FH,"</hive/data/genomes/ornAna1/chrom.sizes") or
        die "can not read /hive/data/genomes/ornAna1/chrom.sizes $!";
while (my $line = <FH>) {
    chomp $line;
    # at every 1000th scaffold: echo it as a progress marker and roll
    # over to the next output bed file
    if ( (($i + 1) % 1000) == 0 )  {
        printf "%s\n", $line;
        close (BF);
        ++$bedCount;
        $bedFile = sprintf("file_%d.bed", $bedCount);
        open (BF,">$bedFile") or die "can not write to $bedFile $!";
    }
    ++$i;
    my ($chr, $size) = split('\s+',$line);
    printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BF);   # was close(BH): BH was never opened; BF is the open handle
'_EOF_'
    # << happy emacs
    chmod +x mkBedLists.pl
    ./mkBedLists.pl
    cat *.bed | wc -l
    #	201523
    wc -l /hive/data/genomes/ornAna1/chrom.sizes
    #	201523 /hive/data/genomes/ornAna1/chrom.sizes

    #	now, run a mafsInRegion on each one of those lists
    #	cluster job script: for one bed list (argument $1 = file_N),
    #	split the single 6-way maf into per-scaffold maf files.
    #	Work happens on node-local /scratch/tmp, then results are
    #	rsync'd back, to avoid hammering the shared filesystem.
    #	NOTE(review): runDir points at .../multiz6way (no date suffix)
    #	while the maf lives under multiz6way.2007-04-24 -- presumably
    #	a symlink; confirm before reuse.
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/ornAna1/bed/multiz6way/anno/mafSplit"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/ornAna1/$resultDir
pushd /scratch/tmp/ornAna1/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
    /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/anno/ornAna1.maf
popd
rsync -q -a /scratch/tmp/ornAna1/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/ornAna1/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/ornAna1
'_EOF_'
    # << happy emacs
    chmod +x runOne

    #	gensub2 template: emits one ./runOne job per entry in runList
    #	($(root1) becomes file_N, the bed-list basename)
    cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    # running this on memk with 24 CPUs
    ls file*.bed > runList
    gensub2 runList single template jobList
    para create jobList
    para try ... check ... push ... etc
# Completed: 202 of 202 jobs
# CPU time in finished jobs:       7640s     127.34m     2.12h    0.09d  0.000 y
# IO & Wait Time:                 13270s     221.16m     3.69h    0.15d  0.000 y
# Average job time:                 104s       1.73m     0.03h    0.00d
# Longest finished job:             245s       4.08m     0.07h    0.00d
# Submission to last job:          1071s      17.85m     0.30h    0.01d

    cd /hive/data/genomes/ornAna1/bed/multiz6way/anno/run
    #	for every species in the 6-way alignment, link its gap (N) bed
    #	file and chrom.sizes, and accumulate the nBeds and sizes list
    #	files that mafAddIRows reads in the annotation step below
for DB in `cat ../../species.lst`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done
    # make sure they all are successful symLinks:
    ls -ogrtL

    #	cluster job script: annotate one split maf with iRows.
    #	$1 = input maf path, $2 = subdirectory (file_N), $3 = output maf
    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /hive/data/genomes/ornAna1/ornAna1.2bit $result
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh

    #	gensub2 template: one doAnno.csh job per split maf in maf.list;
    #	the {check out line+ ...} clause makes para verify the output
    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../mafSplit -type f -name "*.maf" > maf.list
    gensub2 maf.list single template jobList
    # slowing this down on swarm, 1 job per node with this memory limit:
    para -ram=8g create jobList
    para try ... check ...  etc.
    para push
# Completed: 83266 of 83266 jobs
# CPU time in finished jobs:      17759s     295.98m     4.93h    0.21d  0.001 y
# IO & Wait Time:                212068s    3534.47m    58.91h    2.45d  0.007 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              19s       0.32m     0.01h    0.00d
# Submission to last job:          1106s      18.43m     0.31h    0.01d

    #	put the results back together into a single file
    cd /hive/data/genomes/ornAna1/bed/multiz6way/anno
    # combine into one file
    head -q -n 1 maf/file_0/chr1.maf > ornAna1.6way.maf
    time find ./maf -type f | while read F
do
    grep -h -v "^#" ${F}
done >> ornAna1.6way.maf
    #	real    23m17.855s
    #	these maf files do not have the end marker, this does nothing:
    #	tail -q -n 1 maf/file_0/chr1.maf >> ornAna1.6way.maf
    # How about an official end marker:
    echo "##eof maf" >> ornAna1.6way.maf

    cd /cluster/data/ornAna1/bed/multiz6way/anno
    mkdir -p /gbdb/ornAna1/multiz6way/anno
    ln -s `pwd`/ornAna1.6way.maf /gbdb/ornAna1/multiz6way/anno/multiz6way.maf

    #	by loading this into the table multiz6way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/ornAna1/multiz6way/anno \
                ornAna1 multiz6way
    #	Loaded 2917036 mafs in 1 files from /gbdb/ornAna1/multiz6way/anno
    #	real    1m17.387s

    #	normally filter this for chrom size > 1,000,000 and only load
    #	those chroms.  But this is a scaffold assembly, load everything:
    time nice -n +19 hgLoadMafSummary ornAna1 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000  multiz6waySummary \
	    /gbdb/ornAna1/multiz6way/anno/multiz6way.maf
    #	Created 572679 summary blocks from 2650531 components
    #	and 2917036 mafs from /gbdb/ornAna1/multiz6way/anno/multiz6way.maf
    #	real    1m13.692s

    #	by loading this into the table multiz6waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz6way*.tab files in this /scratch/tmp directory
    rm multiz6way*.tab

############################################################################
# rerun frames on new annotation and more gene sources (2011-05-12 - Hiram)
    cd /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/frames
# ensGene is good for most
    #	dump ensGene for each species, reduce to single-coverage gene
    #	predictions for genePredToMafFrames; cut -f 2-11 drops what is
    #	presumably a leading bin column -- verify against table schema
    for qDB in ornAna1 anoCar1 galGal3 mm8
do
  echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB}
      hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \
      | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
done

# monDom4 ensGene is an older version with only 10 columns
# (no bin column, so keep fields 1-10 directly)
for qDB in monDom4
do
  echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB}
      hgsql -N -e "select * from ensGene" ${qDB} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
done

# knownGenes for hg18
for qDB in hg18
do
  echo hgsql -N -e \"'select * from 'knownGene\;\" ${qDB}
      hgsql -N -e "select * from knownGene" ${qDB} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
done

    cat ../anno/ornAna1.6way.maf \
	| nice -n +19 genePredToMafFrames ornAna1 stdin stdout \
	    `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.lst` \
		| gzip > ornAna1.mafFrames.gz
    # a couple of minutes

    # verify there are frames on everything:
    zcat ornAna1.mafFrames.gz | awk '{print $4}' | sort | uniq -c
    #	225496 anoCar1
    #	318691 galGal3
    #	428985 hg18
    #	489490 mm8
    #	341401 monDom4
    #	149620 ornAna1

    ssh hgwdev
    cd /hive/data/genomes/ornAna1/bed/multiz6way/frames
    time hgLoadMafFrames ornAna1 multiz6wayFrames ornAna1.mafFrames.gz
    #	real    0m15.994s
#########################################################################
# SWAP BOSTAU6 CHAIN/NET (DONE 2011-05-20 - Chin)
    cd /cluster/data/ornAna1/bed/blastz.bosTau6.swap
    cat fb.ornAna1.chainBosTau6Link.txt
    #   190401000 bases of 1842236818 (10.335%) in intersection
    cd /cluster/data/ornAna1/bed
    ln -s blastz.bosTau6.swap lastz.bosTau6


############################################################################
# running cpgIsland business (DONE 2011-11-22 - Chin)
    mkdir /hive/data/genomes/ornAna1/bed/cpgIsland
    cd /hive/data/genomes/ornAna1/bed/cpgIsland
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    #   needed to fixup this source, adding include to readseq.c:
#include "string.h"
    #   and to cpg_lh.c:
#include "unistd.h"
#include "stdlib.h"
    # and fixing a declaration in cpg_lh.c
    sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
    mv tmp.c cpg_lh.c
    make
    cd ../../
    ln -s hg3rdParty/cpgIslands/cpglh.exe
    mkdir -p hardMaskedFa
    #	extract every sequence from the 2bit and hard-mask it
    #	(lower-case masked bases become N) for cpglh input
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../ornAna1.2bit:$C stdout \
        | maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    ssh swarm
    cd /hive/data/genomes/ornAna1/bed/cpgIsland
    mkdir results
    cut -f1 ../../chrom.sizes > chr.list
    #	gensub2 template: one runOne job per sequence name in chr.list;
    #	{check out exists ...} lets para verify the result file was made
    cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out exists results/$(root1).cpg}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #   the faCount business is to make sure there is enough sequence to
    #   work with in the fasta.  cpglh.exe does not like files with too
    #   many
    #   N's - it gets stuck
    #	runOne: $1 = sequence name, $2 = result file path.
    #	C is the non-N base count ($2 - $7 of faCount output, i.e.
    #	total length minus N count -- confirm against faCount columns);
    #	sequences with <= 200 real bases get an empty result instead of
    #	running cpglh, which hangs on N-heavy input (see note above)
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = `faCount hardMaskedFa/$1.fa | grep -e ^chr -e ^Contig -e ^Ultra | awk '{print  $2 - $7}'`
if ( $C > 200 ) then
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
else
    touch $2
endif
'_EOF_'
    # << happy emacs


    chmod 775 runOne
    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check ... etc
# XXXX 2011-11-28 (in-progress marker)
    para time
    para problems
    para status
    # then, kick it with para push
    # check it with plb
    # when all are done, para time shows:
    # Checking finished jobs
    # Completed: 201523 of 201523 jobs
    # CPU time in finished jobs:   145s       2.42m     0.04h    0.00d  0.000 y
    # IO & Wait Time:           526359s    8772.65m   146.21h    6.09d  0.017 y
    # Average job time:                   3s       0.04m     0.00h    0.00d
    # Longest finished job:              46s       0.77m     0.01h    0.00d
    # Submission to last job:         21512s     358.53m     5.98h    0.25d

    # Transform cpglh output to bed +
    #	convert cpglh's 1-based start to 0-based bed, compute the island
    #	width, and reorder/derive the remaining cpgIslandExt columns
    #	(CpG count, GC count, per-CpG and per-GC percentages, obs/exp);
    #	NOTE(review): exact cpglh column semantics assumed from the
    #	standard cpgIslandExt recipe -- confirm against cpglh output
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    ssh hgwdev
    cd /hive/data/genomes/ornAna1/bed/cpgIsland
    hgLoadBed ornAna1 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    # Loaded 82257 elements of size 10
    # Sorted
    # Creating table definition for cpgIslandExt
    # Saving bed.tab
    # Loading ornAna1

    #   cleanup
    rm -fr hardMaskedFa

############################################################################
