# for emacs: -*- mode: sh; -*-


# This file describes how to make the browser database for the
# worm C. elegans
# 2004-03-30 [DONE, 2004-05-20, hartera]

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +------------------+-------------------------+
#  | tableName        | type                    |
#  +------------------+-------------------------+
#  | sangerGene       | genePred sangerPep      |
#  | sangerGenefinder | genePred                |
#  | refGene          | genePred refPep refMrna |
#  | twinscan         | genePred twinscanPep    |
#  | geneid           | genePred geneidPep      |
#  +------------------+-------------------------+


# DOWNLOAD SEQUENCE (DONE, 2004-03-31, hartera)

    # next machine
    ssh eieio
    mkdir -p /cluster/store5/worm/ce2/sanger
    mkdir -p /cluster/store5/worm/ce2/tmp
    ln -s /cluster/store5/worm/ce2 /cluster/data/ce2
    cd /cluster/data/ce2/sanger

    wget -o ce2.fetch.log -r -l1 --no-directories \
        ftp://ftp.sanger.ac.uk/pub/wormbase/WS120/CHROMOSOMES/
                                                                           
    # This site is now at:                                                          # ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/CHROMOSOMES/

# Takes about five minutes
# This current_release is updated every two weeks.  Although the
#     sequence is quite stable at this time and changes very little.
# Check that you have some valid files. Should be chroms I II III IV V X
#     in dna, gff and agp formats, also Mitochondrial DNA, MtDNA in dna and 
#     gff formats.

    ls -ogrt

# Rename CHROMOSOME_MtDNA files to CHROMOSOME_M and change to 
# "CHROMOSOME_M" inside files 
    chmod u+w CHROMOSOME_M*
    zcat CHROMOSOME_MtDNA.dna.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \
        | gzip > CHROMOSOME_M.dna.gz
    zcat CHROMOSOME_MtDNA.gff.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \
        | gzip > CHROMOSOME_M.gff.gz
    # remove old files
    rm CHROMOSOME_MtDNA* 

# CHROMOSOME_M sequence is identical to X54252.1 in GenBank
# create a .agp file for chrM as there is none and hgGoldGapGl and other 
# programs require a .agp file so create CHROMOSOME_M.agp:
# M       1       13794   1       F       X54252.1	1       13794   +

# translate to unzipped .fa, all upper case, and
# rename the agp files so hgGoldGap can find them
    mkdir sangerFa
    foreach f (I II III IV V X M)
        echo -n "${f} "
        zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \
                sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa
        mkdir -p sangerFa/${f}
        ln -s /cluster/data/ce2/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp
    end
    # hgGoldGap does not handle dir names over 2 characters:
    mv sangerFa/III sangerFa/3

# CREATING DATABASE (DONE, 2004-03-31 - hartera)

    # Create the database.
    # next machine

    ssh hgwdev
    echo 'create database ce2' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
    echo 'drop database ce2' | hgsql ce2
    # Use df to ake sure there is at least 5 gig free on
    df -h /var/lib/mysql
# Before loading data:
# Filesystem            Size  Used Avail Use% Mounted on
# /dev/sdc1             1.8T  280G  1.4T  17% /var/lib/mysql    

# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-03-31 - hartera)
    # next machine
    ssh hgwdev
    #  the following command copies all the data from the table
    #  grp in the database galGal2 to our new database ce2
    echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
      | hgsql ce2
    # if you need to delete that table:   !!! WILL DELETE ALL grp data !!!
    echo 'drop table grp;' | hgsql ce2
    
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-04-01, hartera)
    # This used to hold scripts -- better to keep them inline here so
    # they're in CVS.  Now it should just hold lift file(s) and
    # temporary scripts made by copy-paste from this file.
    mkdir /cluster/data/ce2/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/ce2/bed

# PREPARE Split contigs into 100,000 bp chunks for cluster runs
# (DONE, 2004-04-01, hartera)
    # next machine
    ssh eieio
    cd /cluster/data/ce2
    rm -fr ./split
    mkdir split
    foreach f (I II III IV V X M)
        mkdir split/$f
        faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft
    end
                                                                          
151 pieces of 151 written
153 pieces of 153 written
138 pieces of 138 written
175 pieces of 175 written
210 pieces of 210 written
178 pieces of 178 written
1 pieces of 1 written

   cat split/*.lft > liftAll.lft
   # copy them to /iscratch/i for cluster rsync
   # next machine
   ssh kkr1u00
   cd /cluster/data/ce2/split
   foreach c (I II III IV V X M)
       echo -n "${c} "
       mkdir -p /iscratch/i/worms/Celegans2/unmaskedSplit/${c}
       cp -p ${c}/c*.fa /iscratch/i/worms/Celegans2/unmaskedSplit/${c}
   end
   iSync

# Run RepeatMasker on the chromosomes (DONE, 2004-04-02 - hartera)
    # next machine
    ssh kk
    cd /cluster/data/ce2
    # make run directory and job list, create the script to use 
    # for the RepeatMasker run
    cat << '_EOF_' > jkStuff/RMWorm
#!/bin/csh -fe
#
#       This is a slight rearrangement of the
#       RMChicken script used in makeGalGal2.doc
#       The results here need to go to a different location
#       $1 == chrom name: I II III IV V X M
#       $2 == directory where split contig .fa is found
#       $3 == name of contig .fa file
cd $1
pushd .
cd $2
/bin/mkdir -p /tmp/ce2/$3/$1
/bin/cp $3 /tmp/ce2/$3/$1
cd /tmp/ce2/$3/$1
/cluster/bluearc/RepeatMasker/RepeatMasker -alignments -s -species elegans $3
popd
/bin/cp /tmp/ce2/$3/$1/$3.out ./
if( -e /tmp/ce2/$3/$1/$3.align ) /bin/cp /tmp/ce2/$3/$1/$3.align ./
if (-e /tmp/ce2/$3/$1/$3.tbl) /bin/cp /tmp/ce2/$3/$1/$3.tbl ./
if (-e /tmp/ce2/$3/$1/$3.cat) /bin/cp /tmp/ce2/$3/$1/$3.cat ./
/bin/rm -r /tmp/ce2/$3/$1
/bin/rmdir --ignore-fail-on-non-empty /tmp/ce2/$3
/bin/rmdir --ignore-fail-on-non-empty /tmp/ce2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMWorm
    # create job list
    mkdir RMRun
    rm -f RMRun/RMJobs
    foreach c (I II III IV V X M)
        mkdir /cluster/data/ce2/RMRun/${c}
        foreach t ( /iscratch/i/worms/Celegans2/unmaskedSplit/$c/c*.fa )
            set d = $t:h
            set f = $t:t
            echo /cluster/data/ce2/jkStuff/RMWorm ${c} ${d} ${f} \
            '{'check out line+ /cluster/data/ce2/RMRun/$c/${f}.out'}' \
            >> RMRun/RMJobs
        end
    end
    # Do the run
    cd /cluster/data/ce2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check, ...
para try:
# para time
# Checking finished jobs
# Completed: 1006 of 1006 jobs
# CPU time in finished jobs:     821747s   13695.78m   228.26h    9.51d  0.026 y
# IO & Wait Time:                 13643s     227.39m     3.79h    0.16d  0.000 y
# Average job time:                 830s      13.84m     0.23h    0.01d
# Longest job:                      935s      15.58m     0.26h    0.01d
# Submission to last job:          5837s      97.28m     1.62h    0.07d
  
    # when they are finished, liftUp and load the .out files into the database:
    # next machine
    ssh eieio
    cd /cluster/data/ce2/RMRun
    foreach c (I II III IV V X M)
        liftUp chr${c}.fa.out /cluster/data/ce2/split/chr${c}.lft warn ${c}/*.fa.out
    end

    # next machine
    ssh hgwdev
    cd /cluster/data/ce2/RMRun
    hgLoadOut ce2 chr*.fa.out

    # Noticed one error in this load (reported to Robert Hubley):
    # Processing chrV.fa.out
    # Strange perc. field -0.1 line 2196 of chrV.fa.out

# SIMPLE REPEAT [TRF] TRACK  (DONE, hartera 2004-04-01)
    # ensure chr*.fa files exist on /iscratch/i
    # next machine
    ssh kkr1u00
    mkdir -p /iscratch/i/worms/Celegans2/unmaskedFa
    cp -p /cluster/data/ce2/sangerFa/*.fa \
        /iscratch/i/worms/Celegans2/unmaskedFa
    iSync
# done iSync, 
    # Create cluster parasol job:
    # next machine
    ssh kk
    mkdir -p /cluster/data/ce2/bed/simpleRepeat
    cd /cluster/data/ce2/bed/simpleRepeat
    mkdir trf
    ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > genome.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'
                                                                                
    echo "" > dummy.lst
    gensub2 genome.lst dummy.lst gsub spec
    para create spec
    para try    # there are only 7, so this runs them all
    para check

# para time
# Checking finished jobs
# Completed: 7 of 7 jobs
# CPU time in finished jobs:       3301s      55.01m     0.92h    0.04d  0.000 y
# IO & Wait Time:                    38s       0.64m     0.01h    0.00d  0.000 y
# Average job time:                 477s       7.95m     0.13h    0.01d
# Longest job:                      975s      16.25m     0.27h    0.01d
# Submission to last job:           975s      16.25m     0.27h    0.01d

    #  When cluster run is done, combine into one:
    cat trf/*.bed > simpleRepeat.bed

    # Load into the database:
    # next machine
    ssh hgwdev
    cd /cluster/data/ce2/bed/simpleRepeat
    hgLoadBed ce2 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    # Loaded 28598 elements of size 16

# PROCESS SIMPLE REPEATS INTO MASK (DONE,  2004-04-02 - hartera)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:

    # next machine
    ssh eieio
    cd /cluster/data/ce2/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end

#  create Soft and Hard masks from RepeatMaster and TRF outputs:
#  and rebuild the nib files using the soft masking in the fa:
    # next machine
    ssh eieio
    cd /cluster/data/ce2
    mkdir softMask
    mkdir nib
    cd /cluster/data/ce2
    foreach c (I II III IV V X M)
        echo -n "masking chr${c} "
        maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \
                softMask/chr${c}.fa -soft
        maskOutFa softMask/chr${c}.fa \
                bed/simpleRepeat/trfMask/chr${c}.bed \
                softMask/chr${c}.fa -softAdd
                faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib
    end

# output:
# masking chrI Writing 15080483 bases in 7540250 bytes
# masking chrII Writing 15279308 bases in 7639662 bytes
# masking chrIII Writing 13783313 bases in 6891665 bytes
# masking chrIV Writing 17493791 bases in 8746904 bytes
# masking chrV Writing 20922231 bases in 10461124 bytes
# masking chrX Writing 17718849 bases in 8859433 bytes
# masking chrM Writing 13794 bases in 6905 bytes

# create hard masks 
mkdir hardMask
    foreach c (I II III IV V X M)
        echo "masking chr${c}"
        /cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \
                hardMask/chr${c}.fa
    end

    ssh kkr1u00
    cd /cluster/data/ce2/softMask
    mkdir -p /iscratch/i/worms/Celegans2/bothMasksFa
    mkdir -p /iscratch/i/worms/Celegans2/nib
    cp -p *.fa /iscratch/i/worms/Celegans2/bothMasksFa
    cd /cluster/data/ce2/nib
    cp -p c*.nib /iscratch/i/worms/Celegans2/nib
    iSync

STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE, 2004-04-02 - hartera)

    # Make symbolic links from /gbdb/ce1/nib to the real nibs.
    # next machine
    ssh hgwdev
    mkdir -p /gbdb/ce2/nib

    foreach f (/cluster/data/ce2/nib/*.nib) 
      ln -s $f /gbdb/ce2/nib
    end
    cd /cluster/data/ce2/tmp
    # Load /gbdb/ce2/nib paths into database and save size info
    # hgNibSeq creates chromInfo table
    hgNibSeq -preMadeNib ce2 /gbdb/ce2/nib /cluster/data/ce2/sangerFa/chr*.fa

# Typical output:
# Processing /cluster/data/ce2/sangerFa/chrI.fa to /gbdb/ce2/nib/chrI.nib
# Processing /cluster/data/ce2/sangerFa/chrII.fa to /gbdb/ce2/nib/chrII.nib
# Processing /cluster/data/ce2/sangerFa/chrIII.fa to /gbdb/ce2/nib/chrIII.nib
# Processing /cluster/data/ce2/sangerFa/chrIV.fa to /gbdb/ce2/nib/chrIV.nib
# Processing /cluster/data/ce2/sangerFa/chrM.fa to /gbdb/ce2/nib/chrM.nib
# Processing /cluster/data/ce2/sangerFa/chrV.fa to /gbdb/ce2/nib/chrV.nib
# Processing /cluster/data/ce2/sangerFa/chrX.fa to /gbdb/ce2/nib/chrX.nib
# 100291769 total bases

    #   Verify the hgNibSeq load functioned OK:
    hgsql -e "select chrom, size from chromInfo" ce2 > chrom.sizes
    cat chrom.sizes
# chrom.sizes:
# chrom   size
# chrI    15080483
# chrII   15279308
# chrIII  13783313
# chrIV   17493791
# chrM    13794
# chrV    20922231
# chrX    17718849

# MAKE GAP tracks AND LOAD ASSEMBLY FRAGMENTS INTO DATABASE (DONE, 2004-04-05, hartera)
    # next machine
     ssh hgwdev
     mkdir -p /cluster/data/ce2/bed/gap
     cd /cluster/data/ce2/bed/gap
     # finds motifs and finds location of gaps as part of output
     foreach c (I II III IV V X M)
        findMotif -chr=chr${c} -verbose=4 -motif=gcatg /gbdb/ce2/nib >& chr${c}Bed.stderr 
     end
     grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed

    # hgGoldGap does not handle dir names over 2 characters
    #   directory III has been moved to 3 when all this was created above
    hgGoldGapGl -noGl ce2 /cluster/data/ce2 sangerFa

    # All the gap tables are empty
    # Load in gap.bed
    # Need to add extra fields to gap.bed file

    cat << '_EOF_' > /cluster/data/ce2/jkStuff/createGapFile.pl
#!/usr/bin/perl -w
use strict;

my $oldchr = "";
while (<STDIN>) {
  my @fields = split(/\t/);
  my $chr = $fields[0];
  if ($chr ne $oldchr) {
     open(OUT, ">$chr"."_gap.bed");
     $oldchr = $chr;
  }
  print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n";
}
'_EOF_'
 
    perl /cluster/data/ce2/jkStuff/createGapFile.pl < gap.bed
    # load into relevant tables
    foreach c (I II III IV V X M) 
        hgLoadBed -tab -oldTable ce2 chr${c}_gap chr${c}_gap.bed
    end 

# CREATE gc5Base wiggle TRACK (DONE, 2004-04-05, hartera)

    # Perform a gc count with a 5 base window. 
    # Also compute a "zoomed" view for display efficiency.

    mkdir /cluster/data/ce2/bed/gc5Base
    cd /cluster/data/ce2/bed/gc5Base

    #   in the script below, the 'grep -w GC' selects the lines of
    #   output from hgGcPercent that are real data and not just some
    #   information from hgGcPercent.  The awk computes the number
    #   of bases that hgGcPercent claimed it measured, which is not
    #   necessarily always 5 if it ran into gaps, and then the division
    #   by 10.0 scales down the numbers from hgGcPercent to the range
    #   [0-100].  Two columns come out of the awk print statement:
    #   <position> and <value> which are fed into wigAsciiToBinary through
    #   the pipe.  It is set at a dataSpan of 5 because each value
    #   represents the measurement over five bases beginning with
    #   <position>.  The result files end up in ./wigData5.
    cat << '_EOF_' > /cluster/data/ce2/jkStuff/runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in /cluster/data/ce2/nib/*.nib
do
        c=`basename ${n} | sed -e "s/.nib//"`
        C=`echo $c | sed -e "s/chr//"`
        echo -n "working on ${c} - ${C} ... "
        hgGcPercent -chr=${c} -doGaps \
                -file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \
                awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
        wigAsciiToBinary \
        -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
        -name=${C} stdin 2> dataLimits5/${c}
echo "done"
done
'_EOF_'

    chmod +x /cluster/data/ce2/jkStuff/runGcPercent.sh

    #   This is going to take perhaps two hours to run.  It is a lot of
    #   data.  make sure you do it on the fileserver:
    ssh eieio
    cd /cluster/data/ce2/bed/gc5Base
    /cluster/data/ce2/jkStuff/runGcPercent.sh
    # load the .wig files back on hgwdev:
    ssh hgwdev
    cd /cluster/data/ce2/bed/gc5Base
    hgLoadWiggle ce2 gc5Base wigData5/*.wig
    # and symlink the .wib files into /gbdb
    mkdir /gbdb/ce2/wib
    ln -s `pwd`/wigData5/*.wib /gbdb/ce2/wib

    # to speed up display for whole chromosome views, compute a "zoomed"
    # view and load that on top of the existing table.  The savings
    # comes from the number of data table rows the browser needs to load
    # for a full chromosome view.  Without the zoomed view there are
    # over 43,000 data rows for chrom 1.  With the zoomed view there are
    # only 222 rows needed for the display.  If your original data was
    # at 1 value per base the savings would be even greater.
    #   Pretty much the same data calculation
    # situation as above, although this time note the use of the
    # 'wigZoom -dataSpan=1000 stdin' in the pipeline.  This will average
    # together the data points coming out of the awk print statment over
    # a span of 1000 bases.  Thus each <position> coming out of wigZoom
    # will represent the measurement of GC in the next 1000 bases.  Note
    # the use of -dataSpan=1000 on the wigAsciiToBinary to account for
    # this type of data.  You want your dataSpan here to be an exact
    # multiple of your original dataSpan (5*200=1000) and on the order
    # of at least 1000, doesn't need to go too high.  For data that is
    # originally at 1 base per value, a convenient span is: -dataSpan=1024
    # A new set of result files ends up in ./wigData5_1K/*.wi[gb]

    cat << '_EOF_' > /cluster/data/ce2/jkStuff/runZoom.sh
#!/bin/sh
                                                                                
mkdir -p wigData5_1K
mkdir -p dataLimits5_1K
                                                                                
for n in /cluster/data/ce2/nib/*.nib
do
        c=`basename ${n} | sed -e "s/.nib//"`
        C=`echo $c | sed -e "s/chr//"`
        echo -n "working on ${c} - ${C} ... "
        hgGcPercent -chr=${c} -doGaps \
                -file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \
                awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
        wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
        -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
        -name=${C} stdin 2> dataLimits5_1K/${c}
echo "done"
done
'_EOF_'

    chmod +x /cluster/data/ce2/jkStuff/runZoom.sh
    #   This is going to take even longer than above, certainly do this
    #   on the fileserver
    ssh eieio
    cd /cluster/data/ce2/bed/gc5Base
    time /cluster/data/ce2/jkStuff/runZoom.sh
    # 520.000u 19.310s 6:15.09 143.7% 0+0k 0+0io 7875pf+0w

    #   Then load these .wig files into the same database as above
    ssh hgwdev
    hgLoadWiggle -oldTable ce2 gc5Base wigData5_1K/*.wig
    # and symlink these .wib files into /gbdb
    mkdir /gbdb/ce2/wib
    ln -s `pwd`/wigData5_1K/*.wib /gbdb/ce2/wib

    #   The browser needs to be fixed so it can display the assembly track 
    #   without the need for the gap tables to exist.

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE2 (DONE, 2002-04-05, hartera)
    # next machine
    ssh hgwdev
    # Make trackDb table so browser knows what to expect:
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add ce2 in all the right places and do
    make update
    make alpha
    cvs commit -m "Added ce2" makefile

    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism,  \
          defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk)  \
          values("ce2", "March 2004", \
          "/gbdb/ce2/nib", "Worm", "chrII:14642289-14671631", 1, \
          60, "C. elegans", "Caenorhabditis elegans", \
          "/gbdb/ce2/html/description.html", 0);' \
    | hgsql -h genome-testdb hgcentraltest
    echo 'update defaultDb set name = "ce2" where genome = "C. elegans"' \
        | hgsql -h genome-testdb hgcentraltest
                                                                               
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-04-05, hartera)
    ssh hgwdev
    mkdir /cluster/data/ce2/html
    cd /cluster/data/ce2/html
    # make a symbolic link from /gbdb/ce2/html to /cluster/data/ce2/html
    ln -s /cluster/data/ce2/html /gbdb/ce2/html
    # Write a description.hmtl - copy from /cluster/data/ce1/html/
    # with a description of the assembly and some sample position queries.
    # create ce2 dir in /trackDb/worm and commit to CVS
    mkdir ~/kent/src/hg/makeDb/trackDb/worm/ce2
    cvs add ce2
    cvs commit ce2
    # Add this also to ~/kent/src/hg/makeDb/trackDb/worm/ce2/description.html
    chmod a+r $HOME/kent/src/hg/makeDb/trackDb/worm/ce2/description.html    
    # Check it in and copy (ideally using "make alpha" in trackDb) to
    # /gbdb/ce2/html
    cvs commit description.html
  
# MAKE SANGER GENE (WORM BASE GENES) TRACK (DONE, 2004-04-09, hartera)
# ADD MISSING ORFS AND PROTEIN NAMES TO sangerLinks TABLE AND 
# ADD proteinID COLUMN TO SANGERGENE TABLE FOR USE BY THE 
# FAMILY BROWSER (DONE, 2004-05-20, hartera)
                                                                                
   # next machine
   ssh eieio
   mkdir -p /cluster/data/ce2/bed/sangerGene
   cd /cluster/data/ce2/bed/sangerGene
    # the perl trims these files down to a reasonable size.  As they are
    #   they cause ldHgGene to hangup due to memory size.
    foreach f (I II III IV V X M)
        echo -n "${f} "
        zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
        sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
        -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
        -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
        -e "s/CHROMOSOME_M/chrM/g" \
        -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
        perl -ne '@a = split; \
        print if($a[1] =~ /curated|DNA|RNA/i && \
        $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
    end
    echo
    # remove CDS before id e.g. from CDS "Y74C9A.2" and remove "
    perl -pi.bak -e 's/CDS "//' chr*.gff
    perl -pi.bak -e 's/"//' chr*.gff
    # check this worked ok then cleanup
    rm *.bak
    #  check file sizes, should be reasonable
    ls -ogrt
# total 23352
# -rw-rw-r--    1  3430979 Apr  9 16:09 chrIII.gff
# -rw-rw-r--    1  3885105 Apr  9 16:09 chrII.gff
# -rw-rw-r--    1  3680753 Apr  9 16:09 chrI.gff
# -rw-rw-r--    1  4926390 Apr  9 16:09 chrV.gff
# -rw-rw-r--    1     3635 Apr  9 16:09 chrM.gff
# -rw-rw-r--    1  3854560 Apr  9 16:09 chrIV.gff
# -rw-rw-r--    1  4085489 Apr  9 16:09 chrX.gff                                                                                
    # now load database with those transformed gff files
    # next machine
    ssh hgwdev
    cd /cluster/data/ce2/bed/sangerGene
    # 2004-05-10, hartera, Reload sangerGene table using extended GenePred
    # format. 2004-05-11, hartera. Extended format frame information does not
    # look correct. Reload without the extended fields.
    ldHgGene ce2 sangerGene *.gff
# Read 41259 transcripts in 423845 lines in 7 files
#  41259 groups 7 seqs 11 sources 6 feature types
# 23076 gene predictions

    # if you need to delete that table:
    echo 'drop table sangerGene' | hgsql ce2

#    worm/ce2 trackDb.ra entry
# track sangerGene
# shortLabel WormBase Genes
# longLabel WormBase Gene Annotations
# group genes
# priority 48
# visibility pack
# color 0,0,200
# chromosomes chrI,chrII,chrIII,chrIV,chrV,chrX,chrM
# type genePred sangerPep
# url http://www.wormbase.org/db/gene/gene?name=$$
# hgGene on
    
    # Add proteinID field to sangerGene table, used by Gene Sorter
    ssh hgwdev
    cd /cluster/data/ce2/bed/sangerGene
    hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce2
    # To add index on this column
    hgsql -e 'alter table sangerGene add index(proteinID);' ce2
    # Add Swiss-Prot protein IDs to this column
    # There are 23076 entries in sangerGene and 21780 of the names 
    # are in sangerLinks as orfName
    hgsql -N -e 'select count(*) from sangerGene as g,sangerLinks as l \
           where g.name = l.orfName;' ce2
    # 21780
    # get names from sangerGene and sangerLinks tables
    hgsql -N -e "select name from sangerGene;" ce2 | sort > name.sangerGene
    hgsql -N -e "select orfName from sangerLinks;" ce2 | sort > orfName.sangerLinks
    # get list of names in sangerGene not in sangerLinks
    comm -23 name.sangerGene orfName.sangerLinks > geneNames.notin.sangerLinks
    # Go to the WS120 WormBase mirror site and check SwissProt IDs
    # http://http://ws120.wormbase.org/db/searches/info_dump
    # Some of these genes are non coding RNAs or have no SwissProt ID BUT
    # others in the geneNames.notin.sangerLinks list do have a Swiss-Prot ID 
    # Download IDs in using batches of about 400 for gene names from 
    # geneNames.notin.sangerLinks and add to file, SPIds.wormPep.WS120
    # Create a perl script to parse out Swiss-Prot IDs for those genes that 
    # have them and create the sql statements to insert them into sangerLinks

cat << '_EOF_' > getSPIDsandLoad.pl
#!/usr/bin/perl -w
use strict;
my %SPhash;
my $found = "false";
my $gene = "";
my $SP = "";

open(SP, ">addSPIds.sql") || die "Can not create addSPIds.sql: $!";
while(<STDIN>) {
    chomp;
                                                                                    
    my @fields = split(/\t/);
    if ($fields[0] =~ /^Gene:/) {
       $gene = $fields[1];
    }
    elsif (($fields[0] =~ /^GenPep:/) && ($gene ne "n/a") ){
       # if there are 2 fields then store second as Swiss-Prot ID
       if ($#fields == 1) {
          $SP = $fields[1];
       }
                                                                                    
       # add gene and Swiss-Prot ID to hash
       $SPhash{$gene} = $SP;
       $gene = "";
       $SP = "";
    }
}
foreach my $g (keys(%SPhash) ) {
   my $s = $SPhash{$g};
   # print if there is a Swiss-Prot ID
   if ($s =~ /^[A-Z]{1}[0-9]+/) {
       print "Gene: $g\tSP ID: $s\n";
       # create mySQL to add this to the sangerLinks table;
       print SP "INSERT into sangerLinks (orfName, protName, description) VALUES(\"$g\",\"$s\",\"\");\n"
   }
}
close SP;
'_EOF_'

      # run perl script
      perl getSPIDsandLoad.pl < SPIds.wormPep.WS120 > orfsandSPIds.out
      # There are 111 of these genes with SwissProt IDs
      # To insert new orfNames and SwissProt IDs into table:
      hgsql ce2 < addSPIds.sql 
      # Now 21891 names in sangerGene that match orfNames in sangerLinks

      # Use SwissProt IDs in sangerLinks table to fill proteinID 
      # column in the sangerGene table
      hgsql -e 'update sangerGene as g, sangerLinks as l \
          set g.proteinID = l.protName where g.name = l.orfName;' ce2
      # check there are 21891 rows with the protName field filled

# MYTOUCH FIX - Jen - 2006-01-24
  sudo mytouch ce2 orfToGene 0505031600.00
  sudo mytouch ce2 sangerBlastTab 0505031600.00
  sudo mytouch ce2 sangerBlastTab 0505031600.00
  sudo mytouch ce2 sangerCanonical 0505031600.00
  sudo mytouch ce2 sangerIsoforms 0505031600.00
  sudo mytouch ce2 sangerLinks 0505031600.00
  sudo mytouch ce2 sangerPep 0505031600.00
  sudo mytouch ce2 sangerToKim 0505031600.00
  sudo mytouch ce2 sangerToPfam 0505031600.00
  sudo mytouch ce2 sangerToRefSeq 0505031600.00


# MAKE WORMBASE GENEFINDER TRACKS (DONE, 2004-04-12, hartera)
    # next machine
    ssh eieio
    mkdir -p /cluster/data/ce2/bed/sangerGenefinder
    cd /cluster/data/ce2/bed/sangerGenefinder
    # the perl trims these files down to a reasonable size.  As they are
    #   they cause ldHgGene to hangup due to memory size.
    foreach f (I II III IV V X M)
        echo -n "${f} "
        zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
        sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
        -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
        -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
        -e "s/CHROMOSOME_M/chrM/g" \
        -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
        perl -ne '@a = split; \
        print if($a[1] =~ /Genefinder/i && \
        $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
    end
    echo
    # remove CDS before id e.g. from CDS "Y74C9A.2" and remove "
    perl -pi.bak -e 's/CDS "//' chr*.gff
    perl -pi.bak -e 's/"//' chr*.gff
    # check this worked ok then cleanup
    rm *.bak
    #  check file sizes, should be reasonable
    ls -ogrt
# total 23712
# -rw-rw-r--    1  3984558 Apr 12 09:14 chrII.gff
# -rw-rw-r--    1  3702836 Apr 12 09:14 chrI.gff
# -rw-rw-r--    1        0 Apr 12 09:14 chrM.gff
# -rw-rw-r--    1  3948579 Apr 12 09:14 chrIV.gff
# -rw-rw-r--    1  3364561 Apr 12 09:14 chrIII.gff
# -rw-rw-r--    1  5303266 Apr 12 09:14 chrV.gff
# -rw-rw-r--    1  3929597 Apr 12 09:14 chrX.gff

    # now load database with those transformed gff files
    # next machine
    ssh hgwdev
    cd /cluster/data/ce2/bed/sangerGenefinder
    # 2004-05-10, hartera, Reload sangerGene table using extended GenePred 
    # format. 2004-05-11, hartera. Extended format frame information does not
    # look correct. Reload without the extended fields.
    ldHgGene ce2 sangerGenefinder *.gff
# Read 36976 transcripts in 401688 lines in 7 files
#   36976 groups 6 seqs 1 sources 4 feature types
# 21180 gene predictions

    # if you need to delete that table:
    echo 'drop table sangerGenefinder' | hgsql ce2

# RUN Waba alignment with briggsae  (WORKING - 2004-04-06 - Hiram)
    # prepare contigs from C. briggsae
    #   Assumes C. briggsae data has been downloaded according to
    #   makeCb1.doc
    # using briggsae contigs from previous work:
	/iscratch/i/worms/Cbriggsae/contigs
    
    # next machine
    ssh kk
    mkdir -p /cluster/data/ce2/bed/waba/out
    cd /cluster/data/ce2/bed/waba
    ls -1S /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst
    ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > elegans.lst
    cat elegans.lst | while read FN
    do
	b=`basename ${FN}`
	mkdir out/${b%%.fa}
    done
    # create scripts to be used here
    cat << '_EOF_' > wabaRun
#!/bin/csh -fe
#
#	$1 - full pathname to a briggsae contig
#	$2 - file path to an elegans chrom.fa
#	$3 - result file full pathname
#
set f = $1:t
set chr = $2:t
set d = $chr:r
mkdir -p /tmp/$d/$f
cp $1 /tmp/$d/$f
pushd .
cd /tmp/$d/$f
set t = $f:r
/cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1
/cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2
/cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab
cp $t.wab $3
popd
rm -f /tmp/$d/$f/$t.*
rmdir --ignore-fail-on-non-empty /tmp/$d/$f
rmdir --ignore-fail-on-non-empty /tmp/$d
'_EOF_'
    chmod +x wabaRun

    cat << '_EOF_' > jobTemplate
#LOOP
/cluster/store5/worm/ce2/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce2/bed/waba/out/$(root2)/$(root1).wab}
#ENDLOOP
'_EOF_'

    gensub2 briggsae.lst elegans.lst jobTemplate jobList
    para create jobList
    para try
    para check
    para push
    # one of the jobs takes quite a while.  Most of the others are OK:
Completed: 6950 of 6951 jobs
Crashed: 1 jobs
CPU time in finished jobs:   19284472s  321407.87m  5356.80h  223.20d  0.612 y
IO & Wait Time:                 63556s    1059.26m    17.65h    0.74d  0.002 y
Average job time:                2784s      46.40m     0.77h    0.03d
Longest job:                    47017s     783.62m    13.06h    0.54d
Submission to last job:         58555s     975.92m    16.27h    0.68d

    #	The failed job is:
    # /cluster/store5/worm/ce2/bed/waba/wabaRun \
    #	/iscratch/i/worms/Cbriggsae/contigs/c0907.fa \
    #	/iscratch/i/worms/Celegans2/unmaskedFa/chrI.fa \
    #	/cluster/store5/worm/ce2/bed/waba/out/chrI/c0907.wab
XXXX - running 2004-04-07 - retrying this stand-along on kkr1u00
    # after about 1.5 hours fails with the error:
    # waba: xenbig.c:550: mergeTwo: Assertion `c->qStart == qStart &&
    #	c->qEnd == qEnd' failed.
    # Abort

    # Ignoring that error, and moving on

    # next machine
    ssh hgwdev

    cd /cluster/data/ce2/bed/waba
    mkdir Load
    cd Load
    #  The cat through the pipe to hgWaba will avoid making large files
    # 	that are not needed.
    cat << '_EOF_' > loadEm.sh
#!/bin/sh
#
for c in I II III IV V X M
do
	echo -n "${c} "
        cat /cluster/store5/worm/ce2/bed/waba/out/chr${c}/c*.wab |
        /cluster/bin/i386/hgWaba ce2 Cbr chr${c} 0 stdin > proc${c}.out 2>&1
done

exit 0
'_EOF_'
    chmod +x loadEm.sh

    # run it to load the waba track:
    ./loadEm.sh
    # remove garbage temp file:
    rm full_waba.tab chrom_waba.tab

    # worm/ce2/trackDb.ra entry:
#   track cbrWaba
#   shortLabel Briggsae Waba
#   longLabel C. briggsae Waba Alignments
#   group compGeno
#   priority 20
#   visibility dense
#   color 140,0,200
#   altColor 210,140,250

# MAKE 2BIT FILE FOR BLAT (DONE, 2004-04-07, hartera)  
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/ce2/nib because hgBlat looks there:
    ssh eieio
    cd /cluster/data/ce2
    faToTwoBit softMask/chr*.fa ce2.2bit
    ssh hgwdev
    ln -s /cluster/data/ce2/ce2.2bit /gbdb/ce2/nib/

# MAKE 11.OOC FILE FOR BLAT (DONE, 2004-04-07, hartera)
# Use -repMatch=40 (based on size, for human use 1024)
    ssh kkr1u00
    mkdir /cluster/bluearc/ce2
    mkdir /cluster/data/ce2/bed/ooc
    cd /cluster/data/ce2/bed/ooc
    ls -1 /cluster/data/ce2/nib/chr*.nib > nib.lst
    /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/ce2/11.ooc -repMatch=40
# Writing /cluster/bluearc/ce2/11.ooc
# Wrote 43676 overused 11-mers to /cluster/bluearc/ce2/11.ooc
   cp -p /cluster/bluearc/ce2/11.ooc /iscratch/i/worms/Celegans2/
   iSync

# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-04-07, hartera)
    # next machine
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf: default includes native genbank mRNAs and ESTs, 
    # genbank xeno mRNAs but no ESTs, native RefSeq mRNAs but not xeno RefSeq
    # Add these lines:
# ce2 (C. elegans)
ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib
ce2.lift = no
ce2.downloadDir = ce2

    cvs commit -m "Added ce2" etc/genbank.conf
    make

    # markd added -maxIntron=100000 for ce to genbank/src/align/gbBlat
    # Edit src/align/gbBlat to add /iscratch/i/worms/Celegans2/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs commit -m "Added 11.ooc for ce2" src/align/gbBlat
    # Install to /cluster/data/genbank:
    make install-server
    
    # next machine
    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, RefSeq mRNA only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial ce2

    # Load results for RefSeq mRNAs:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad ce2
    # To start next run, results need to be moved out the way
    cd /cluster/data/genbank/work
    mv initial.ce2 initial.ce2.refseq.mrna

    # -initial for GenBank mRNAs
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial ce2

    # Load results for GenBank mRNAs
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 ce2 
    cd /cluster/data/genbank/work
    mv initial.ce2 initial.ce2.genbank.mrna

    # -initial for ESTs
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial ce2

    # Load results for GenBank ESTs
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 ce2 
    cd /cluster/data/genbank/work
    mv initial.ce2 initial.ce2.genbank.est
   
    # Everything loaded ok, so clean up
    rm -r /cluster/data/genbank/work/initial.ce2*
    
# REMOVE XENO mRNAs (DONE, 2004-04-09, hartera)
    # Remove xeno mRNAs, too many are distantly related and do 
    # not have meaningful alignments using Blat
    ssh hgwdev
    # Remove all mrna.xeno.* files from
    # /cluster/data/genbank/data/aligned/genbank.140.0/ce2
    # edit /cluster/data/genbank/etc/genbank.conf
    cd ~/kent/src/hg/makeDb/genbank
    # add line to ce2:
ce2.genbank.mrna.xeno.load = no
# so now it is:
# ce2 (C. elegans)
ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib
ce2.lift = no
ce2.genbank.mrna.xeno.load = no
ce2.downloadDir = ce2
    make
    cvs update etc/genbank.conf
    cvs commit etc/genbank.conf
    make install-server

    cd /cluster/data/genbank/
    # reload only the native mRNAs to the database
    nice bin/gbDbLoadStep -reload -type=mrna -srcDb=genbank -verbose=1 ce2
    # drop xenoMrna table
    echo 'drop table xenoMrna' | hgsql ce2
    
# PRODUCING FUGU BLAT ALIGNMENTS (DONE, 2004-04-07, hartera)
    # Assumes masked NIBs have been prepared as above
    # and Fugu pieces are already on kluster /iscratch/i.
    # next machine
    ssh kk
    mkdir /cluster/data/ce2/bed/blatFr1
    cd /cluster/data/ce2/bed/blatFr1
    ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
    ls -1S /iscratch/i/worms/Celegans2/nib/*.nib > worm.lst
    cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    mkdir psl
    gensub2 worm.lst fugu.lst gsub spec
    para create spec
    para try, check, push, check, ...
# para time
# Completed: 4046 of 4046 jobs
# CPU time in finished jobs:     263359s    4389.31m    73.16h    3.05d  0.008 y
# IO & Wait Time:                 13317s     221.95m     3.70h    0.15d  0.000 y
# Average job time:                  68s       1.14m     0.02h    0.00d
# Longest job:                     1103s      18.38m     0.31h    0.01d
# Submission to last job:          2170s      36.17m     0.60h    0.03d

    # When cluster run is done, sort alignments
    # next machine
    ssh eieio
    cd /cluster/data/ce2/bed/blatFr1
    foreach d (I II III IV V X M)
        echo -n "${d} "
        pslCat psl/chr${d}_*.psl | \
        pslSortAcc nohead chrom temp stdin
        rm -f chrom/chr${d}_blatFr1.psl
        mv chrom/chr${d}.psl chrom/chr${d}_blatFr1.psl
    end

    # next machine
    ssh hgwdev
    # Load Fugu Blat alignments
    cd /cluster/data/ce2/bed/blatFr1/chrom
    cat *.psl | hgLoadPsl -fastLoad -table=blatFr1 ce2 stdin

    # Make fugu /gbdb/ symlink and load Fugu sequence data.
    mkdir /gbdb/ce2/fuguSeq
    ln -s cluster/data/fr1/fugu_v3.masked.fa /gbdb/ce2/fuguSeq
    cd /cluster/data/ce2/bed/blatFr1
    hgLoadSeq ce2 /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
    # Adding /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
    # 20379 sequences

    # worm/ce2 trackDb.ra entry
#   track blatFr1
#   shortLabel Fugu Blat
#   longLabel Takifugu rubripes (fr1) Translated Blat Alignments
#   group compGeno
#   priority 110
#   visibility dense
#   color 0,60,120
#   altColor 200,220,255
#   spectrum on
#   type psl xeno

# BLASTZ C. Briggsae (Cb1) (DONE, 2004-04-13, hartera)
                                                                                
    # next machine
    ssh kkr1u00
    # blastz requires lineage-specific repeats but there are none for the worms
    # so create empty files for each chromsome and iSync
    mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
    cd /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
    # create chrI.out.spec and cp to chrN.out.spec for chrII, chrIII, chrIV, chrV, chrX, chrM
    mkdir -p /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
    # create empty chrUn.out.spec file
    iSync
    
    ssh kk
    mkdir -p /cluster/data/ce2/bed/blastz.cb1.2004-04-12
    cd /cluster/data/ce2/bed
    ln -s  blastz.cb1.2004-04-12 blastz.cb1
    cd blastz.cb1
 
cat << '_EOF_' > DEF
# C. elegans vs. C. briggsae
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1
#  when SMSK=/dev/null
BLASTZ_ABRIDGE_REPEATS=0

# TARGET
SEQ1_DIR=/iscratch/i/worms/Celegans2/nib
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/worms/Celegans2/rmsk
SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
# FLAG not currently used
SEQ1_FLAG=-worm
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
# RMSK not currently used
SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
# FLAG not currently used
SEQ2_SMSK=/iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
SEQ2_FLAG=-worm
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/ce2/bed/blastz.cb1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy  
    
    # Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.ce2-cb1.2004-04-12
    # Need shell scripts from mm4 to do cluster runs
    mv /cluster/data/mm4/jkStuff/BlastZ*.sh /cluster/data/ce2/jkStuff/

    # prepare first cluster run
    ssh kk
    cd /cluster/data/ce2/bed/blastz.cb1
    source DEF
    /cluster/data/ce2/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# para time
# Completed: 56 of 56 jobs
# CPU time in finished jobs:     155723s    2595.39m    43.26h    1.80d  0.005 y
# IO & Wait Time:                  6387s     106.45m     1.77h    0.07d  0.000 y
# Average job time:                2895s      48.25m     0.80h    0.03d
# Longest job:                     6137s     102.28m     1.70h    0.07d
# Submission to last job:         11078s     184.63m     3.08h    0.13d

    #   Second cluster run to convert the .out's to .lav's
    cd /cluster/data/ce2/bed/blastz.cb1
    source DEF
    /cluster/data/ce2/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# para time
# Completed: 14 of 14 jobs
# CPU time in finished jobs:        762s      12.70m     0.21h    0.01d  0.000 y
# IO & Wait Time:                    62s       1.03m     0.02h    0.00d  0.000 y
# Average job time:                  59s       0.98m     0.02h    0.00d
# Longest job:                      115s       1.92m     0.03h    0.00d
# Submission to last job:           293s       4.88m     0.08h    0.00d

    #   Third cluster run to convert lav's to axt's
    cd /cluster/data/ce2/bed/blastz.cb1
    source DEF
    /cluster/data/ce2/jkStuff/BlastZ_run2.sh
    cd run.2
    para try
    para check
    # only 7 jobs so all completed by para try
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs:        187s       3.11m     0.05h    0.00d  0.000 y
# IO & Wait Time:                   169s       2.82m     0.05h    0.00d  0.000 y
# Average job time:                  51s       0.85m     0.01h    0.00d
# Longest job:                       77s       1.28m     0.02h    0.00d
# Submission to last job:            77s       1.28m     0.02h    0.00d

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/ce2/bed/blastz.cb1
    mkdir -p pslChrom
    set tbl = "blastzCb1"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

   # Load database tables
    ssh hgwdev
    cd /cluster/data/ce2/bed/blastz.cb1/pslChrom
    foreach d (I II III IV V X M)
        /cluster/bin/i386/hgLoadPsl -noTNameIx ce2 chr${d}_blastzCb1.psl
    end

# CHAIN Cb1 BLASTZ (DONE, 2004-04-30, hartera)
# CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim
    ssh kk
    mkdir -p /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1
    mkdir out chain
   
    ls -1S /cluster/data/ce2/bed/blastz.cb1/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
    axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \
        /iscratch/i/worms/Celegans2/nib \
        /iscratch/i/worms/Cbriggsae/bothMasksNib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    # only 7 jobs so all done by para try
    para try
    para check
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs:       1265s      21.08m     0.35h    0.01d  0.000 y
# IO & Wait Time:                    69s       1.16m     0.02h    0.00d  0.000 y
# Average job time:                 191s       3.18m     0.05h    0.00d
# Longest job:                      632s      10.53m     0.18h    0.01d
# Submission to last job:           632s      10.53m     0.18h    0.01d

    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
    time chainMergeSort run1/chain/*.chain > all.chain
    # User 37.040u
    # System 3.090s
    # Elapsed Real 0:42.54

    time chainSplit chain all.chain
    # User 37.310u
    # System 2.930s
    # Elapsed Real 0:43.85

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain ce2 ${c}_chainCb1 $i
        echo done $c
    end

# NET Cb1 (DONE, 2004-04-30, hartera)
# REMAKE NET Cb1 WITH NEW CHAINS
   ssh eieio
   cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
   mkdir preNet
   mv /cluster/data/ce2/tmp/chrom.sizes /cluster/data/ce2/
   # remove header line in file so chainPreNet will work
   cd chain
   foreach i (*.chain)
   echo preNetting $i
   /cluster/bin/i386/chainPreNet $i /cluster/data/ce2/chrom.sizes \
                        /cluster/data/cb1/chrom.sizes ../preNet/$i
    end

    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/ce2/chrom.sizes \
                            /cluster/data/cb1/chrom.sizes ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 84246528, utime 710 s/100, stime 70

    ssh hgwdev
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
    # add classification info using db tables 
    # use -noAr option - don't look for ancient repeats
    time netClass -noAr hNoClass.net ce2 cb1 cb1.net
    # User 11.860u
    # System 2.090s 
    # Elapsed Real 0:16.24

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
    netFilter -minGap=10 cb1.net |  hgLoadNet ce2 netCb1 stdin

# MAKE axtNet TRACK (DONE, 2004-04-30, hartera)
# TRACK REMADE WITH NEW CHAINS 
    # Move back to the file server to create axt files corresponding to the net
    ssh kksilo
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
    # try without netFilter as not working well
    # netFilter -minGap=10 cb1.net > cb1Filt.net
    mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29
    netSplit cb1.net cb1NetSplit
    cd cb1NetSplit
    foreach i (*.net)
        set c = $i:r
        netToAxt -maxGap=300 $i \
           /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain/$c.chain \
           /iscratch/i/worms/Celegans2/nib \
           /iscratch/i/worms/Cbriggsae/bothMasksNib \
           /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/$c.axt
        echo done ../axtNet.2004-04-29/$c.axt
    end
    cd ..
    rm -r cb1NetSplit
    # sort axt files based on target position
    mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
    cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29
    
    foreach i (*.axt) 
       set c = $i:r
       axtSort $c.axt ./sortedaxtNet/$c.axt
       echo done sorting $c.axt
    end

    # Load up the axtNet (alignment score wiggle) track:
    ssh hgwdev
    mkdir /gbdb/ce2/axtNetCb1
    foreach f (/cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/chr*.axt)
       ln -s $f /gbdb/ce2/axtNetCb1
    end
    cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
    ${HOME}/bin/i386/hgLoadAxt ce2 axtNetCb1

# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE2 (DONE, 2003-04-13, hartera)
    # next machine
    ssh hgwdev
    # DNA port is "0", trans prot port is "1"
    # 8/26/04: set canPcr=1 for untranslated blat server.
    echo 'insert into blatServers values("ce2", "blat1", 17781, 0, 1); \
          insert into blatServers values("ce2", "blat1", 17780, 1, 0);' \
      | hgsql -h genome-testdb hgcentraltest
        # if you need to delete those entries
        echo 'delete from blatServers where db="ce2";' \
        | hgsql -h genome-testdb hgcentraltest
        # to check the entries:
        echo 'select * from blatServers where db="ce2";' \
        | hgsql -h genome-testdb hgcentraltest

# CREATE ORFTOGENE TABLE (Used by hgGene and hgNear) (DONE, 2004-04-27, hartera)
   mkdir /cluster/data/ce2/bed/orfToGene
   cd !$
   # gene_names.txt for WS120 was created and provided by todd.harris@cshl.edu
   # ORF e.g. Y110A7A.10 gene e.g. aap-1
   awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt
   # reformat this for use in creating Sanger Links with hgWormLinks
   awk 'NF == 2' orfToGene.txt > orfToGene.txt2
   # use perl script to move ORFs with the same gene name onto separate lines
   # and remove lines where there is a gene name with an alternate name
   # in parentheses. this output file is used for Sanger Links
   /cluster/data/ce2/jkStuff/formatorfToGene.pl < orfToGene.txt2 > orfToGene.tab2   
   hgCeOrfToGene ce2 gene_names.txt sangerGene orfToGene
                                                                                
# DOWNLOAD WORMBASE FOR WS120 RELEASE (DONE, 2004-04-23, hartera)
# REMOVED acedb DIRECTORY AS NO LONGER NEEDED (DONE, 2008-08-18, hartera)
    ssh eieio
    mkdir -p /cluster/store7/acedb
    cd /cluster/store7/acedb
    wget -o ce2.fetchws120.log -r -l1 --timestamp --no-directories \
        ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/
    gunzip *.gz
    tar xvf *.tar

    # 2008-08-18, hartera. Remove the acedb directory.
    cd /cluster/store7
    rm -rf acedb

# CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
# (DONE, 2004-05-03, hartera)
   mkdir /cluster/data/ce2/bed/steinHelp
# Mail Lincoln Stein and ask for mappings from wormPep to
# SwissProt and from ORF to wormPep to description.  Place
# these files in the steinHelp directory:
#       swissprot_dump.txt  - contains ORF/WormPep/Description
#       and contains SwissProt Ids and Accessions
    # Jorge downloaded and installed the software for using AceDB 
    # to /cluster/bin/acedb/
    # To use 
    ssh hgwdev
    cd /cluster/store7/acedb
    # can use /cluster/bin/acedb/xace . to get X-Window query interface 
    # To use AceDB Query Language (AQL)
    cd /cluster/store7/acedb
    /cluster/bin/acede/tace .
    # At prompt, can type ? for help, type the following queries
#    wbGeneClass.txt - associates 3 letter first part of worm gene
#             name with a brief description.  
    acedb> AQL -o wbGeneClass.txt select l,l->Description  from l in class Gene_Clas
#    wbConcise.txt - associates worm gene name with a several
#             sentence description. 
    acedb> AQL -o wbConcise.txt select l,g from l in class Locus, g in l->Gene_information[Provisional_description]
    # remove "" in wbGeneClass.txt and wbConcise.txt
    sed -e 's/"//g;' wbGeneClass.txt > wbGeneClass.new
    sed -e 's/"//g;' wbConcise.txt > wbConcise.new
    mv wbGeneClass.new wbGeneClass.txt
    mv wbConcise.new wbConcise.txt
    # first remove header lines
    sed -e 's/Format//' wbConcise.txt | sed -e 's/Locus[:]*//' > tmp
    sed -e 's/Text//' tmp > tmp2
    mv tmp2 wbConcise.txt
    rm tmp
    sed -e 's/Format//' wbGeneClass.txt | sed -e 's/Gene_class[:]*//' > tmp
    sed -e 's/Text//' tmp > tmp2
    mv tmp2 wbGeneClass.txt
    rm tmp
    
    # get SwissProt and TrEMBL Ids and accessions from ftp site
    # TrEMBL Ids missing from swissprot_dump.txt
    # Need these for details pages to work for sangerGene track
    wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \
        ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep.table120
    
    # parse out CDS to Swiss-Prot IDs mappings
    cat << '_EOF_' > /cluster/data/ce2/jkStuff/getCDSandSPId.pl
#!/usr/bin/perl -w
use strict;
                                                                                
my $found = "false";
while (<STDIN>) {
    chomp;
    my @fields = split(/\t/);
    foreach my $f (@fields) {
        # get CDS name
        if ($f =~ /^>([0-9A-Za-z\.]+)/) {
             print "$1\t";
             $found = "false";
        }
        elsif ($f =~ /^(TR:|SP:)([0-9A-Z]{4,})$/) {
             print $2;
             $found = "true";
        }
    }
    if ($found eq "false") {
        print "NULL";
    }
    print "\n";
}
'_EOF_'
    chmod a+x /cluster/data/ce2/jkStuff/getCDSandSPId.pl
    perl /cluster/data/ce2/jkStuff/getCDSandSPId.pl < wormpep.table120 > spCdsToId.txt
    # rewrote hgWormLinks to extract information from these files
    hgWormLinks spCdsToId.txt swissprot_dump.txt \
        wbConcise.txt wbGeneClass.txt \
        /cluster/data/ce2/bed/orfToGene/orfToGene.tab2 sangerLinks.txt
    # create table in ce2 database and load data
    hgsql ce2 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql
    hgsql -e \
        'load data local infile "sangerLinks.txt" into table sangerLinks' \
        ce2

# CREATE SANGERPEP TABLE (DONE, 2004-04-29, hartera)
# RECREATED SANGERPEP TABLE SO AS all.joiner EXPECTS THE CREATE TIME TO
# BE LATER THAN THAT FOR THE SANGERGENE TABLE (DONE, 2004-05-21, hartera)
    mkdir -p /cluster/data/ce2/bed/sangerPep
    cd /cluster/data/ce2/bed/sangerPep
    # Download peptide sequences from the Sanger Centre ftp site:
    wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \
        ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep120
    # Load into database
    # 2004-05-10 and 2004-05-11 hartera dropped sangerPep table and recreated it
    # all.joiner expects sangerGene table to have an earlier
    # load time than sangerPep so recreate sangerPep after sangerGene
    hgPepPred ce2 generic sangerPep wormpep120
    # the sangerPep table is used by the sangerGene track

# TWINSCAN GENE PREDICTIONS (DONE, 2004-04-16, hartera)
    # These are Iscan (new version of Twinscan) gene predictions
    # e-mailed Michael Brent at WUSTL: brent@cse.wustl.edu to obtain data
    ssh hgwdev
    mkdir /cluster/data/ce2/bed/twinscan
    cd /cluster/data/ce2/bed/twinscan
    wget --timestamping -r \
        http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/
    mv ./genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/* \
        /cluster/data/ce2/bed/twinscan/
    rm -r genes.cs.wustl.edu
    rm index* ./chr_gtf/index* ./chr_ptx/index* ./chr_tx/index*

    # Clean up chrom field of GTF files
    foreach c (I II III IV V X)
        set f = chr_gtf/chr_${c}.iscan_pred_gtf
        echo ${f}
        sed -e "s/CHROMOSOME_/chr/g" ${f} | \
        sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \
                chr_gtf/chr${c}-fixed.gtf
    end

    # pare down protein FASTA header to id and add missing .1:
    foreach c (I II III IV V X)
        set f = chr_ptx/CHROMOSOME_${c}.ptx
        echo ${f}
        perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \
                ${f} > chr_ptx/chr${c}-fixed.fa
    end
    # load data into database, need -gtf option as no stop codons in
    # these predictions. also use new -frame -id and -geneName options
    ldHgGene -gtf -frame -id -geneName ce2 twinscan chr_gtf/chr*-fixed.gtf
    
    # Read 20990 transcripts in 147535 lines in 6 files
    # 20990 groups 6 seqs 1 sources 3 feature types
    # 20990 gene predictions
    
    # 2004-05-10, hartera dropped twinscanPep table and re-created it.
    # twinscan table was reloaded using extended options after loading 
    # twinscanPep. all.joiner expects twinscan table to have an earlier
    # load time than twinscanPep
    hgPepPred ce2 generic twinscanPep chr_ptx/chr*-fixed.fa
  
#  MAKING BLASTZ SELF (DONE, 2004-04-20, hartera)
    # The procedure for lineage spec business with self is to simply
    # use the actual repeat masker output for this C. elegans assembly as
    # the lineage specific repeats for itself.  Thus, merely make
    # symlinks to the repeat masker out files and name them as expected
    # for blastz.  In this case they are called notinCelegans but they
    # really mean inCelegans.  Yes, it is confusing, but that's just the
    # nature of the game in this case. 

    # next machine
    ssh kkr1u00
    mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
    cd /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
    foreach f (/iscratch/i/worms/Celegans2/bothMasksFa/*.fa)
        set base = $f:t:r:r
        echo $base.out.spec
        ln -s $f $base.out.spec
    end
    iSync

    # next machine
    ssh hgwdev
    mkdir -p /cluster/data/ce2/bed/blastzSelf
    cd /cluster/data/ce2/bed/blastzSelf

    cat << '_EOF_' > DEF
# C. elegans vs. C. elegans
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
                                                                                
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
                                                                                
# TARGET
# C. elegans
SEQ1_DIR=/iscratch/i/worms/Celegans2/nib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
                                                                                
# QUERY
# C. elegans
SEQ2_DIR=/iscratch/i/worms/Celegans2/nib
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/data/ce2/bed/blastzSelf
                                                                                
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy
                                                                                
# Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.ce2-ce2.2004-04-14
    ssh kk
    cd /cluster/data/ce2/bed/blastzSelf
                                                                                
    # source the DEF file to establish environment for following commands
    source DEF
    /cluster/data/mm4/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# para time
# Completed: 196 of 196 jobs
# CPU time in finished jobs:     418569s    6976.16m   116.27h    4.84d  0.013 y
# IO & Wait Time:                  2416s      40.26m     0.67h    0.03d  0.000 y
# Average job time:                2148s      35.80m     0.60h    0.02d
# Longest job:                    17917s     298.62m     4.98h    0.21d
# Submission to last job:         53181s     886.35m    14.77h    0.62d

    cd /cluster/data/ce2/bed/blastzSelf
    source DEF
    /cluster/data/ce2/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...

# para time
# Completed: 14 of 14 jobs
# CPU time in finished jobs:       3340s      55.67m     0.93h    0.04d  0.000 y
# IO & Wait Time:                    98s       1.63m     0.03h    0.00d  0.000 y
# Average job time:                 246s       4.09m     0.07h    0.00d
# Longest job:                      439s       7.32m     0.12h    0.01d
# Submission to last job:           892s      14.87m     0.25h    0.01d

    # Third cluster run to convert lav's to axt's
    # lavToAxt has a new -dropSelf option that replaces axtDropOverlap
    # to remove the alignment blocks on the diagonal
    # When this was first run, it crashed for chrV with the 
    # error: breakUpIfOnDiagonal: Too many fragmented block lists!
    # This means that when a blockList is taken and broken up everywhere
    # there is a block along the diagonal and added to an array of 
    # fragmented sub-lists then in this chrosomosome the list has exceeded
    # the array size of 256 for an alignment. Change the line:
    # struct block *bArr[256] to struct block *bArr[1024] in the
    # parseIntoAxt function, make and re-run 
  
    cd /cluster/data/ce2/bed/blastzSelf
    mkdir axtChrom
    mkdir run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| ~/bin/i386/lavToAxt -dropSelf stdin /iscratch/i/worms/Celegans2/nib \
    /iscratch/i/worms/Celegans2/nib stdout \
| axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/ce2/bed/blastzSelf/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    # check jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check ....

    # only 7 jobs so all completed by para try
    para try, para check ...etc.
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs:        796s      13.27m     0.22h    0.01d  0.000 y
# IO & Wait Time:                   551s       9.18m     0.15h    0.01d  0.000 y
# Average job time:                 192s       3.21m     0.05h    0.00d
# Longest job:                      302s       5.03m     0.08h    0.00d
# Submission to last job:           303s       5.05m     0.08h    0.00d

    # Use Blastz alignments to create Self Chain. Do not create 
    # Self Blastz track since this is redundant with the Self Chain.

# CHAIN SELF BLASTZ (DONE, 2004-05-03, hartera)
# CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim
# The axtChain is best run on the small kluster, or the kki kluster
#       in this case, it was run on the kk kluster
    ssh kk
    mkdir -p /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1
    cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1
    mkdir out chain

    ls -1S /cluster/data/ce2/bed/blastzSelf/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
# Problems running this. At first, all jobs crashed and para problems said that
# the ./out/*.out files were empty. Then copied nib files also to a nib2 
# directory and change in doChain so the 2 nib arguments are not the same.
# Then crashed saying that the ./chain/*.chain files were empty and that the 
# files in nib2 could not be opened (permissions ok). changed doChain back to 
# having the two nib directory arguments the same. Removed and recreated jobList# but left ./chain/*.chain and ./out/*.out 
# This time the jobs ran ok. Also runs ok if run axtFilter followed by 
# axtChain separately on the command line.                                                                                
    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \
    /iscratch/i/worms/Celegans2/nib /iscratch/i/worms/Celegans2/nib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    # 7 jobs so all completed by para try
    para try, check,....etc.
# after remaking the axt files using the -dropSelf option for lavToAxt,
# these jobs no longer crashed.
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs:       2700s      45.00m     0.75h    0.03d  0.000 y
# IO & Wait Time:                   620s      10.34m     0.17h    0.01d  0.000 y
# Average job time:                 474s       7.90m     0.13h    0.01d
# Longest job:                      844s      14.07m     0.23h    0.01d
# Submission to last job:           844s      14.07m     0.23h    0.01d
    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end
    # for chrI
    # chrI
# large values truncated: need 1414 bins or larger binSize than 10000
#   0 ************************************************************ 710660
# 10000 ******** 90988
# 20000 ** 25092
# 30000 * 10405
# 40000  5912
# 50000  2958
# 60000  2157
# 70000  1780
# 80000  1336
# 90000  900
# 100000  636
# 110000  579
# 120000  451
# 130000  403
# 140000  351
# 150000  337
# 160000  287
# 170000  210
# 180000  88
# 190000  129
# 200000  147
# 210000  80
# 220000  60
# 230000  73
# 240000  42
    # trim to minScore=20000 to cut some of the fluff
    mkdir chainFilt
    foreach f (chain/*.chain)
      chainFilter -minScore=20000 $f > chainFilt/$f:t
    end
 
    # Load chains into database 
    # next machine
    ssh hgwdev
    cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/chainFilt
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain ce2 ${c}_chainSelf $i
        echo done $c
    end

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-04-21, hartera)
    ssh kksilo
    cd /cluster/data/ce2
    #- Build the .zip files
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
# chrom AGP's
zip -j zip/chromAgp.zip ./sangerFa/[0-9A-Z]*/chr*.agp
# chrom RepeatMasker out files
zip -j zip/chromOut.zip ./RMRun/chr*.fa.out
# soft masked chrom fasta
zip -j zip/chromFa.zip ./softMask/chr*.fa
# hard masked chrom fasta
zip -j zip/chromFaMasked.zip hardMask/chr*.fa
# chrom TRF output files
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMask/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=ce2 -native GenBank mrna \
        /cluster/data/ce2/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=ce2 -xeno GenBank mrna \
        /cluster/data/ce2/zip/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=ce2 -native refseq mrna \
/cluster/data/ce2/zip/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=ce2 -native GenBank est \
/cluster/data/ce2/zip/est.fa

cd /cluster/data/ce2/zip
# zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
zip -j mrna.zip mrna.fa
zip -j xenoMrna.zip xenoMrna.fa
zip -j refMrna.zip refMrna.fa
zip -j est.zip est.fa
'_EOF_'
    # << this line makes emacs coloring happy
    csh ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.

    # Make upstream files and Copy the .zip files to 
    # hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/ce2/zip
    # make upstream files for C. elegans RefSeq
    featureBits ce2 refGene:upstream:1000 -fa=upstream1000.fa
    zip upstream1000.zip upstream1000.fa
    featureBits ce2 refGene:upstream:2000 -fa=upstream2000.fa
    zip upstream2000.zip upstream2000.fa
    featureBits ce2 refGene:upstream:5000 -fa=upstream5000.fa
    zip upstream5000.zip upstream5000.fa

    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test
    
    set gp = /usr/local/apache/htdocs/goldenPath/ce2
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f ( ../sangerFa/chr*.fa )
      zip -j $gp/chromosomes/$f:t.zip $f
    end
    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    cd $gp/chromosomes
    md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's

# MAKE VSCB1 DOWNLOADS (DONE, 2004-05-11, hartera)
    # DO THIS LATER
    ssh kksilo
    cd /cluster/data/ce2/bed/blastz.cb1
    zip -j /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom/chr*.axt
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
    cp all.chain cb1.chain
    zip -j /cluster/data/ce2/zip/cb1.chain.zip cb1.chain
    rm cb1.chain
    zip -j /cluster/data/ce2/zip/cb1.net.zip cb1.net
    # add axtNets
    cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
    foreach f (I II III IV V X M)
        cp chr${f}.axt chr{$f}.axtNet.axt
    end
    zip -j /cluster/data/ce2/zip/axtNetCb1.zip chr*.axtNet.axt
    rm *.axtNet.axt

    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
    cd /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
    mv /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom.zip
    mv /cluster/data/ce2/zip/cb1*.zip .
    mv /cluster/data/ce2/zip/axtNetCb1.zip .
    md5sum *.zip > md5sum.txt

    # Copy over & edit README.txt w/pointers to chain, net formats.

# MAKE VSSELF DOWNLOADS (DONE, 2004-05-11, hartera)
    ssh kksilo
    cd /cluster/data/ce2/bed/blastSelf
    zip -j /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom/chr*.axt
    cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30
    cp all.chain self.chain
    zip -j /cluster/data/ce2/zip/self.chain.zip self.chain
    rm self.chain
   
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
    cd /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
    mv /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom.zip
    mv /cluster/data/ce2/zip/self.chain.zip .
    md5sum *.zip > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain formats. 

#  miRNA track (DONE - 2004-05-04 - Hiram)
    #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
    #	and Michel.Weber@ibcg.biotoul.fr
    #	notify them if this assembly updates to renew this track
    ssh hgwdev
    mkdir /cluster/data/ce2/bed/miRNA
    cd /cluster/data/ce2/bed/miRNA
    wget --timestamping \
    "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/cel_ws120.*"
    grep -v "^track " cel_ws120.bed | sed -e "s/ /\t/g" > ce2.bed
    hgLoadBed ce2 miRNA ce2.bed
    # entry in trackDb/trackDb.ra already there
    #	Compare with Ce1:
    #	featureBits ce1 miRNA
    #	10329 bases of 100277784 (0.010%) in intersection
    #	featureBits ce2 miRNA
    #	11382 bases of 100291761 (0.011%) in intersection

# MAKE ALT. SPLICING TRACK (DONE, 2004-05-13, hartera)
    # create rnaCluster table and Gene Bounds track
    ssh hgwdev
    mkdir /cluster/data/ce2/bed/rnaCluster
    cd /cluster/data/ce2/bed/rnaCluster
    foreach f (I II III IV V X)
        echo "clusterRna ce2 /dev/null chr${f}.bed -chrom=${f}"
        clusterRna ce2 /dev/null chr${f}.bed -chrom=chr${f}
    end
    hgLoadBed ce2 rnaCluster *.bed

    hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
    (perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce2 | egrep -v chromStart | cut -f2\-13 > $_.ce2.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh

    chmod a+x rnaCluster.sh
    rnaCluster.sh
    mkdir -p /cluster/data/ce2/bed/altSplice/agx
    cd /cluster/data/ce2/bed/altSplice

    hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
    perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce2 -beds=../rnaCluster/$_.ce2.rnaCluster.bed -agxOut=agx/$_.ce2.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec
    # these jobs are no big deal on the worm, just run this file:
    chmod +x altSplice.para.spec
    mkdir agx
    ./altSplice.para.spec
# 0 genePredictions with 2610 clusters, 0 cassette exons, 0 of are not mod 3.
    cat agx/*.agx > altSplice.agx
    # UP TO HERE, make agxToBed and use local copy 
    ~/bin/i386/agxToBed altSplice.agx altSplice.bed

    cat << '_EOF_' >> altGraphX.sql
CREATE TABLE altGraphX (
  bin smallint unsigned not null,
  tName varchar(255) NOT NULL default '',
  tStart int(11) NOT NULL default '0',
  tEnd int(11) NOT NULL default '0',
  name varchar(255) NOT NULL default '',
  id int(10) unsigned NOT NULL auto_increment,
  strand char(2) NOT NULL default '',
  vertexCount int(10) unsigned NOT NULL default '0',
  vTypes longblob NOT NULL,
  vPositions longblob NOT NULL,
  edgeCount int(10) unsigned NOT NULL default '0',
  edgeStarts longblob NOT NULL,
  edgeEnds longblob NOT NULL,
  evidence longblob NOT NULL,
  edgeTypes longblob NOT NULL,
  mrnaRefCount int(11) NOT NULL default '0',
  mrnaRefs longblob NOT NULL,
  mrnaTissues longblob NOT NULL,
  mrnaLibs longblob NOT NULL,
  PRIMARY KEY  (id),
  KEY tName (tName(16),tStart,tEnd)
);
'_EOF_'

    hgLoadBed -sqlTable=altGraphX.sql ce2 altGraphX altSplice.agx

    # trackDb entry:
        track altGraphX
        shortLabel Alt-Splice
        longLabel Alternative Splicing
        group rna
        priority 74.1
        visibility pack
        type bed 3
        canPack off

##################################################################
# Create frames based page views of alt splice areas of interest
#	(DONE - 2004-05-16 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/ce2/bed/altSplice/altAnalysis
    cd /cluster/data/ce2/bed/altSplice/altAnalysis
    altAnalysis ce2 ../altSplice.agx altSummary.txt links.html \
	frame.html -bedName=all
    # creates files, (as well as several .bed files):
    #	-rw-rw-r--    1      282 May 16 09:19 frame.html
    #	-rw-rw-r--    1   391029 May 16 09:19 links.html
    #	Copy these to the Intronerator WS120 location, change
    #	default opening location:

    cp -p links.html /usr/local/apache/htdocs/IntronWS120
    sed -e \
    "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
	-e "s/NM_005487/chrII:14689493-14690232/" \
	-e "s/hgwdev-sugnet/genome-test/" \
	 frame.html > /usr/local/apache/htdocs/IntronWS120/frame.html

    #	Better location is in the goldenPath itself 2006-01-12:
    mkdir /usr/local/apache/htdocs/goldenPath/ce2/altGraphX
    sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \
	> /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/links.html

    sed -e \
    "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
	-e "s/NM_005487/chrII:14689493-14690232/" \
	-e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \
	 frame.html \
	    > /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/frame.html

# MAKE single coverage BEST track from blastz axt's (DONE - 2004-05-11 - Hiram)
#	This data is for use with the phyloHMM construction, below.
    ssh eieio
    cd /cluster/data/ce2
    cat << '_EOF_' > jkStuff/mkBest.sh
#!/bin/sh

mkdir axtBest
mkdir pslBest

for c in I II III IV V X M
do
    echo "axtBesting chr$c"
    /cluster/bin/i386/axtBest axtChrom/chr${c}.axt chr${c} \
        axtBest/chr${c}.axt -minScore=300
    echo "processing chr${c}.axt -> chr${c}_blastzBestCb1.psl"
    /cluster/bin/i386/axtToPsl axtBest/chr${c}.axt \
        S1.len S2.len pslBest/chr${c}_blastzBestCb1.psl
    echo "Done: chr${c}_blastzBestRn3.psl"
done
'_EOF_'
    chmod +x jkStuff/mkBest.sh
    cd /cluster/data/ce2/bed/blastz.cb1
    time ../../jkStuff/mkBest.sh
    #	real    1m47.069s
    #	user    1m1.880s
    #	sys     0m14.240s

    #	Load this table
    ssh hgwdev
    cd /cluster/data/ce2/bed/blastz.cb1/pslBest
    time /cluster/bin/i386/hgLoadPsl ce2 -table=blastzBestCb1 \
	chr*_blastzBestCb1.psl
    #	real    0m21.775s
    #	user    0m8.510s
    #	sys     0m1.500s

# MAKE phyloHMM data
# (redone using new phastCons, acs 9/14/04)
# (redone using nets rather than axtBest, acs 10/10/04)
    ssh hgwdev
    mkdir /cluster/data/ce2/bed/pHMM
    cd /cluster/data/ce2/bed/pHMM
    mkdir axt
    cd axt
    ln -s /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/*.axt .
    cd ..
    mkdir maf
    #	create maf files from axtBest axt files
for c in I II III IV M V X
do
axtToMaf axt/chr${c}.axt tSizes qSizes maf/chr${c}.Tmaf
sed -e "s/chrUn/cb1.chrUn/" maf/chr${c}.Tmaf | sed -e "s/ chr/ ce2.chr/" > \
        maf/chr${c}.maf
rm -f maf/chr${c}.Tmaf
echo "done chr${c}"
done

    # split up alignments; no need to use cluster with worm
mkdir -p /cluster/bluearc/ce2/phastCons/WINDOWS
mkdir -p /scratch/phastCons.ce2.09-14-04.WINDOWS
for i in I II III IV V X M; do \
    msa_split maf/chr${i}.maf -i MAF -M /cluster/data/ce2/sangerFa/chr${i}.fa \
	-w 1000000,0 -r /scratch/phastCons.ce2.09-14-04.WINDOWS/chr${i} -o SS \
	-I 1000 -B 5000 ;\
done
for file in /scratch/phastCons.ce2.09-14-04.WINDOWS/* ; do \
    echo $file ;\
    gzip -c $file > /cluster/bluearc/ce2/phastCons/WINDOWS/`basename $file`.gz ;\
done
rm -rf /scratch/phastCons.ce2.09-14-04.WINDOWS

    # produce rough starting model
msa_view -i MAF --aggregate ce2,cb1 -o SS -z maf/chr*.maf > all.ss
phyloFit -i SS all.ss -o starting-tree
    # also get average GC content of aligned regions
msa_view -i SS all.ss --summary-only
#   descrip.                      A          C          G          T        G+C length   all_gaps  some_gaps
#   all.ss                   0.3130     0.1872     0.1869     0.3129     0.3741   58909311          0   11236473

    # now set up cluster job to estimate model parameters.  See
    # makeHg17.doc for add'l details

cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3741 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 25 --target-coverage 0.45 --quiet --log $2 --estimate-trees $3
EOF
chmod u+x doEstimate.sh
    # (target of .45 is based on target of 0.2 and about 45% alignment
    # coverage: 0.2/0.45 is approx. 0.45)

rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobs.lst
for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do
    root=`basename $f .ss.gz` ;\
    echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
done

    # run cluster job
ssh kk... para create ...
    # (takes about 2 minutes)

    # Now combine parameter estimates.  
ls TREES/*.cons.mod > cons.txt
phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt

    # Now set up cluster job for computing consevation scores and predicted elements
cat << 'EOF' > doPhastCons.sh
#!/bin/sh

mkdir -p /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 25 --target-coverage 0.45 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/ce2/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/ce2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
chmod u+x doPhastCons.sh

rm -fr /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
rm -f jobs2.lst
for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done

    # run cluster job
ssh kk... para create, ...
    # (takes about 30 sec)

    # set up phastConsElements track
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' /cluster/bluearc/ce2/phastCons/ELEMENTS/*.bed > all.raw.bed 
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
hgLoadBed ce2 phastConsElements all.bed
featureBits ce2 phastConsElements
#   20217282 bases of 100291761 (20.158%) in intersection

    # set up wiggle
mkdir wib
for i in I II III IV V X M; do \
    zcat `ls /cluster/bluearc/ce2/phastCons/POSTPROBS/chr${i}.*.pp.gz | sort -t\. -k2,2n` | \
	wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons stdin ;\
done
hgLoadWiggle ce2 phastCons -pathPrefix=/gbdb/ce2/phastCons/wib wib/chr*_phastCons.wig
mkdir -p /gbdb/ce2/phastCons/wib
rm -f /gbdb/ce2/phastCons/wib/chr*phastCons.wib
ln -s /cluster/data/ce2/bed/pHMM/wib/*.wib /gbdb/ce2/phastCons/wib
chmod 775 . wib /gbdb/ce2/phastCons /gbdb/ce2/phastCons/wib
chmod 664 wib/*.wib

    # move postprobs over and clean up bluearc 
rsync -av /cluster/bluearc/ce2/phastCons/POSTPROBS .
    # (people sometimes want the raw scores)
rm -r /cluster/bluearc/ce2/phastCons/ELEMENTS /cluster/bluearc/ce2/phastCons/POSTPROBS 

    # set up full alignment/conservation track
    # note that in this case the pairwise mafs are used both for the
    # "multiple" alignment and the pairwise alignments
mkdir -p /gbdb/ce2/c_briggsae_pwMaf
cd maf
ln -s `pwd`/*.maf /gbdb/ce2/c_briggsae_pwMaf
hgLoadMaf ce2 -warn c_briggsae_pwMaf -pathPrefix=/gbdb/ce2/c_briggsae_pwMaf
chmod 775 /gbdb/ce2/c_briggsae_pwMaf .
chmod 664 *.maf

    # trackDb entry:
#   track c_briggsae_pwMaf
#   shortLabel Conservation
#   longLabel Blastz Alignments with C. Briggsae & Conservation 
#   group compGeno
#   priority 100
#   visibility pack
#   type wigMaf 0.0 1.0
#   maxHeightPixels 100:40:11
#   wiggle phastCons
#   yLineOnOff Off
#   autoScale Off
#   pairwise pwMaf
#   speciesOrder c_briggsae


######## MAKING GENE SORTER TABLES #### (DONE - 2004-05-27 - hartera)
# These are instructions for building the
# neighborhood browser.  Don't start these until
# there is a sangerGene table with a proteinID column containing Swiss-Prot
# protein IDs, and also Kim lab expression data is required (in hgFixed).

    # Cluster together various alt-splicing isoforms.
    # Creates the sangerIsoforms and sangerCanonical tables
    # (DONE, 2004-05-21, hartera)
    ssh hgwdev
    hgClusterGenes ce2 sangerGene sangerIsoforms sangerCanonical
    # You may need to build this binary in src/hg/near/hgClusterGenes
    # Got 20713 clusters, from 23076 genes in 7 chromosomes
    # featureBits ce2 sangerCanonical
    # 54156601 bases of 100291761 (53.999%) in intersection
    # featureBits ce1 sangerCanonical 
    # 53654286 bases of 100277784 (53.506%) in intersection

    # Create Self blastp table - sangerBlastTab (DONE, 2004-05-24, hartera)
    # Reload blastp data into sangerBlastTab and drop knownBlastTab 
    # - table given the wrong name (DONE, 2004-05-28, hartera)
    # Get sangerPep peptide sequences fasta file from sangerPep dir
    # and create a blast database out of them.
    mkdir -p /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    mv /cluster/data/ce2/bed/sangerPep/wormPep120 wormPep120.faa
    
    formatdb -i wormPep120.faa -t wormPep120 -n wormPep120
    # This command is in /projects/compbio/bin/$MACH/formatdb

    # Copy database over to iscratch
    ssh kkr1u00
    mkdir -p mkdir -p /iscratch/i/ce2/blastp
    cp /cluster/data/ce2/bed/blastp/wormPep120.* /iscratch/i/ce2/blastp
    iSync
    # Blastall and Data directory are already in /iscratch/i/blast/
    # From the dates, this looks to be the same versio as in
    # /projects/compbio/bin/i686/
    # Split up fasta file into bite sized chunks for cluster
    cd /cluster/data/ce2/bed/blastp
    mkdir split
    faSplit sequence wormPep120.faa 8000 split/wp

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/ce2/bed/blastp/self
    cd /cluster/data/ce2/bed/blastp/self
    mkdir run
    cd run
    mkdir out

# Make blast script
    cat  << '_EOF_' > blastSome
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep120 \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
     chmod a+x blastSome
     # Make gensub2 file
     cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
     # Create parasol batch
     # 'ls ../../split/*.fa' is too long an argument list, hence the echo

     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub jobList
     para create jobList
     para try
# Wait a minute, and do a para check,  if all is good
# then do a
     para push, para check etc.
# para time
# Completed: 6684 of 6684 jobs
# CPU time in finished jobs:      35657s     594.28m     9.90h    0.41d  0.001 y
# IO & Wait Time:                 27262s     454.37m     7.57h    0.32d  0.001 y
# Average job time:                   9s       0.16m     0.00h    0.00d
# Longest job:                      193s       3.22m     0.05h    0.00d
# Submission to last job:           302s       5.03m     0.08h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/ce2/bed/blastp/self/run/out
    hgLoadBlastTab ce2 sangerBlastTab *.tab

# CREATE EXPRESSION DISTANCE TABLES FOR  
# KIM LAB EXPRESSION DATA (DONE, 2004-05-24, hartera) 
hgExpDistance ce2 hgFixed.kimWormLifeMedianRatio \
              hgFixed.kimWormLifeMedianExps kimExpDistance

# CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES
# sangerToRefSeq (DONE, 2004-05-24, hartera)
hgMapToGene ce2 refGene sangerGene sangerToRefSeq

# CREATE TABLE TO MAP BETWEEN SANGER GENES AND PFAM DOMAINS
# sangerToPfam (DONE, 2004-05-24, hartera)
# Drop table knownToPfam - wrong table name.
# Reload data into sangerToPfam (DONE, 2004-05-28, hartera)
hgMapViaSwissProt ce2 sangerGene name proteinID Pfam sangerToPfam

# CREATE MAPPING OF SANGER GENES TO KIM LAB EXPRESSION DATA GENES 
# sangerToKim (DONE, 2004-05-27, hartera)
# This is actually a mapping of the sangerGene table to itself - 
# this is how this table was created for ce1.
# This means that many gene names (4614/19134) in kimWormLifeAllRatio 
# do not appear in sangerGene but when randomly checking some of these, 
# it was found they are not in WS120 WormBase and probably represent 
# alternate names or names that no longer exist or withdrawn sequences.
hgMapToGene ce2 sangerGene sangerGene sangerToKim

# SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera)
# set hgNearOk to 1 on hgcentraltest to make Ce2 Gene Sorter viewable

hgsql -h hgwbeta -e 'update dbDb set hgNearOk = 1 where name = "Ce2";' \
         hgcentraltest


# MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie)
    ssh hgwdev
    mkdir /cluster/data/ce2/bed/mitopred
    cd /cluster/data/ce2/bed/mitopred
    wget http://mitopred.sdsc.edu/data/cel_30.out
    perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' cel_30.out > mitopred.tab
    cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
    name varchar(10) not null,      # SwissProt ID
    confidence varchar(8) not null, # Confidence level
              #Indices
    PRIMARY KEY(name(6))
);
'_EOF_'
    hgsql ce2 < mitopred.sql
    hgsql ce2 -e 'load data local infile "mitopred.tab" into table mitopred'

# MAKE Human Proteins track
    ssh eieio
    mkdir -p /cluster/data/ce2/blastDb
    cd /cluster/data/ce2/blastDb
    for i in ../sangerFa/*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/ce2/blastDb
    cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
    (~kent/bin/iSync) 2>&1 > sync.out
    
    mkdir -p /cluster/data/ce2/bed/tblastn.hg16KG
    cd /cluster/data/ce2/bed/tblastn.hg16KG
    ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
    exit

    # back to eieio
    cd /cluster/data/ce2/bed/tblastn.hg16KG
    mkdir kgfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc worm.lst | awk "{print \\\$1}"`\)
    # 41117/(150000/578) = 158.437507
    split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2 
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 worm.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/ce2/bed/tblastn.hg16KG
    para create blastSpec
    para shove   # jobs will need to be restarted, but they should all finish
# Completed: 150858 of 150858 jobs
# CPU time in finished jobs:   10404860s  173414.33m  2890.24h  120.43d  0.330 y
# IO & Wait Time:               2468643s   41144.06m   685.73h   28.57d  0.078 y
# Average job time:                  85s       1.42m     0.02h    0.00d
# Longest job:                      411s       6.85m     0.11h    0.00d
# Submission to last job:         26364s     439.40m     7.32h    0.31d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=10000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/ce2/bed/tblastn.hg16KG
    para create chainSpec
    para push
# Completed: 261 of 261 jobs
# CPU time in finished jobs:        828s      13.81m     0.23h    0.01d  0.000 y
# IO & Wait Time:                 19521s     325.34m     5.42h    0.23d  0.001 y
# Average job time:                  78s       1.30m     0.02h    0.00d
# Longest job:                      270s       4.50m     0.07h    0.00d
# Submission to last job:          3878s      64.63m     1.08h    0.04d

    exit
    # back to eieio
    cd /cluster/data/ce2/bed/tblastn.hg16KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/ce2/jkStuff/liftAll.lft warn stdin |  \
    	sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../preblastHg16KG.psl
    cd ..
    blatDir=/cluster/data/hg16/bed/blat.hg16KG
    protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl

    ssh hgwdev
    cd /cluster/data/ce2/bed/tblastn.hg16KG
    hgLoadPsl ce2 blastHg16KG.psl
    exit

    # back to eieio
    rm -rf blastOut

# End tblastn Human

# MAKE Drosophila Proteins track
    ssh eieio
    mkdir -p /cluster/data/ce2/blastDb
    cd /cluster/data/ce2/blastDb
    for i in ../sangerFa/*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/ce2/blastDb
    cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
    (~kent/bin/iSync) 2>&1 > sync.out
    
    mkdir -p /cluster/data/ce2/bed/tblastn.dm1FB
    cd /cluster/data/ce2/bed/tblastn.dm1FB
    ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
    exit

    # back to eieio
    cd /cluster/data/ce2/bed/tblastn.dm1FB
    mkdir fbfa
    split -l 40 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 1 0.1 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2 
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp 
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 worm.lst fb.lst blastGsub blastSpec

    exit
    ssh kk
    cd /cluster/data/ce2/bed/tblastn.dm1FB
    para create blastSpec
    para push
# Completed: 3283 of 3283 jobs
# CPU time in finished jobs:    3063561s   51059.35m   850.99h   35.46d  0.097 y
# IO & Wait Time:                 25333s     422.22m     7.04h    0.29d  0.001 y
# Average job time:                 941s      15.68m     0.26h    0.01d
# Longest job:                     3435s      57.25m     0.95h    0.04d
# Submission to last job:          9387s     156.45m     2.61h    0.11d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    # << for emacs
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/ce2/bed/tblastn.dm1FB
    para create chainSpec
    para push
# Completed: 469 of 469 jobs
# CPU time in finished jobs:        721s      12.02m     0.20h    0.01d  0.000 y
# IO & Wait Time:                  1562s      26.03m     0.43h    0.02d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest job:                       42s       0.70m     0.01h    0.00d
# Submission to last job:          5546s      92.43m     1.54h    0.06d

    exit
    # back to eieio
    cd /cluster/data/ce2/bed/tblastn.dm1FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* |  sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../blastDm1FB.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/ce2/bed/tblastn.dm1FB
    hgLoadPsl ce2 blastDm1FB.psl
    exit

    # back to eieio
    rm -rf blastOut

# End tblastn


# MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
    # First, get ranges of target coverage:
    ssh eieio
    mkdir /cluster/data/ce2/bed/pHMM/coverage
    cd /cluster/data/ce2/bed/pHMM/maf.new.axtNet
    cat *.maf | nice mafRanges -notAllOGap stdin ce2 \
      /cluster/data/ce2/bed/pHMM/coverage/ce2.mafRanges.bed

    ssh hgwdev
    cd /cluster/data/ce2/bed/pHMM/coverage
    # To make subsequent intersections a bit quicker, output a bed with 
    # duplicate/overlapping ranges collapsed:
    nice featureBits ce2 ce2.mafRanges.bed \
      -bed=ce2.mafRangesCollapsed.bed
#43967098 bases of 100291761 (43.839%) in intersection

    # mafCoverage barfs currently, so pass on this for now:
    #cat ../maf.new.axtNet/*.maf \
    #| nice mafCoverage -count=2 ce2 stdin > ce2.mafCoverage

    # Intersect maf target coverage with gene regions -- 
    # use Adam's files which are a union of sangerGene and refGene:
    nice featureBits ce2 -enrichment \
      /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormCds.bed \
      ce2.mafRangesCollapsed.bed \
      -bed=ce2.mafCds.bed
#wormCds.bed 25.701%, ce2.mafRangesCollapsed.bed 43.839%, both 20.701%, cover 80.55%, enrich 1.84x
    nice featureBits ce2 -enrichment \
      /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr3.bed \
      ce2.mafRangesCollapsed.bed \
      -bed=ce2.mafUtr3.bed
#wormUtr3.bed 0.323%, ce2.mafRangesCollapsed.bed 43.839%, both 0.216%, cover 66.86%, enrich 1.53x
    nice featureBits ce2 -enrichment \
      /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr5.bed \
      ce2.mafRangesCollapsed.bed \
      -bed=ce2.mafUtr5.bed
#wormUtr5.bed 0.057%, ce2.mafRangesCollapsed.bed 43.839%, both 0.041%, cover 72.37%, enrich 1.65x


# syntenic net with cb1 (acs, 11/18/04)

    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
    netFilter -syn cb1.net > cb1Syn.net
    netFilter -minGap=10 cb1Syn.net | hgLoadNet ce2 syntenyNetCb1 stdin
    # (add trackDb.ra entry)

# [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 netCb1
# 88355987 bases of 100291761 (88.099%) in intersection
# [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 syntenyNetCb1
# 73427463 bases of 100291761 (73.214%) in intersection
# Hmmm -- looking in browser, seems a bit stringent for worm...

# CLEANUP CB1 BLASTZ (DONE, 2004-02-24, hartera)
    ssh eieio
    cd /cluster/data/ce2/bed/blastz.cb1
    nice rm axtChain/run1/chain/* &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/cb1Filt.net axtChain/*.net &
    nice rm -fr axtChain/chain axtChain/preNet &
    nice rm -rf raw &
    nice rm -rf lav &

# CLEANUP SELF BLASTZ (DONE, 2004-02-24, hartera)
    ssh eieio
    cd /cluster/data/ce2/bed/blastzSelf
    # remove old chains
    nice rm -fr axtChain &
    nice rm axtChain.2004-04-30/run1/chain/* &
    nice gzip axtChrom/* pslChrom/* axtChain.2004-04-30/all.chain axtChain.2004-04-30/chainFilt/* axtChain.2004-04-30/chain/* &
    nice rm -rf raw &
    nice rm -rf lav &


###########################################################################
# LOAD GENEID PREDICTIONS (DONE 3/15/05)
    ssh eieio
    mkdir /cluster/data/ce2/bed/geneid
    cd /cluster/data/ce2/bed/geneid
    foreach chr (chrI chrII chrIII chrIV chrM chrV chrX)
      wget \
  http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.gtf
      wget \
  http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.prot
    end
    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    perl -wpe 's/^(>\S+)/$1.1/' *.prot > geneid.fa
    ssh hgwdev
    cd /cluster/data/ce2/bed/geneid
    ldHgGene -gtf -genePredExt ce2 geneid *.gtf 
    hgPepPred ce2 generic geneidPep geneid.fa

###########################################################################
#	DONE - 2006-01-06 - Hiram
#	Obtained a photograph from Professor Mark Blaxter
#	mark.blaxter@ed.ac.uk - http://nema.cap.ed.ac.uk/index.html
#	Saved image from email to:
#	/cluster/data/ce2/html/C.elegans.jpg
#	-rw-rw-r--    1   227317 Jan  6 09:40 C.elegans.jpg
#	Transform image to a size appropriate for our WEB pages:
    ssh hgwdev
    cd /cluster/data/ce2/html
    convert -normalize -sharpen 0 -geometry 300x200 \
	C.elegans.jpg Caenorhabditis_elegans.jpg
    cp -p Caenorhabditis_elegans.jpg /usr/local/apache/htdocs/images
#	Add IMG tag pointer to this image in the description.html page
#	and for ce1 and ce3


########################################################################
### microRNA targets tracks  (DONE - 2006-03-29 - 2006-04-26 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
    ssh hgwdev
    mkdir /cluster/data/ce2/bed/picTar
    cd /cluster/data/ce2/bed/picTar
    wget --timestamping \
	'http://pictar.bio.nyu.edu/ucsc/new_single_elegans_bed' \
	    -O newSingleElegans.bed

    grep -v "^track" newSingleElegans.bed \
	| hgLoadBed -strict ce2 picTar stdin
    #	Loaded 11987 elements of size 9

    nice -n +19 featureBits ce2 picTar
    #	39233 bases of 100291761 (0.039%) in intersection

########################################################################
# CLEANUP OF CB1 BLASTZ (DONE, 2007-06-25, hartera)
    ssh kkstore02
    cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain
    rm -r cb1NetSplit cb1NetSplit1 tmp1
    cd ..
    rm -r axtChain.2004-04-29test
    # Looks like new chians were made on 2004-04-29 
    cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain.2004-04-29
    # remove the split net and chain directories as these can easily be
    # created from the all.chain and cb1.net files
    rm -r chain cb1NetSplit run1/chain
    # gzip files
    gzip all.chain *.net
    cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/
    rm -r axtNet1
    cd axtNet.2004-04-29
    # remove *.axt files as these are also in the sortedaxtNet directory
    rm *.axt sortedaxtNet/axtNetCb1.tab 
    cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChrom/maf
    gzip sortedaxtNet/*.axt

#######################################################################
## LIFTOVER To Ce4 (DONE - 2007-07-16 Hiram)
    ssh kkr1u00
    cd /cluster/data/ce4
    mkdir nib
    cd nib
    for C in I II III IV V X M
do
    faToNib -softMask ../goldenPath/chromosomes/chr${C}.fa.gz chr${C}.nib
done
    #	Writing 15072419 bases in 7536218 bytes
    #	Writing 15279316 bases in 7639666 bytes
    #	Writing 13783681 bases in 6891849 bytes
    #	Writing 17493784 bases in 8746900 bytes
    #	Writing 20919398 bases in 10459707 bytes
    #	Writing 17718852 bases in 8859434 bytes
    #	Writing 13794 bases in 6905 bytes

    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
    	ce4 /cluster/data/ce4/nib
    # as it says, DO THIS NEXT:
    ssh kk
    #	if bin/scripts is not in your PATH, add it for this command:
    PATH=$PATH:/cluster/bin/scripts \
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
	ce2 /cluster/data/ce2/nib ce4 /iscratch/i/ce4/split10k \
	/cluster/data/ce4/jkStuff/11.ooc
    # as it says, DO THIS NEXT:
    cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/run
    para try, check, push, check, ...
# Completed: 49 of 49 jobs
# CPU time in finished jobs:      14998s     249.97m     4.17h    0.17d  0.000 y
# IO & Wait Time:                   405s       6.74m     0.11h    0.00d  0.000 y
# Average job time:                 314s       5.24m     0.09h    0.00d
# Longest finished job:             706s      11.77m     0.20h    0.01d
# Submission to last job:           728s      12.13m     0.20h    0.01d

    # as it says, DO THIS NEXT:
    #	this does the liftUp and makes the psl files
    #	kkr1u00 is down at this time, fixup this script to work on kkr3u00
    ssh kkr3u00
    cd /cluster/data/ce2/bed
    ln -s blat.ce4.2007-07-16 ce4
    time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh ce2 ce4
    #	real    1m1.780s
    # as it says, DO THIS NEXT:
    #	the prepares the batch to run for the chaining
    ssh kki
    time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
	ce2 /cluster/data/ce2/nib ce4 /cluster/data/ce4/nib
    # as it says, DO THIS NEXT:
    #	running the chain batch
    cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/chainRun
    para try, check, push, check, ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs:         19s       0.31m     0.01h    0.00d  0.000 y
# IO & Wait Time:                    17s       0.29m     0.00h    0.00d  0.000 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:               7s       0.12m     0.00h    0.00d
# Submission to last job:             7s       0.12m     0.00h    0.00d

    ssh kkstore02
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh ce2 ce4
    #	Created /cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz
    # as it says, DO THIS NEXT:
    ssh hgwdev
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh ce2 ce4
    #	It says this:
    # 	Now, add link for
    #	/usr/local/apache/htdocs/goldenPath/ce2/liftOver/ce2ToCe4.over.chain
    #	to hgLiftOver
    #	But I believe that link was already done:
    cd /gbdb/ce2/liftOver
    ls -og ce2ToCe4*
    #	lrwxrwxrwx  1 53 Jun  5 16:35 ce2ToCe4.over.chain.gz ->
    #		/cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
ce2.upstreamGeneTbl = refGene
