# for emacs: -*- mode: sh; -*-


# This describes how to make the sacCer1 browser database.
#
# The genomic sequence was downloaded Nov. 17, 2003, and is dated
# 10/1/2003 on the Stanford FTP site.  The previous version of the
# genomic sequence is dated Nov. 2002.  I don't see version numbers.

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-----------+-----------------+
#  | tableName | type            |
#  +-----------+-----------------+
#  | sgdGene   | genePred sgdPep |
#  +-----------+-----------------+


# Create the directory structure and download sequence from the SGD
# site at Stanford.
    mkdir /cluster/store6/sacCer1
    ln -s /cluster/store6/sacCer1 /cluster/data/sacCer1
    cd /cluster/data/sacCer1
    mkdir 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M bed download
    cd download
    mkdir chromosomes
    cd chromosomes
    foreach i (01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt)
	wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/chromosomes/fasta/chr$i.fsa
    end
    foreach i (1 2 3 4 5 6 7 8 9)
	echo ">chr$i" > chr$i.fa
	grep -v '^>' chr0$i.fsa >> chr$i.fa
    end
    foreach i (chr1?.fsa)
	echo ">$i:r" > $i:r.fa
	grep -v '^>' $i >> $i:r.fa
    end
    echo ">chrM" > chrM.fa
    grep -v '^>' chrmt.fsa > chrM.fa
    foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M)
	mv chr$i.fa ../../$i
    end
    gzip -4 *.fsa

# Download other info from the FTP site.
    wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature
    wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/gene_registry
    wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation
    wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/oracle_schema
    wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/protein_info
    wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence_similarity
    wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/systematic_results
    wget -nH --cut-dirs=5 -r -l 2 ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein

    # Check that the genome is in sync with the annotations
    cd /cluster/data/sacCer1
    checkSgdSync download
    # This showed 5 of 5788 with no ATG.  I sent these to Mike Cherry.

# CREATING DATABASE, MAKE AND LOAD NIBS (DONE 2003-11-24 - Jim)
# NOTE FOR YEAST WE DO NOT REPEAT MASK SEQUENCE.
    ssh hgwdev
    echo 'create database sacCer1' | hgsql ''
    cd /cluster/data/sacCer1
    hgNibSeq sacCer1 /cluster/data/sacCer1/nib */*.fa
    faSize -detailed */*.fa > chrom.sizes
    mkdir -p /gbdb/sacCer1/nib
    ln -s /cluster/data/sacCer1/nib/*.nib /gbdb/sacCer1/nib
    # Create a read-only alias in your .cshrc or .profile
    alias sacCer1 mysql -u hguser -phguserstuff -A sacCer1
    # Use df to ake sure there is at least 5 gig of free mySql table space
    df -h /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-11-21 - Jim)
    ssh hgwdev
    #  the following command copies all the data from the table
    #	grp in the database hg16 to our new database sacCer1
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
      | hgsql sacCer1

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR YEAST (DONE 2003-11-24 Jim)
    # Warning: must genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk) values \
        ("sacCer1", "Oct. 2003", "/gbdb/sacCer1/nib", "Yeast", \
               "chr2:827700-845800", 1, 65, "Yeast", \
                "Saccharomyces cerevisiae", "/gbdb/sacCer1/html/description.html", \
                0);' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'INSERT INTO defaultDb (genome, name) values ("Yeast", "sacCer1");' \
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add sacCer1 in all the right places and do
    make update

    # go public on genome-test
    #make alpha
    cvs commit makefile

    # Add trackDb directories
    mkdir sacCer
    mkdir sacCer/sacCer1
    cvs add sacCer
    cvs add sacCer/sacCer1
    cvs commit sacCer

# MAKE GCPERCENT (DONE 2003-11-24 - Jim)
     ssh hgwdev
     mkdir /cluster/data/sacCer1/bed/gcPercent
     cd /cluster/data/sacCer1/bed/gcPercent
     # create and load gcPercent table
     hgsql sacCer1  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent sacCer1 ../../nib

# RUN TANDEM REPEAT MASKER (DONE 2003-11-24 - Jim)
    # Do tandem repeat masking - this takes about 2 minutes.
    ssh hgwdev
    mkdir -p /cluster/data/sacCer1/bed/simpleRepeat
    cd /cluster/data/sacCer1
    foreach i (? ??)
	trfBig $i/chr$i.fa /dev/null \
	-bedAt=/cluster/data/sacCer1/bed/simpleRepeat/chr$i.bed
    end
    # Load into the database
    cd /cluster/data/sacCer1/bed/simpleRepeat
    hgLoadBed sacCer1 simpleRepeat *.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    # Loaded 1316 elements of size 16
    featureBits sacCer1 simpleRepeat
    # 82600648 bases of 2627444668 (3.144%) in intersection

# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (done)
    ssh hgwdev
    # Write ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html 
    # with a description of the assembly and some sample position queries.  
    chmod a+r ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html
    # Check it in 
    mkdir -p /gbdb/sacCer1/html
    ln -s /cluster/data/sacCer1/html/description.html /gbdb/sacCer1/html/
    # Create line in trackDb/makefile that copies description.html into
    #     /cluster/data/sacCer1/html/description.html  
    # Note, you definitely want to make the symbolic link in /gbdb/sacCer/html
    # before doing this.

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 2003-11-24 Jim)
# AND SET UP BLAT SERVERS
    ssh blat10
    cd /scratch
    mkdir sacCer1Nib
    scp 'hgwdev:/cluster/data/sacCer1/nib/*.nib' sacCer1Nib
    # Ask admins to set up blat servers and ask which ports they assign.
    # 8/26/04: set canPcr=1 for untranslated blat server.
    ssh hgwdev
    echo 'insert into blatServers values("sacCer1", "blat10", 17788, 0, 1); \
          insert into blatServers values("sacCer1", "blat10", 17789, 1, 0);' \
      | hgsql -h genome-testdb hgcentraltest

# CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE 2003-12-02 Jim)
# Note initially the s_cerevisiae.gff3 file ended up being out of
# sync with the genome. Mike Cherry (cherry@genome.stanford.edu)
# regenerated it.  The format may end up changing in the future though.
    ssh hgwdev
    cd /cluster/data/sacCer1/bed
    mkdir sgdGene
    hgSgdGff3 ../download/chromosomal_feature/s_cerevisiae.gff3 sgdGene
    cd sgdGene
    ldHgGene sacCer1 sgdGene codingGenes.gff
    hgLoadBed sacCer1 sgdOther otherFeatures.bed \
        -tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql
    zcat ../../download/orf_protein/*.fasta.gz \
        | hgSgdPep -restrict=genePred.tab stdin sgdPep.fa symbol.txt
    hgPepPred sacCer1 generic sgdPep sgdPep.fa
    echo 'create table sgdToName ( \
          name varchar(10) not null, \
	  value varchar(10) not null, \
	  PRIMARY KEY(name), \
	  INDEX (value));' | hgsql sacCer1
    echo 'load data local infile "symbol.txt" \
          into table sgdToName;' | hgsql sacCer1
    hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdDescription.sql
    echo 'load data local infile "descriptions.txt" \
          into table sgdDescription;' | hgsql sacCer1
    hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
    echo 'load data local infile "notes.txt" \
          into table sgdOtherDescription;' | hgsql sacCer1

# ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE 2003-11-25 Jim)
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/sgdGene
    awk '$2 == "SwissProt" {printf("%s\t%s\n", $3, $1);}' \
        ../../download/chromosomal_feature/external_id.tab \
	> sgdToSwissProt.txt
    echo 'create table sgdToSwissProt ( \
          name varchar(10) not null, \
	  value varchar(10) not null, \
	  PRIMARY KEY(name), \
	  INDEX (value));' | hgsql sacCer1
    echo 'load data local infile "sgdToSwissProt.txt" \
          into table sgdToSwissProt;' | hgsql sacCer1
    hgProtIdToGenePred sacCer1 sgdGene sgdToSwissProt name value

# CREATE SGD-BASED CLONE TRACK (DONE 2003-11-25 Jim)
    ssh hgwdev
    cd /cluster/data/sacCer1/bed
    mkdir sgdClone
    cd sgdClone
    awk -F '\t' '{printf("chr%s\t%d\t%d\t%s\t%s\n", $3, $4-1, $5, $2, $1);}' \
    	../../download/chromosomal_feature/clone.tab > sgdClone.bed
    hgLoadBed sacCer1 sgdClone  sgdClone.bed -tab \
        -sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql

# AUTO UPDATE GENBANK MRNA RUN  (Done 2003-11-24 Jim)

    # Put the nib's on /cluster/bluearc:
    ssh eieio
    mkdir -p /cluster/bluearc/sacCer/sacCer1/nib
    cp -pR /cluster/data/sacCer1/nib/*.nib /cluster/bluearc/sacCer/sacCer1/nib

    # Instructions for setting up incremental genbank updates are here:
    # http://www.soe.ucsc.edu/~markd/genbank-update/doc/initial-load.html

    # Added
static char *sacCerNames[] = {"Saccharomyces cerevisiae", NULL};
    and
{"dm", dmNames, NULL},
    to appropriate parts of src/hg/makeDb/genbank/src/lib/gbGenome.c
    # Then make and make install
    cd src/hg/make/makeDb/genbank
    make PREFIX=/cluster/store5/genbank install

    # Edit /cluster/data/genbank/etc/genbank.conf and add:
# sacCer1
sacCer1.genome = /cluster/bluearc/sacCer/sacCer1/nib/chr*.nib
sacCer1.lift = no
sacCer1.genbank.mrna.xeno.load = no
sacCer1.genbank.est.xeno.load = no
sacCer1.downloadDir = sacCer1

    # Do the alignments
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -iserver=no \
      -clusterRootDir=/cluster/bluearc/genbank \
      -verbose=1 -initial sacCer1

    # Load the results from the above
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad sacCer1

# DOING MULTIPLE ALIGNMENT WITH OTHER YEAST  (Done Jim)
# Grab sequence from all six yeast and massage
# it so that fasta records all start with sacXXX
# in the directory /cluster/data/sacCer1/bed/otherYeast/align
       ssh kkr1u00
       cd /cluster/data/sacCer1/bed/otherYeast/align

       # Create directory full of size info
       mkdir sizes
       foreach i (sac*)
           faSize $i -detailed > sizes/$i
       end

       # Create some working directories
       mkdir lav axtAll axtBest mafIn mafRawOut mafOut

       # Create little shell script to do blastz alignments
       cat > bz << end
#!/bin/csh -ef
blastz \$1 \$2 > \$3
end
       chmod a+x bz

       # Create job spec to do all blastz alignments
       ls -1 ../../../*/chr*.fa > sacCer.lst
       ls -1 sac??? > other.lst
       cat > gsub << end
#LOOP
bz \$(path1) \$(path2) {check out line+ lav/\$(root1).\$(root2)}
#ENDLOOP
end
       gensub2 sacCer.lst other.lst gsub spec

       # Do parasol blastz run
       para create spec
       para try
       # Do some para checks and if all is well
       para push
#Completed: 102 of 102 jobs
#CPU time in finished jobs:      43279s     721.31m    12.02h    0.50d  0.001 y
#IO & Wait Time:                   540s       9.00m     0.15h    0.01d  0.000 y
#Average job time:                 430s       7.16m     0.12h    0.00d
#Longest job:                     1458s      24.30m     0.41h    0.02d
#Submission to last job:          3994s      66.57m     1.11h    0.05d

       # Convert from lav to axt
       cd lav
       foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
           foreach j (*.$i)
	       lavToAxt $j ../../../../nib ../$i ../axtAll/$j -fa
	   end
	   echo done $i
       end
       cd ..

       # Run axtBest
       cd axtAll
       foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
           foreach j (*.$i)
	       axtBest $j $j:r ../axtBest/$j
	   end
	   echo done $i
       end
       cd ..

       # Convert to maf
       cd axtBest
       foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
           foreach j (*.$i)
	       axtToMaf $j ../../../../chrom.sizes ../sizes/$i ../mafIn/$j -tPrefix=sacCer1.
	   end
	   echo done $i
       end
       cd ..


       # Run multiz
       cd mafIn
       set mz = /cluster/bin/penn/tba.9.13/multiz
       foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M)
          set c = chr$i
	  $mz $c.sacPar $c.sacMik - > ../mafRawOut/$c.maf
	  echo done $c.sacMik
	  foreach s (sacKud sacBay sacCas sacKlu)
	      $mz $c.$s ../mafRawOut/$c.maf - > ../mafRawOut/tmp
	      mv ../mafRawOut/tmp ../mafRawOut/$c.maf
	      echo done $c.$s
	  end
       end
       cd ..

       #Load into database
       ssh hgwdev
       cd /cluster/data/sacCer1/bed/otherYeast/align
       ln -s /cluster/store6/sacCer1/bed/otherYeast/align/mafOut /gbdb/sacCer1/multizYeast
       hgLoadMaf sacCer1 multizYeast

       # Clean up
       cd mafRawOut
       foreach i (*.maf)
           mafFilter -minScore=100 $i > ../mafOut/$i
       rm -r axtAll axtBest lav err mafRawOut multizYeast.tab


# ADDING LOCALIZATION AND ABUNDANCE INFO FROM SGD AND 
# http://yeastgfp.ucsf.edu  				(DONE 2003-11-24 - Jim)
	ssh hgwdev
	cd /cluster/data/sacCer1/bed
	mkdir sgdLocalization
	cd sgdLocalization
	hgSgdGfp sacCer1 ../../download/systematic_results/localization/OS*.tab .

# UPDATE GO DATABASE  (DONE -Jim)
    ssh hgwdev
    cd /cluster/store1/geneOntology
    mkdir 20031202
    cd 20031202
    wget ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest/go_200311-termdb-data.gz
    wget ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_sptr.gz
    hgsql mysql <<end
    drop database go;
    create database go;
    end
    zcat go_*data.gz | hgsql go
    zcat gene_association.goa_sptr.gz | hgGoAssociation go goaPart stdin -taxon=9606,10090,10116,6239,7227,4932

######## MAKING FAMILY BROWSER TABLES #######

# GETTING STARTED WITH FAMILY BROWSER (DONE 2003-11-29 Jim)
    # Making tables to cluster splice varients
    ssh hgwdev
    hgClusterGenes sacCer1 sgdGene sgdIsoforms sgdCanonical

    # Make self mapping table for expression. 
    echo 'create table sgdToSgd ( \
          name varchar(10) not null, \
	  value varchar(10) not null, \
	  PRIMARY KEY(name), \
	  UNIQUE (value));' | hgsql sacCer1
    echo 'select name from sgdGene;' | hgsql -N sacCer1 \
         | awk '{printf("%s\t%s\n", $1, $1);}' > sgdToSgd.tab
    echo 'load data local infile "sgdToSgd.tab" into table sgdToSgd' \
         | hgsql sacCer1

    # Make expression similarity table. 
    cd ~/src/hg/near/hgExpDistance
    hgExpDistance sacCer1 hgFixed.yeastChoCellCycle hgFixed.yeastChoCellCycleExps choExpDistance 


#DOING SELF-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
    # Extract peptides from genome database into fasta file
    # and create a blast database out of them. 
    mkdir -p /cluster/data/sacCer1/bed/blastp
    cd /cluster/data/sacCer1/bed/blastp
    pepPredToFa sacCer1 sgdPep sgdPep.faa
    formatdb -i sgdPep.faa -t sgdPep -n sgdPep
    cd ..

    # Copy over database to /scratch
    ssh kk
    mkdir -p /scratch/sacCer1/blastp
    cp /cluster/data/sacCer1/bed/blastp/sgdPep.p* /scratch/sacCer1/blastp
    sudo /cluster/install/utilities/updateLocal

    # Split up fasta file into bite sized chunks for cluster
    cd /cluster/data/sacCer1/bed/blastp
    mkdir split
    faSplit sequence sgdPep.faa 6000 split/kg

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/sacCer1/bed/blastp/self
    cd /cluster/data/sacCer1/bed/blastp/self
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /scratch/blast/data
/scratch/blast/blastall -p blastp -d /scratch/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1000
end
    chmod a+x blastSome

    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

    # Create parasol batch
    echo ../../split/*.fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try

    # Wait a couple of minutes, and do a para check,  if all is good
    # then do a
    para push
    # This should finish in 1 minute if the cluster is free.
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs:       3570s      59.50m     0.99h    0.04d  0.000 y
#IO & Wait Time:                 14973s     249.55m     4.16h    0.17d  0.000 y
#Average job time:                   3s       0.05m     0.00h    0.00d
#Longest job:                       12s       0.20m     0.00h    0.00d
#Submission to last job:            55s       0.92m     0.02h    0.00d

    # Load into database.  This takes a minute 
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blastp/self/run/out
    hgLoadBlastTab sacCer1 sgdBlastTab *.tab
#Scanning through 5743 files
#Loading database with 52725 rows


#DOING C.ELEGANS-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/sacCer1/bed/blastp/ce1
    cd /cluster/data/sacCer1/bed/blastp/ce1
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/ce1/blastp/wormPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome

    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

    # Create parasol batch
    echo ../../split/*.fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try

    # Wait a couple of minutes, and do a para check,  if all is good
    # then do a
    para push
Completed: 5743 of 5743 jobs
#CPU time in finished jobs:       9397s     156.62m     2.61h    0.11d  0.000 y
#IO & Wait Time:                 21849s     364.15m     6.07h    0.25d  0.001 y
#Average job time:                   5s       0.09m     0.00h    0.00d
#Longest job:                       26s       0.43m     0.01h    0.00d
#Submission to last job:            75s       1.25m     0.02h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blastp/ce1/run/out
    hgLoadBlastTab sacCer1 ceBlastTab -maxPer=1 *.tab

#DOING MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
    # Make mouse ortholog column using blastp on mouse known genes.
    # First make mouse protein database and copy it to iscratch/i
    # if it doesn't exist already
    cd /cluster/data/mm4/bed
    mkdir blastp
    cd blastp
    pepPredToFa mm4 knownGenePep known.faa
    formatdb -i known.faa -t known -n known
    ssh kkr1u00
    if (-e /iscratch/i/mm4/blastp) then
       rm -r /iscratch/i/mm4/blastp
    endif
    mkdir -p /iscratch/i/mm4/blastp
    cp /cluster/data/mm4/bed/blastp/known.p?? /iscratch/i/mm4/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/sacCer1/bed/blastp/mm4
    cd /cluster/data/sacCer1/bed/blastp/mm4
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/mm4/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome

    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

    # Create parasol batch
    echo ../../split/*.fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try

    # Wait a couple of minutes, and do a para check,  if all is good
    # then do a
    para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs:      13913s     231.88m     3.86h    0.16d  0.000 y
#IO & Wait Time:                 27267s     454.45m     7.57h    0.32d  0.001 y
#Average job time:                   7s       0.12m     0.00h    0.00d
#Longest job:                       40s       0.67m     0.01h    0.00d
#Submission to last job:            80s       1.33m     0.02h    0.00d


    # Load into database.  
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blastp/mm4/run/out
    hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab

#DOING ZEBRAFISH-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already
    cd /cluster/data/dr1/bed
    mkdir blastp
    cd blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH2.pep.fa.gz 
    zcat Dan*.pep.fa.gz > ensembl.faa
    echo "Translation:" > subs.in
    subs -e ensembl.faa > /dev/null
    formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    if (-e /iscratch/i/dr1/blastp) then
       rm -r /iscratch/i/dr1/blastp
    endif
    mkdir -p /iscratch/i/dr1/blastp
    cp /cluster/data/dr1/bed/blastp/ensembl.p?? /iscratch/i/dr1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/sacCer1/bed/blastp/dr1
    cd /cluster/data/sacCer1/bed/blastp/dr1
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/dr1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome

    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

    # Create parasol batch
    echo ../../split/*.fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try

    # Wait a couple of minutes, and do a para check,  if all is good
    # then do a
    para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs:      11135s     185.58m     3.09h    0.13d  0.000 y
#IO & Wait Time:                 23501s     391.69m     6.53h    0.27d  0.001 y
#Average job time:                   6s       0.10m     0.00h    0.00d
#Longest job:                       40s       0.67m     0.01h    0.00d
#Submission to last job:            71s       1.18m     0.02h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blastp/dr1/run/out
    hgLoadBlastTab sacCer1 drBlastTab -maxPer=1 *.tab

#DOING FRUITFLY-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
    # Make Drosophila melanagaster ortholog column using blastp on FlyBase.
    # First make SwissProt protein database and copy it to iscratch/i
    # if it doesn't exist already
    cd /cluster/data/dm1/bed
    mkdir blastp
    cd blastp
    pepPredToFa dm1 bdgpGenePep bdgp.faa
    formatdb -i bdgp.faa -t bdgp -n bdgp
    ssh kkr1u00
    if (-e /iscratch/i/dm1/blastp) then
       rm -r /iscratch/i/dm1/blastp
    endif
    mkdir -p /iscratch/i/dm1/blastp
    cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
    iSync


    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/sacCer1/bed/blastp/dm1
    cd /cluster/data/sacCer1/bed/blastp/dm1
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome

    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

    # Create parasol batch
    echo ../../split/*.fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try

    # Wait a couple of minutes, and do a para check,  if all is good
    # then do a
    para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs:       9749s     162.49m     2.71h    0.11d  0.000 y
#IO & Wait Time:                 22247s     370.78m     6.18h    0.26d  0.001 y
#Average job time:                   6s       0.09m     0.00h    0.00d
#Longest job:                       28s       0.47m     0.01h    0.00d
#Submission to last job:            64s       1.07m     0.02h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blastp/dm1/run/out
    hgLoadBlastTab sacCer1 dmBlastTab -maxPer=1 *.tab

#DOING HUMAN-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
    # Make human ortholog column using blastp on human known genes.
    # First make human protein database and copy it to iscratch/i
    # if it doesn't exist already
    cd /cluster/data/hg16/bed
    mkdir blastp
    cd blastp
    pepPredToFa hg16 knownGenePep known.faa
    formatdb -i known.faa -t known -n known
    ssh kkr1u00
    if (-e /iscratch/i/hg16/blastp) then
       rm -r /iscratch/i/hg16/blastp
    endif
    mkdir -p /iscratch/i/hg16/blastp
    cp /cluster/data/hg16/bed/blastp/known.p?? /iscratch/i/hg16/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/sacCer1/bed/blastp/hg16
    cd /cluster/data/sacCer1/bed/blastp/hg16
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/hg16/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome

    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # << this line makes emacs coloring happy

    # Create parasol batch
    echo ../../split/*.fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try

    # Wait a couple of minutes, and do a para check,  if all is good
    # then do a
    para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs:      16096s     268.27m     4.47h    0.19d  0.001 y
#IO & Wait Time:                 29943s     499.05m     8.32h    0.35d  0.001 y
#Average job time:                   8s       0.13m     0.00h    0.00d
#Longest job:                       65s       1.08m     0.02h    0.00d
#Submission to last job:           100s       1.67m     0.03h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blastp/hg16/run/out
    hgLoadBlastTab sacCer1 hgBlastTab -maxPer=1 *.tab

# LOADING ERAN SEGAL'S REGULATORY MODULE STUFF  (DONE 2004-9-22 -Jim)
    cd /cluster/data/sacCer1/bed
    mkdir eranSegal
    cd eranSegal
    # Copy module_assignments.tab, motif_attributes.tab, motif_modules.tab
    # from email from <eran@cs.stanford.edu> into this directory.
    echo "select name,chrom,txStart,txEnd,strand,500,0 from sgdGene;" | \
        hgsql -N sacCer1 > gene_position.tab
    echo "select name,chrom,chromStart,chromEnd,strand,500,0 from sgdOther;" | \
        hgsql -N sacCer1 >> gene_position.tab
    hgLoadEranModules sacCer1 module_assignments.tab motif_modules.tab \
    	modules_motifs.gxm modules_motif_positions.tab gene_position.tab

# LOADING REGULATORY CODE STUFF (In Progress 2004-9-22 -Jim)
    cd /cluster/data/sacCer1/bed
    mkdir harbisonGordon
    cd harbisonGordon

    # Create growthCondition.tab by editing by hand
    # http://jura.wi.mit.edu/young_public/regulatory_code/GrowthEnvironments.html
    # and then load:
    hgsql sacCer1 < $HOME/kent/src/hg/lib/growthCondition.sql
    hgsql sacCer1 -e 'load data local infile "growthCondition.tab" into table growthCondition'

    # Get GFF files and motif file from downloads section of 
    # http://jura.wi.mit.edu/fraenkel/regulatory_map
    # Also get v24_probes_041004.GFF from Ben Gordon dbgordon@wi.mit.edu via email.
    # Get Conditions_Summary.txt also from Ben from email.

    sort v24_probes_041004.GFF | uniq > probes.GFF
    hgYeastRegCode motifGff Final_InTableS2_v24.motifs probes.GFF Conditions_Summary.txt\
    	transRegCode.bed transRegCodeMotif.tab transRegCodeProbe.bed transRegCodeCondition.tab
    hgLoadBed sacCer1 transRegCode transRegCode.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql
    hgLoadBed sacCer1 transRegCodeProbe transRegCodeProbe.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab
    hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql
    hgsql sacCer1 -e 'load data local infile "transRegCodeCondition.tab" into table transRegCodeCondition'
    hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql
    hgsql sacCer1 -e 'load data local infile "transRegCodeMotif.tab" into table transRegCodeMotif'
    # Get rid of some crufty motif placements Ben Gordon doesn't like anymore that aren't in the
    # transRegCodeMotif table.
    hgsql sacCer1 -e 'delete from transRegCode where name="CRZ1" or name="ECM22" or name="SFL1"'
    # Trim some motif columns
    fixHarbisonMotifs sacCer1

#CREATING DOWNLOADS DIRECTORY
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath
    mkdir sacCer1
    cd sacCer1
    mkdir bigZips database
    cd bigZips
    zip -j chromFa.zip /cluster/data/sacCer1/*/chr*.fa
    zip -j multizYeast.zip /gbdb/sacCer1/multizYeast/*.maf
    # Create a README.txt file here.


# MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie)
    ssh hgwdev
    mkdir /cluster/data/sacCer1/bed/mitopred
    cd /cluster/data/sacCer1/bed/mitopred
    wget http://mitopred.sdsc.edu/data/yst_30.out
    perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' yst_30.out > mitopred.tab
    cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
    name varchar(10) not null,      # SwissProt ID
    confidence varchar(8) not null, # Confidence level
              #Indices
    PRIMARY KEY(name(6))
);
'_EOF_'
    # << this line makes emacs coloring happy
    hgsql sacCer1 < mitopred.sql
    hgsql sacCer1 -e 'load data local infile "mitopred.tab" into table mitopred'

# MAKE Human Proteins track (DONE 2004-08-25 braney)
    ssh kksilo
    mkdir -p /cluster/data/sacCer1/blastDb
    cd /cluster/data/sacCer1/blastDb
    for i in ../*/*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/sacCer1/blastDb
    cp /cluster/data/sacCer1/blastDb/* /iscratch/i/sacCer1/blastDb
    (~kent/bin/iSync) 2>&1 > sync.out
    
    mkdir -p /cluster/data/sacCer1/bed/tblastn.hg16KG
    cd /cluster/data/sacCer1/bed/tblastn.hg16KG
    ls -1S /iscratch/i/sacCer1/blastDb/*.nsq | sed "s/\.nsq//" > yeast.lst
    exit

    # back to kksilo
    cd /cluster/data/sacCer1/bed/tblastn.hg16KG
    mkdir kgfa
    ls -1S /iscratch/i/squirt/ci1/kgfas/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	    exit 0
	fi
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 yeast.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/sacCer1/bed/tblastn.hg16KG
    para create blastSpec
    para try, push

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7500 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/sacCer1/bed/tblastn.hg16KG
    para create chainSpec
    para push

    exit
    # back to kksilo
    cd /cluster/data/sacCer1/bed/tblastn.hg16KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../blastHg16KG.psl

    ssh hgwdev
    cd /cluster/data/sacCer1/bed/tblastn.hg16KG
    hgLoadPsl sacCer1 blastHg16KG.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn

# BLAT SGD predicted sacCer1 proteins against sacCer1 (DONE 2004-08-31 braney)
    ssh hgwdev
    cd /cluster/data/sacCer1/bed
    mkdir blat.sacCer1SG
    cd  blat.sacCer1SG
    pepPredToFa sacCer1 sgdPep sacCer1SG.fa
    hgPepPred sacCer1 generic blastSGPep00 sacCer1SG.fa
    ssh kk
    cd /cluster/data/sacCer1/bed/blat.sacCer1SG
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /cluster/bluearc/sacCer/sacCer1/nib/*.nib > yeast.lst
    mkdir sgfas
    cd sgfas
    faSplit sequence ../sacCer1SG.fa 1000 sg
    cd ..
    ls -1S sgfas/*.fa > sg.lst
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    gensub2 yeast.lst sg.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../yeast.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push

# Completed: 16490 of 16490 jobs
# CPU time in finished jobs:      52796s     879.94m    14.67h    0.61d  0.002 y
# IO & Wait Time:                 44955s     749.25m    12.49h    0.52d  0.001 y
# Average job time:                   6s       0.10m     0.00h    0.00d
# Longest job:                       22s       0.37m     0.01h    0.00d
# Submission to last job:           714s      11.90m     0.20h    0.01d

    ssh kksilo
    cd /cluster/data/sacCer1/bed/blat.sacCer1SG
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null
    pslMaxMap -maxMap=1 cov90.psl sacCer1SG.psl
    pslxToFa sacCer1SG.psl sacCer1SG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
    sgName sacCer1 sacCer1SG.psl blastSGRef00
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blat.sacCer1SG
    hgsql sacCer1 < ~/kent/src/hg/lib/blastRef.sql
    echo "rename table blastRef to blastSGRef00" | hgsql sacCer1
    echo "load data local infile 'blastSGRef00' into table blastSGRef00" | hgsql sacCer1

# PHASTCONS AND PHASTCONS ELEMENTS (acs, 9/15/04)
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/otherYeast
    mkdir phastCons
    cd phastCons

    # split up alignments; no need to use cluster here
    MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut
    FA=/cluster/data/sacCer1
    CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M"
    mkdir -p SS
    for i in $CHROMS ; do \
	echo chr${i} ; \
	msa_view $MAF/chr${i}.maf -M $FA/$i/chr${i}.fa -i MAF -o SS -O sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu > SS/chr${i}.ss ; \
    done

    # produce rough starting model
    msa_view -i SS --aggregate sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu -o SS -z SS/*.ss > all.ss
    phyloFit -i SS all.ss --tree "((((((sacCer1,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" -o starting-tree
    # (cheat the branch lengths up a bit, since this represents an
    # average of conserved and nonconserved sites; we want to
    # initialize for nonconserved)
    tree_doctor  --scale 2 starting-tree.mod > tmp.mod
    mv tmp.mod starting-tree.mod 

    # also get average GC content of aligned regions
    msa_view -i SS all.ss --summary-only
#descrip.                      A          C          G          T        G+C  length   all_gaps  some_gaps
#all.ss                   0.3052     0.1954     0.1951     0.3042     0.3906   13251474          0    2260153

    # save some I/O and space
    gzip SS/*.ss
    
    # now set up cluster job to estimate model parameters.  See
    # makeHg17.doc for add'l details

    cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3906 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 75 --target-coverage 0.5 --quiet --log $2 --estimate-trees $3
EOF
    chmod u+x doEstimate.sh

    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobs.lst
    for f in SS/*.ss.gz ; do
	root=`basename $f .ss.gz` ;\
	echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
    done

    # run cluster job
    ssh kk9... para create ...
    # (takes about 15 minutes)

    # Now combine parameter estimates.  
    ls TREES/*.cons.mod > cons.txt
    phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
    ls TREES/*.noncons.mod > noncons.txt
    phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt

    # Now set up cluster job for computing consevation scores and
    # predicted elements
cat << 'EOF' > doPhastCons.sh
#!/bin/sh

mkdir -p POSTPROBS ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 75 --target-coverage 0.5 --quiet --seqname $chr --idpref $pref --viterbi ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
    chmod u+x doPhastCons.sh

    rm -fr POSTPROBS ELEMENTS
    rm -f jobs2.lst
    for f in SS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done

    # run cluster job
    ssh kk9... para create, ...
    # (takes about 30 sec)

    # set up phastConsElements track
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' ELEMENTS/*.bed > all.raw.bed 
    /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
    hgLoadBed sacCer1 phastConsElements all.bed
    featureBits sacCer1 phastConsElements
# 7517116 bases of 12156302 (61.837%) in intersection

    # set up wiggle
    mkdir -p wib
    for i in $CHROMS ; do \
	wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons POSTPROBS/chr${i}.pp.gz ;\
    done
    #	Changed this path 2004-09-27 so a new set of data could go out
    #	under the same table name - Hiram, was simply:
    #	/gbdb/sacCer1/wib/chr*_phastCons.wib
    hgLoadWiggle sacCer1 phastCons -pathPrefix=/gbdb/sacCer1/wib/phastCons wib/chr*_phastCons.wig
    mkdir -p /gbdb/sacCer1/wib/phastCons
    rm -f /gbdb/sacCer1/wib/phastCons/chr*_phastCons.wib
    ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/wib/*.wib /gbdb/sacCer1/wib/phastCons
    chmod 775 . wib /gbdb/sacCer1/wib/phastCons /gbdb/sacCer1/wib/phastCons/*.wib
    chmod 664 wib/*.wib

    # set up full alignment/conservation track
    rm -rf pwMaf /gbdb/sacCer1/pwMaf
    mkdir -p pwMaf /gbdb/sacCer1/pwMaf
    cd pwMaf ;\
    for org in sacBay sacCas sacKlu sacKud sacMik sacPar ; do \
	mkdir -p $org ; \
	cp /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$org $org ; \
	for f in $org/* ; do chr=`basename $f .$org` ; mv $f $org/$chr.maf ; done ;\
	ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/pwMaf/$org /gbdb/sacCer1/pwMaf/${org}_pwMaf ; \
	hgLoadMaf sacCer1 -warn ${org}_pwMaf -pathPrefix=/gbdb/sacCer1/pwMaf/${org}_pwMaf ;\
    done
    cd ..
    chmod 755 pwMaf pwMaf/* /gbdb/sacCer1/pwMaf /gbdb/sacCer1/pwMaf/*
    chmod 644 pwMaf/*/*.maf

    # trackDb entries:
#track multizYeast 
#shortLabel Conservation
#longLabel Seven Species of Saccharomyces, Alignments & Conservation
#group compGeno
#priority 100
#visibility pack
#type wigMaf 0.0 1.0
#maxHeightPixels 100:40:11
#wiggle phastCons
#yLineOnOff Off
#autoScale Off
#pairwise pwMaf
#speciesOrder sacPar sacMik sacKud sacBay sacCas sacKlu

#track phastConsElements
#shortLabel Most Conserved
#longLabel PhastCons Conserved Elements (Seven Species of Saccharomyces)
#group compGeno
#priority 105
#visibility hide
#spectrum on
#exonArrows off
#showTopScorers 200
#type bed 5 .


# MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
    # First, get ranges of target coverage:
    ssh kksilo
    mkdir /cluster/data/sacCer1/bed/otherYeast/align/coverage
    cd /cluster/data/sacCer1/bed/otherYeast/align/coverage
    cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \
    | nice mafRanges -notAllOGap stdin sacCer1 sacCer1.mafRanges.bed
    # Get pairwise coverage as well.
    foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar)
      cat /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$other \
      | nice mafRanges -notAllOGap stdin sacCer1 sacCer1.$other.mafRanges.bed
    end

    ssh hgwdev
    cd /cluster/data/sacCer1/bed/otherYeast/align/coverage
    # To make subsequent intersections a bit quicker, output a bed with 
    # duplicate/overlapping ranges collapsed:
    nice featureBits sacCer1 sacCer1.mafRanges.bed \
      -bed=sacCer1.mafRangesCollapsed.bed
#11718319 bases of 12156302 (96.397%) in intersection
    # mafCoverage barfs on chr15 currently, so pass on this for now:
    #cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \
    #| nice mafCoverage -count=2 sacCer1 stdin > sacCer1.mafCoverage

    # Intersect maf target coverage with gene regions:
    nice featureBits sacCer1 -enrichment sgdGene:cds \
      sacCer1.mafRangesCollapsed.bed \
      -bed=sacCer1.mafCds.bed
#sgdGene:cds 69.491%, sacCer1.mafRangesCollapsed.bed 96.397%, both 69.114%, cover 99.46%, enrich 1.03x
    # No UTR info for yeast, so can't intersect.

    # Intersect pairwise target coverages with gene regions:
    foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar)
      nice featureBits sacCer1 -enrichment sgdGene:cds \
        sacCer1.$other.mafRanges.bed -bed=sacCer1.${other}Cds.bed \
      |& grep -v "table gap doesn't exist"
    end
#sgdGene:cds 69.491%, sacCer1.sacBay.mafRanges.bed 89.478%, both 66.581%, cover 95.81%, enrich 1.07x
#sgdGene:cds 69.491%, sacCer1.sacCas.mafRanges.bed 63.359%, both 55.560%, cover 79.95%, enrich 1.26x
#sgdGene:cds 69.491%, sacCer1.sacKlu.mafRanges.bed 56.086%, both 49.245%, cover 70.87%, enrich 1.26x
#sgdGene:cds 69.491%, sacCer1.sacKud.mafRanges.bed 89.684%, both 64.873%, cover 93.35%, enrich 1.04x
#sgdGene:cds 69.491%, sacCer1.sacMik.mafRanges.bed 92.989%, both 67.178%, cover 96.67%, enrich 1.04x
#sgdGene:cds 69.491%, sacCer1.sacPar.mafRanges.bed 96.550%, both 68.464%, cover 98.52%, enrich 1.02x

# CREATE TABLES TO SUPPORT SHOWING SAM-T02 RESULTS (DONE 1/4/05, Fan)
# Kevin just did his monthly update, so reuild ... (REBUILT 1/7/05 Fan)   
    ssh hgwdev
    cd /cluster/data/sacCer1/bed
    mkdir sam
    cd sam

# create script to process 1 SAM subdirectory
    cat << '_EOF_' >do1Subdir
ls /projects/compbio/experiments/protein-predict/yeast/$1/*/summary.html >j1.tmp
cat j1.tmp |sed -e 's/yeast\// /g' |sed -e 's/\/summary/ /g' >j2.tmp
cat j2.tmp | awk '{print $2}' |sed -e 's/\// /g ' | awk '{print "do1 " $1 " " $2}' >> doall
'_EOF_'

    chmod +x do1Subdir

# create high level script to parse all SAM results
    ls -l /projects/compbio/experiments/protein-predict/yeast | grep drw >j1
    cp j1 j2

#edit j2 to remove non SAM result subdirectories
    vi j2
    
    cat j2 |awk '{print "do1Subdir " $9}' >doAllSubdir
    chmod +x doAllSubdir
    rm j1 j2
    
    rm doall
    doAllSubdir
    rm j1.tmp j2.tmp

# create data needed for the samSubdir table and load them
    cat doall |awk '{print $3"\t"$2}' >samSubdir.lis
    hgsql sacCer1 -e "drop table samSubdir"
    hgsql sacCer1 < ~/src/hg/lib/samSubdir.sql
    hgsql sacCer1 -e 'load data local infile "samSubdir.lis" into table samSubdir'

# create script to parse SAM output results in one subdirectory
    cat << '_EOF_' >do1
echo processing $2
samHit $2 /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.best-scores.rdb >$2.tab
'_EOF_'

    chmod +x do1
    chmod +x doall

# run the top level script to parse all SAM output results
    rm *.tab
    doall
# collect all results    
    cat *.tab |sort -u >protHomolog.all
    
# load the results into the protHomolog table
    hgsql sacCer1 -e "drop table protHomolog"
    hgsql sacCer1 < ~/src/hg/lib/protHomolog.sql
    hgsql sacCer1 -e 'load data local infile "protHomolog.all" into table protHomolog'
	
# remove all intermediate .tab files
    rm *.tab

# COPY PART OF SAM-T02 RESULTS TO UCSC GB SERVER (DONE 1/10/05, Fan)
# Kevin is going to do monthly updates on SAM-T02 results.
# To prevent data inconsistency problems, we now will copy 
# over some key files and host them on our own server.
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/sam
    mkdir yeast050107
    cd yeast050107

# create script to process 1 SAM subdirectory
    cat << '_EOF_' >do1Subdir
mkdir -p yeast/$1
'_EOF_'

    chmod +x do1Subdir
    cp -p ../doAllSubdir .
    mkdir yeast
    doAllSubdir

    cp -p ../doall .
    
# create script doall to copy over needed SAM output result files
    cat << '_EOF_' >do1
echo processing $2
mkdir yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/*.jpg yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.w0.5-logo.pdf yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.dssp-ehl2-logo.pdf yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.undertaker-align.pdb.gz yeast/$1/$2
'_EOF_'

    chmod +x do1
    doall

    ln -s ./yeast /usr/local/apache/htdocs/goldenPath/sacCer1/sam    

# REBUIL MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2005-12-16 Fan)
    # Update mouse ortholog column using blastp on mouse known genes.
    # First make mouse protein database and copy it to iscratch/i
    # if it doesn't exist already
    cd /cluster/data/mm7/bed
    mkdir blastp
    cd blastp
    pepPredToFa mm7 knownGenePep known.faa
    formatdb -i known.faa -t known -n known
    ssh kkr1u00
    if (-e /iscratch/i/mm7/blastp) then
       rm -r /iscratch/i/mm7/blastp
    endif
    mkdir -p /iscratch/i/mm7/blastp
    cp -p /cluster/data/mm7/bed/blastp/known.p?? /iscratch/i/mm7/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/sacCer1/bed/blastp/mm7
    cd /cluster/data/sacCer1/bed/blastp/mm7
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/mm7/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome

    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

    # Create parasol batch
    echo ../../split/*.fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try,push,check ...
# Completed: 5743 of 5743 jobs
# CPU time in finished jobs:      11476s     191.27m     3.19h    0.13d  0.000 y
# IO & Wait Time:                 15126s     252.10m     4.20h    0.18d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              29s       0.48m     0.01h    0.00d
# Submission to last job:           700s      11.67m     0.19h    0.01d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/sacCer1/bed/blastp/mm7/run/out
    hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab


##########################################################################
# HGNEAR PROTEIN BLAST TABLES (DONE 5/22/06 angie)
    ssh hgwdev
    mkdir /cluster/data/sacCer1/bed/hgNearBlastp
    cd /cluster/data/sacCer1/bed/hgNearBlastp
    # Re-fetch current sgdPep because the one in ../blastz/sgdPep.faa 
    # has been overwritten with more recent data from SGD.  That's OK 
    # for other organisms who will be linking straight to SGD instead 
    # of to sacCer1, but sacCer1 *BlastTab should jive with sgdGene.
    pepPredToFa sacCer1 sgdPep sgdPep.faa
    cat << _EOF_ > config.ra
# Yeast vs. other Gene Sorter orgs that have recently been updated:
# human, mouse, fly

targetGenesetPrefix sgd
targetDb sacCer1
queryDbs hg18 mm8 dm2

sacCer1Fa /cluster/data/sacCer1/bed/hgNearBlastp/sgdPep.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa

buildDir /cluster/data/sacCer1/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/sacCer1HgNearBlastp
_EOF_
    doHgNearBlastp.pl -noSelf -targetOnly config.ra >& do.log & tail -f do.log
# *** hgBlastTab mmBlastTab dmBlastTab 


##########################################################################
# RELOAD GENBANK (DONE 2006-09-06 markd)
# reloaded due to xenos track being dropped confusing joinerCheck

nice bin/gbDbLoadStep -drop -initialLoad sacCer1

#########################################################################
# ORegAnno - Open Regulatory Annotations
# update Oct 26, 2007;  update July 7, 2008
# loaded by Belinda Giardine, in same manner as hg18 ORegAnno track

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram)
    #	repMatch = 1024 * sizeof(sacCer1)/sizeof(hg18)
    #	4.32 = 1024 * (12156302/2881515245)
    #	use 10 to be very conservative
    ssh hgwdev
    cd /hive/data/genomes/sacCer1
    blat sacCer1.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc \
	-repMatch=10
    #	Wrote 3145 overused 11-mers to 11.ooc
    #	copy this to scratch data
    cp -p 11.ooc /hive/data/staging/data/sacCer1/sacCer1.11.ooc
    #	make push request to kluster nodes /scratch/data/

#############################################################################
# LIFTOVER TO SacCer2 (DONE - 2009-01-04 - Hiram )
    mkdir /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04
    cd /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug sacCer1 sacCer2
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
	 sacCer1 sacCer2 > do.log 2>&1
    #	real    3m21.840s

#############################################################################
# LIFTOVER TO SacCer3 (DONE - 2009-01-04 - Hiram )
    mkdir /hive/data/genomes/sacCer1/bed/blat.sacCer3.2011-08-31
    cd /hive/data/genomes/sacCer1/bed/blat.sacCer3.2011-08-31
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug sacCer1 sacCer3
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	 sacCer1 sacCer3 > do.log 2>&1
    #	real    3m31.458s

#############################################################################
# uwFootprints: (2009-04-04 markd)
# obtained from Nobel lab
#   William Stafford Noble <noble@gs.washington.edu>
#   Xiaoyu Chen <xchen@cs.washington.edu>

    mkdir /hive/data/genomes/sacCer1/bed/uwFootprints
    cd /hive/data/genomes/sacCer1/bed/uwFootprints


    wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.wig
    wget http://noble.gs.washington.edu/proj/footprinting/yeast.mappability.bed
    wget http://noble.gs.washington.edu/proj/footprinting/yeast.footprints.bed
    chmod a-w yeast.*

    wigEncode yeast.dnaseI.tagCounts.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib
    # Converted yeast.dnaseI.tagCounts.wig, upper limit 13798.00, lower limit 1.00

    ln -sf /hive/data/genomes/sacCer1/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer1/wib
    hgLoadWiggle -tmpDir=/data/tmp sacCer1 uwFootprintsTagCounts uwFootprintsTagCounts.wig 

    hgLoadBed -tab -tmpDir=/data/tmp sacCer1 uwFootprintsMappability yeast.mappability.bed
    # drop yeast.footprints.bed and truncate name to three decimal places
    tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}' yeast.footprints.bed | hgLoadBed -tmpDir=/data/tmp sacCer1 uwFootprintsPrints stdin

    # lift counts to sacSer2 and give back to UW
    wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed
    liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.bed  /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.nolift.bed

#############################################################################
