# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on the 
# SARS Coronavirus Tor2 isolate, draft genome sequence dated 
# April 14, 2003.  It's April 15 here in NZ but April 14 in 
# the states -- dates may shows that inconsistency.  :)

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +---------------+-----------------------+
#  | tableName     | type                  |
#  +---------------+-----------------------+
#  | softberryGene | genePred softberryPep |
#  +---------------+-----------------------+


DOWNLOAD SEQUENCE (DONE 04/15/03)

    ssh eieio
    mkdir /cluster/store5/SARS_Coronavirus_TOR2
    cd /cluster/store5/SARS_Coronavirus_TOR2
    wget http://ybweb.bcgsc.ca/sars/TOR2_draft_genome_assembly_140403.fasta
    wget -o TOR2.gbff 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=nucleotide&list_uids=29826276&dopt=GenBank'
    # translate to nib
    ln -s /cluster/store5/SARS_Coronavirus_TOR2 ~/sc1
    cd ~/sc1
    mkdir nib
    ln -s TOR2_draft_genome_assembly_140403.fasta chr1.fa
    faToNib chr1.fa nib/chr1.nib


CREATING DATABASE (DONE 04/15/03)

    # Create the database.
    ssh hgwdev
    echo 'create database sc1' | hgsql ''
    # make a semi-permanent read-only alias:
    alias sc1 "mysql -u hguser -phguserstuff -A sc1"
    # Use df to ake sure there is at least 5 gig free on 
    # hgwdev:/var/lib/mysql


CREATING GRP TABLE FOR TRACK GROUPING (DONE 04/15/03)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql sc1


STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 04/15/03)

    # Make symbolic links from /gbdb/sc1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/sc1/nib
    foreach f (/cluster/store5/SARS_Coronavirus_TOR2/nib/*.nib)
      ln -s $f /gbdb/sc1/nib
    end
    # Load /gbdb/sc1/nib paths into database and save size info.
    hgsql sc1  < ~/src/hg/lib/chromInfo.sql
    cd ~/sc1
    hgNibSeq -preMadeNib sc1 /gbdb/sc1/nib chr1.fa
    echo "select chrom,size from chromInfo" | hgsql -N sc1 > chrom.sizes
    # Set up relational mrna tables.
    hgLoadRna new sc1


MAKE GCPERCENT (DONE 04/15/03)
     ssh hgwdev
     mkdir -p /cluster/store5/SARS_Coronavirus_TOR2/bed/gcPercent
     cd /cluster/store5/SARS_Coronavirus_TOR2/bed/gcPercent
     hgsql sc1  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent sc1 ../../nib


MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR SC1 (DONE 04/15/03)
    echo 'insert into defaultDb values("SARS", "sc1");' \
      | hgsql -h genome-testdb hgcentraltest
    # Note: for next assembly, set scientificName column to 
    # "SARS coronavirus"
    echo 'insert into dbDb values("sc1", "SARS Cor. TOR2 Apr. 2003", \
          "/gbdb/sc1/nib", "SARS", "chr1", 1, 10, "SARS");' \
      | hgsql -h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add sc1 in all the right places and do
    make update
    make alpha
    cvs commit makefile


MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR SC1 (TODO)
    ssh hgwdev
    echo 'insert into blatServers values("sc1", "blat10", "17778", "1"); \
          insert into blatServers values("sc1", "blat10", "17779", "0");' \
      | hgsql -h genome-testdb hgcentraltest


SIMPLE REPEAT TRACK (DONE 04/15/03)
    # TRF runs pretty quickly now... it takes a few hours total runtime, 
    # so instead of binrsyncing and para-running, just do this on eieio:
    ssh eieio
    mkdir ~/sc1/bed/simpleRepeat
    cd ~/sc1/bed/simpleRepeat
    mkdir trf
    rm -f jobs.csh
    touch jobs.csh
    foreach f (/cluster/store5/SARS_Coronavirus_TOR2/*.fa)
      set fout = $f:t:r.bed
      echo "/cluster/home/kent/bin/i386/trfBig -trf=/cluster/home/kent/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    tcsh jobs.csh |& tee jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
# STOPPED HERE -- no simple repeats found.

    # When job is done do:
    liftUp simpleRepeat.bed ~/sc1/jkStuff/liftAll.lft warn trf/*.bed

    # Load this into the database as so
    ssh hgwdev
    cd ~/sc1/bed/simpleRepeat
    hgLoadBed sc1 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


MAKING VIRAL mRNA FILES (DONE 04/15/03)
    cd /cluster/store5/mrna.134
    gunzip -c \
      /cluster/store5/genbank.134/gbvrl* \
    | gbToFaRa anyRna.fil viralRna.{fa,ra,ta} stdin
    mkdir /gbdb/sc1/mrna.134
    ln -s  /cluster/store5/mrna.134/viralRna.fa /gbdb/sc1/mrna.134/
    hgLoadRna add -type=mrna sc1 /gbdb/sc1/mrna.134/viralRna.fa \
      /cluster/store5/mrna.134/viralRna.ra
    # distribute to /iscratch/i/
    ssh kkr1u00
    mkdir -p /iscratch/i/mrna.134
    cp -p /cluster/store5/mrna.134/viralRna.fa  /iscratch/i/mrna.134
    ~kent/bin/iSync


ALIGNING VIRAL RNA (DONE 04/16/03)
    ssh kkr1u00
    mkdir -p ~/sc1/bed/mrna
    cd ~/sc1/bed/mrna
    ls -1S /cluster/store5/SARS_Coronavirus_TOR2/*.fa > genome.lst
    ls -1 /iscratch/i/mrna.134/viralRna.fa > mrna.lst
    cp ~/lastRn/bed/mrna/gsub .
    # edited gsub to remove -ooc and add query options for translated.
    mkdir psl
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try, para check, para push, para check....
    para time > time
      
    # Process alignments into near best in genome.
    ssh eieio
    cd ~/sc1/bed/mrna
    pslSort dirs raw.psl /cluster/store2/temp psl
    # Skip the pslReps -- need the sensitivity!
    cat raw.psl \
    | sed -e 's/TOR2_draft_genome_assembly_140403/chr1/' \
    > all_mrna.psl
    pslSortAcc nohead chrom /cluster/store2/temp all_mrna.psl

    # Load mRNA alignments into database.
    ssh hgwdev
    cd ~/sc1/bed/mrna/chrom
    foreach i (chr?{,?}{,_random}.psl)
        mv $i $i:r_mrna.psl
    end
    hgLoadPsl -noTNameIx sc1 *.psl
    cd ..
    hgLoadPsl sc1 all_mrna.psl -nobin


LOAD SWISS PROT CORONAVIRUS PROTEINS
    cd ~/sc1/bed/swissprot
    download all ORGANISM=coronavirus proteins from SRS into file viralProt.fa
    ssh kkr1u00
    mkdir psl
    para create spec
    para push
    cd psl
    mv chr1_viralProt.psl chr1_viralProtLoose.psl
    pslReps -minAli=0.67 chr1_viralProtLoose.psl chr1_viralProt.psl    
    hgLoadPsl -noTNameIx sc1 chr1_viralProt.psl chr1_viralProtLoose.psl


LOAD CPGISSLANDS (DONE 04/15/03)
    ssh eieio
    mkdir -p ~/sc1/bed/cpgIsland
    cd ~/sc1/bed/cpgIsland
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    cp ~/lastRn/bed/cpgIsland/cpg_dist.tar .
    tar xvf cpg_dist.tar 
    cd cpg_dist
    gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ..
    foreach f (../../*.fa)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpg_dist/cpglh.exe $f > $fout.cpg
    end
    # copy filter.awk from a previous release
    cp ~/lastRn/bed/cpgIsland/filter.awk .
    awk -f filter.awk chr*.cpg > cpgIsland.bed
# STOPPED HERE -- NO CPG ISLANDS FOUND
    # load into database:
    ssh hgwdev
    cd ~/sc1/bed/cpgIsland
    hgLoadBed sc1 cpgIsland -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed


LOAD SOFTBERRY GENES 
     mkdir -p ~/sc1/bed/softberry
     cd ~/sc1/bed/softberry
  # Copy in email attachment Soft_genes_SARS_Tor2.tar.gz
  # from Victor Solovyev
     gtar -zxvf Soft_gene*.gz
  # Substitute chr1 for subs in the gff file 
     ldHgGene sc1 softberryGene SARS.gff
     hgPepPred sc1 softberry *.protein
     hgSoftberryHom sc1 *.protein


LOAD GENBANK ANNOTATED PROTEINS (DONE 04/23/03)
    mkdir -p ~/sc1/bed/gbProtAnn
    cd ~/sc1/bed/gbProtAnn
    # go to http://www.ncbi.nlm.nih.gov/ -- search Protein for NP_828849
    # Save the displayed genbank record to NP_828849.gbff
    #
    # Used emacs macros (and judgment about what the short names should be) 
    # to edit the mat_peptide elements to a bed 4 + tab file like this:
    #  chrom	start	end	name	product	note	proteinId	giId
    # called NP_828849.pbed -- pbed because it's in protein coordinates.  
    #
    # Now translate the protein coordinates into genomic coords:
    #  /coded_by="join(NC_004718.1:250..13383, NC_004718.1:13383..21470)"
    #  /note="putative -1 frameshift"
    # and strip the "gi:" from genbank id numbers:
    # OK, also use the proteinId as the name instead of my short names so 
    # that others can link in more easily as Fan suggested:
    perl -wpe '@w = split(/\t/); \
               if ($w[1] < 4379) { \
                 $w[1] = (($w[1] - 1) * 3) + 250 - 1; \
                 $w[2] = (($w[2] - 1) * 3) + 250 + 2; \
               } else { \
                 $w[1] = (($w[1] - 4379) * 3) + 13383 - 1; \
                 $w[2] = (($w[2] - 4379) * 3) + 13383 + 2; \
               } \
               $w[7] =~ s/gi://i; \
               $w[3] = $w[6]; \
               $_ = join("\t", @w); \
              ' NP_828849.pbed \
    > NP_828849.bed
    # Make sure it looks right:
    sdiff NP_828849.pbed NP_828849.bed
    hgLoadBed sc1 gbProtAnn NP_828849.bed \
      -tab -noBin -sqlTable=$HOME/src/hg/lib/gbProtAnn.sql


PRODUCING GENSCAN PREDICTIONS (DONE 4/23/03)
    ssh kkr1u00
    mkdir -p ~/sc1/bed/genscan
    cd ~/sc1/bed/genscan
    # No cluster run (or liftUp) required -- just run it:
    /cluster/home/kent/bin/i386/gsBig ~/sc1/chr1.fa genscan.gtf \
      -trans=genscan.pep -subopt=genscanSubopt.bed \
      -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan \
      -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat \
      -tmp=/tmp -window=2400000
    # Load into the database as so:
    ssh hgwdev
    cd ~/sc1/bed/genscan
    ldHgGene sc1 genscan genscan.gtf
    hgPepPred sc1 generic genscanPep genscan.pep
    hgLoadBed sc1 genscanSubopt genscanSubopt.bed


BLATTING OTHER SARS WHOLE GENOMES AGAINST TOR2 (DONE 04/25/03)
    ssh eieio
    mkdir -p ~/sc1/bed/otherSARS
    cd ~/sc1/bed/otherSARS
    # Went to http://www.ncbi.nlm.nih.gov/ , did a search in Nucleotide 
    # for SARS Coronavirus, viewed results as FASTA, and saved results to 
    # <accession>.fa files:
    # AY278554.2 (CUHK-W1)
    # AY278741.1 (Urbani)
    # AY278491.2 (HKU-39849)
    # AY279354.1 (BJ04 partial)
    # AY278490.1 (BJ03 partial)
    # AY278489.1 (GZ01 partial)
    # AY278488.1 (BJ01 partial)
    # AY278487.1 (BJ02 partial)
    # AY268049.1 (Taiwan partial cds)
    # AY269391.1 (Vietnam partial)
    cat A*.fa > otherSARS.fa
    # Now blat (sensitive is OK because these are same organism):
    /cluster/home/kent/bin/i386/blat -noHead ~/sc1/chr1.fa otherSARS.fa \
      otherSARS.psl
    # Now put the sequence in /gbdb and index it into extFile/seq:
    ssh hgwdev
    mkdir -p /gbdb/sc1/otherSARS/
    ln -s `pwd`/otherSARS.fa /gbdb/sc1/otherSARS/
    hgLoadRna addSeq sc1 /gbdb/sc1/otherSARS/otherSARS.fa -type=otherSARS.fa
    # And load the alignments:
    hgLoadPsl -noTNameIx sc1 otherSARS.psl

LOADING VIRAL mrna for blastz track (done 4/30/03)

cd ~/sars/genbank
gunzip -c /cluster/store5/genbank/data/download/genbank.135.0/gb{phg,vrl,vrt}*.seq.gz | gbToFaRa any.fil viralany.fa viralany.ra viralany.ta stdin
gunzip -c /cluster/store5/genbank/data/download/refseq.135.0/cumulative/rscu.gbff.Z | gbToFaRa refseq.fil refseqviral.fa refseqviral.ra refseqviral.ta stdin
cat viralany.fa refseqviral.fa > viral.fa
cat viralany.ra refseqviral.ra > viral.ra
mv viral.fa viralany.fa
mv viral.ra viralany.ra
hgLoadRna new sc1
hgLoadRna add -type=mRNA sc1 /gbdb/sc1/mrnaViral/viralany.fa viralany.ra
hgLoadRna addSeq -abbr=prot sc1 /gbdb/sc1/swissprot/viralProt.fa
hgLoadRna addSeq sc1 /gbdb/sc1/otherSARS/otherSARS.fa -type=otherSARS.fa
tr acgt ACGT viralany.fa > viralAnyUpper.fa

MRNA ALIGNMENT USING BLASTZ and CHAINING (done 5/16/03)
 
cd ~/sars/mrna.blastz.all
blastz /cluster/store5/SARS_Coronavirus_TOR2/nib/chr1.nib[1,29736] /cluster/store5/SARS_Coronavirus_TOR2/bed/genbank/viralAllUpper.fa m=40000000 H=2000 K=2000 v=1 M=0 > viralany.out
/cluster/home/angie/schwartzbin/xout2lav chr1 29736 /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/ /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/lav/viralany.lav /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/DEF 2> viralany.log
lavToPsl lav/viralany.lav psl/viralany.psl
pslReps viralany.psl -minAli=0.6 all_mrna.psl /dev/null
vi all_mrna.psl change tName to chr1 and delete sars self join
hgLoadPsl sc1 all_mrna.psl
hgLoadPsl sc1 all_mrna.psl -table=xenoMrna


lavToAxt /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/lav/viralany.lav /cluster/store5/SARS_Coronavirus_TOR2/nib /cluster/store5/SARS_Coronavirus_TOR2/bed/genbank/viralAllUpper.fa axt/viralany.axt -fa
axtChain axt/xeno.axt /cluster/store5/SARS_Coronavirus_TOR2/nib/ /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/nib chain/xeno.chain
chainFilter chain/xeno.chain -minScore=3000 -maxScore=2000000 > chain/filter.chain
chainToPsl chain/filter.chain S1.len S2.len ../../nib/chr1.nib query.lst psl/xenoBlastzMrna.psl
cd psl
hgLoadPsl sc1 xenoBlastzMrna.psl


DOWNLOADS (done 10/8/04)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/scApr2003/bigZips
# done 6/5/04:
zip -j otherSARS.fa.zip /cluster/data/sc1/bed/otherSARS/otherSARS.fa
zip -j viralProt.fa.zip /cluster/data/sc1/bed/swissprot/viralProt.fa
zip -j viralany.fa.zip /cluster/data/sc1/bed/genbank/viralany.fa
# done 10/8/04:
zip -j SARS.fa.zip /cluster/data/sc1/chr1.fa


