# for emacs: -*- mode: sh; -*-
#########################################################################
# hivgne8 DATABASE BUILD (STARTED, 07/26/07, Fan)

    ssh hgwhiv1
    mkdir -p /cluster/store12/medical/hiv/hivgne8
    cd /cluster/store12/medical/hiv/hivgne8

# Get HIV GNE8 base sequence (1,700 bases) from GSID

# This sequence is re-created using the data sheets (page 281-284, Vol 1) 
# from VaxGene document, provided by Phil Berman.
# The data sheets are scanned and OCRed into raw txt file first, then
# manually edited and checked.  The final result is store in file, gne8_gsid.txt.

    echo '>GNE8' >gne8.fa
    cat gne8_gsid.txt|sed -e 's/  / /g'|sed -e 's/  / /g'|sed -e 's/  / /g'|\
    sed -e 's/ /\t/'|\
    sed -e 's/ /\t/'|\
    cut -f 3 |\
    sed -e 's/ //g' >>gne8.fa
    faToTab gne8.fa stdout|toUpper stdin gne8.tab

# translate to nib
    ln -s /cluster/store12/medical/hiv/hivgne8 ~/hivgne8
    cd ~/hivgne8
    mkdir nib
    ln -s gne8.fa chr1.fa
    faToNib chr1.fa nib/chr1.nib

# CREATING DATABASE (DONE 07/27/07)

# Create the hivgne8 database.
    
    echo 'create database hivgne8' | hgsql hiv1

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 07/27/07)
    
    echo "create table grp (PRIMARY KEY(NAME)) select * from hiv1.grp" \
      | hgsql hivgne8

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 09/05/06)

    # Make symbolic links from /gbdb/hivgne8/nib to the real nibs.

    mkdir -p /gbdb/hivgne8/nib
    ln -s /cluster/store12/medical/hiv/hivgne8/nib/chr1.nib /gbdb/hivgne8/nib

    # Load /gbdb/hivgne8/nib paths into database and save size info.
    hgsql hivgne8  < ~/src/hg/lib/chromInfo.sql

    cd ~/hivgne8
    hgNibSeq -preMadeNib hivgne8 /gbdb/hivgne8/nib chr1.fa
    echo "select chrom,size from chromInfo" | hgsql -N hivgne8 > chrom.sizes

# MAKE HGCENTRALHIV1 ENTRY AND TRACKDB TABLE FOR HIVGNE8 (DONE 08/02/06)
    echo 'insert into defaultDb values("HIV GNE8 (GP120)", "hivgne8");' \
      | hgsql -h localhost hgcentralhiv1

    echo 'insert into dbDb values("hivgne8", "Sep. 1998", \
          "/gbdb/hivgne8/nib", "Human immunodeficiency virus 1", "chr1", 1, 99.5, "HIV GNE8 (GP120)","Human immunodeficiency virus 1", "/gbdb/hivgne8/html/description.html", 0, 0, " sequence as of Feb., 2005");' \
      | hgsql hgcentralhiv1 -h localhost

    echo 'insert into genomeClade values("HIV GNE8 (GP120)", "other", 100);'\
      | hgsql hgcentralhiv1 -h localhost

    # Edit that makefile to add hivgne8 in all the right places

    vi makefile

    make update
    make alpha
    cvs commit makefile

# MAKE HGCENTRALHIV1 BLATSERVERS ENTRY FOR HIVGNE8
    ssh hiv1
    echo 'insert into blatServers values("hivgne8", "hiv1", "17784", "1", "0"); \
          insert into blatServers values("hivgne8", "hiv1", "17785", "0", "0");' \
      | hgsql hgcentralhiv1 -h localhost

# CREATE TRACKDB TABLE

    hgsql hivgne8 < ~/src/hg/lib/trackDb.sql
    hgsql hivgne8 -e "insert into trackDb select * from hiv1.trackDb"

# copy over tables from hiv1

    mysqldump -d hiv1 dnaSeq -u medcat -p$HGPSWD|hgsql hivgne8
    hgsql hivgne8 -e "insert into dnaSeq select * from hiv1.dnaSeq"

    mysqldump -d hiv1 aaSeq -u medcat -p$HGPSWD|hgsql hivgne8
    hgsql hivgne8 -e "insert into aaSeq select * from hiv1.aaSeq"
    
    mysqldump -d hiv1 gsidSubjInfo -u medcat -p$HGPSWD|hgsql hivgne8
    hgsql hivgne8 -e "insert into gsidSubjInfo select * from hiv1.gsidSubjInfo"

    mysqldump -d hiv1 gsidClinicRec -u medcat -p$HGPSWD|hgsql hivgne8
    hgsql hivgne8 -e "insert into gsidClinicRec select * from hiv1.gsidClinicRec"
    
    mysqldump -d hiv1 gsIdXref -u medcat -p$HGPSWD|hgsql hivgne8
    hgsql hivgne8 -e "insert into gsIdXref select * from hiv1.gsIdXref"

    mysqldump -d hiv1 vax004Msa -u medcat -p$HGPSWD|hgsql hivgne8
    hgsql hivgne8 -e "insert into vax004Msa select * from hiv1.vax004Msa"

# CREATE VAX004 TRACK (DONE 8/9/07)

# get vax004 sequences
    hgsql hivgne8 -N -e 'select * from dnaSeq where id like "%U%"' >vax004.tab

# create .fa file
    tabToFa vax004

    mkdir -p /gbdb/hivgne8/vax004
    cp -p vax004.fa /gbdb/hivgne8/vax004/vax004.fa

    hgLoadSeq -replace hivgne8 /gbdb/hivgne8/vax004/vax004.fa

# BLAT
    gfClient -minScore=200 -minIdentity=80 -nohead hiv1.cse.ucsc.edu 17785  /gbdb/hivgne8/nib \
    -out=psl -t=dna -q=dna vax004.fa vax004.psl

# count the result
    wc *.psl
    cut -f 10 vax004.psl |wc
    cut -f 10 vax004.psl |sort -u |wc

# load the psl result into vax004 table
    hgLoadPsl hivgne8 vax004.psl

# CREATE HIVGENE TRACK
    mkdir -p /cluster/store12/medical/hiv/hivgne8/hivGene
    cd /cluster/store12/medical/hiv/hivgne8/hivGene

# clean up files from previous build
    rm *.psl *.bed *.tab *.fa *.tmp

# Get corresponding DNA gemomic sequences from hiv1 base genome.
    hgsql hiv1 -N -e \
    'select h.name, substring(b.seq, h.chromStart, h.chromEnd-h.chromStart) from baseSeq b, hivGene h where b.id="hiv1"'\ 
    >hSeq.tmp

# keep only the genes/regions within the gp120 region
    fgrep V hSeq.tmp >hSeq.tab
    fgrep vpu hSeq.tmp >>hSeq.tab

    fgrep gp120 hSeq.tmp >>hSeq.tab
    fgrep RRE hSeq.tmp >>hSeq.tab

# create .fa file from .tab file
    tabToFa hSeq

# BLAT the sequences agains hivgne8 base genome
    gfClient -minScore=10 -minIdentity=35 -nohead hiv1.cse.ucsc.edu 17785 \
    /gbdb/hivgne8/nib -out=psl -t=dna -q=dna hSeq.fa hSeq.psl

# load the BLAT result into a temp psl track
    hgLoadPsl hivgne8 hSeq.psl

# NEED TO MANUALLY LOAD THE psl.tab, BECAUSE hgLoadPsl HAS PERMISSION PROBLEM ON hiv1.
    hgsql hivgne8 -e 'delete from hSeq'
    hgsql hivgne8 -e 'load data local infile "psl.tab" into table hSeq'

# create .bed file for hivGene
    hgsql hivgne8 -N -e 'select "chr1", tStart, tEnd,  qName from hSeq ' >hivGene.bed

# load the bed file into hivGene table, needs to load manually bed.tab manually due 
# to permission problem.
    hgLoadBed hivgne8 hivGene hivGene.bed

# CREATE INTERPRO TRACK

# get all HIV-1 domain sequences
    getInterProFa interProXrefHiv1 interProHiv1.fa

# BLAT against base genome
    gfClient localhost 17784 /gbdb/hivgne8/nib -out=psl -t=dnax -q=prot  interProHiv1.fa interProHiv1.psl

# load it into a temp table
    hgLoadPsl hivgne8 table=testPsl interProHiv1.psl

# create bed file from this temp table
    hgsql hivgne8 -N -e \
    'select "chr1", tStart, tEnd, qName from testPsl where (tEnd-tStart)/3/qSize>0.42' \
    > interProHiv1.bed

# load the bed file into the track table
    hgLoadBed hivgne8 interPro interProHiv1.bed

# drop the temp table.
    hgsql hivgne8 -e 'drop table testPsl'

# CREATE CONSERVATION TRACKS

    mkdir -p /cluster/store12/medical/hiv/hivgne8/conservation
    cd /cluster/store12/medical/hiv/hivgne8/conservation

# create the .wig file and .fa file of the consensus sequence.
    gsidMsa hivgne8 vax004Msa GNE8 231 vax004Cons.wig vax004Consensus.fa

# encode and load the wig file
    wigEncode vax004Cons.wig stdout vax004Cons.wib \
    | hgLoadWiggle hivgne8 vax004Cons stdin

# copy .wib file to /gbdb
    mkdir -p /gbdb/hivgne8/wib
    cp vax004Cons.wib /gbdb/hivgne8/wib

# do the same for protein conservation track

    mkdir aa
    cd aa

# create .wig file
    gsidAaMsa2 hivgne8 vax004Msa GNE8 231 vax004AaCons.wig vax004AaConsensus.fa

# encode and load the .wib file   
    wigEncode vax004AaCons.wig stdout vax004AaCons.wib \
    | hgLoadWiggle hivgne8 vax004AaCons stdin

    cp vax004AaCons.wib /gbdb/hivgne8/wib

# CREATE MAF TRACKS

    mkdir -p /cluster/store12/medical/hiv/hivgne8/msa
    cd /cluster/store12/medical/hiv/hivgne8/msa

# create a script file, doall

    hgsql hivgne8 -N -e \
    'select id from dnaSeq where id like "%U%"'\
    |sed -e 's/ss/do1 ss/g' >doall

# create one line script file, do1, with the following line in it:

    hgsql hivgne8 -N -e  "select id, seq from vax004Msa where id='${1}'"

    chmod +x do*

# run the script to get the .tab file with all MSA sequences of VAX004
    doall >gne8.tab
# convert .tab into .fa file
    tabToFa gne8

# grab the base alignment sequence
    echo ">hivgne8" >gne8.aln
    hgsql hivgne8 -N -e 'select seq from vax004Msa where id="GNE8"'  >> gne8.aln

# prepare an interium file, jjAll.mfa
    cat gne8.aln gne8.fa >jjAll.mfa
    echo = >>jjAll.mfa

# Run xmfaToMafGne8 to create a precursor file for the final .maf

    xmfaToMafGne8 jjAll.mfa j.out  org1=hivgne8
    cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp

    rm jjAll.mfa j.out

    cat chr1.tmp |sed -e 's/ss_U/U/g' >chr1.maf

# copy .maf to /gbdb.
    mkdir -p  /gbdb/hivgne8/vax004Maf 
    cp chr1.maf /gbdb/hivgne8/vax004Maf -p
    echo before load
    hgLoadMaf hivgne8 vax004Maf

# create another copy for protein MAF.
    mkdir -p  /gbdb/hivgne8/vax004AaMaf 
    cp -p chr1.maf /gbdb/hivgne8/vax004AaMaf
    hgLoadMaf hivgne8 vax004AaMaf
