# for emacs: -*- mode: sh; -*-

# This file describes browser build for the mouse
# genome, March 2005, ncbi mouse_34 - Mm6
#
#	"$Id: mm6.txt,v 1.8 2008/01/14 23:06:15 rhead Exp $"
#
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2005-03-09 - Hiram)
#
#	Examine disk space issues, summarize mm5 usage:
    ssh kksilo
    cd /cluster/store6
    du -hsc mm5
#	476G    mm5
#    mm5 takes a lot of disk space for one assembly; it may need some
#    cleaning.  store10 is the last file system with a large amount of
#    free space, so mm6 goes there:
    ssh kksilo
    mkdir /cluster/store10/mm6
    ln -s /cluster/store10/mm6 /cluster/data/mm6
    mkdir /cluster/data/mm6/ncbi
    cd /cluster/data/mm6/ncbi
#	The ftp login name and password go in a .wgetrc file in this
#	directory (permissions 600).  Its format:
#	login = name
#	passwd = xxxx
    export WGETRC="$(pwd)/.wgetrc"
    wget \
	--recursive --level=4 --no-parent \
	--timestamping --dont-remove-listing \
	--force-directories --no-host-directories --cut-dirs=1 \
	--directory-prefix=. \
	ftp://ftp-private.ncbi.nih.gov/mouse_34
#	Downloaded: 1,586,892,564 bytes in 57 files
#
#	NOTE: chrY - radical change from mm5 - most of it was moved into
#	chrY_random
#	Notes from Deanna Church:
##############################################################################
# There are some general stats on chromosome lengths and N50s (for this build
# and compared to Build 33) as well as a comparison to the non-sequence based
# maps. 

# Before there is too much alarm I want to point out that there is a fair
# amount of N50 variation from chromosome to chromosome, as well as a few
# chromosomes that have significantly different N50 than in Build 33. The most
# severe case is Mmu2 (36 Mb in Build 34 vs 90 Mb in Build 33). Remember Mmu2
# was not automatically assembled and this drop is due to gaps being
# purposefully introduced. A couple of auto-assembled chromosomes also saw
# drops in N50 (Mmu5 down ~13Mb and Mmu13 down about 6 Mb)- however, this is
# also likely due to clones having been introduced to the TPF. In these cases,
# many of the clones have no sequence- and there is likely no WGS to hold
# things together either. So, I don't think this by itself is cause for
# concern- the N50 for these two chromosomes is still >19 Mb, so they are very
# contiguous.

# On the plus side, Mmu14 has an N50 increase of 41 Mb, Mmu12 has an increase
# of 24 Mb and Mmu8 has an increase of 13 Mb. 

# The genome average N50 increased from 23.2 Mb to 26.2 Mb.
##############################################################################

#  No chrM or chrMT was delivered.  Pick up NC_005089.1 fasta file
#	for chrM sequence from NCBI and place in this ncbi download
#	directory as chrM in both chrfasta and contigfasta
#	with a fixed up header line to be like all the others:
#	>lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome

#  Fixup the agp and contig.idmap files to add chrM
    cd /cluster/data/mm6
    #	Work on private copies of the NCBI AGP files; the originals under
    #	ncbi/ are left untouched.  One chrM record is appended to each.
    zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp
    echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
	allrefcontig.chr.agp
    gzip allrefcontig.chr.agp
    zcat ncbi/allcontig.agp.gz > allcontig.agp
    #	An AGP component line has exactly nine tab-separated columns.
    #	This record previously carried a doubled tab after the accession
    #	(an empty tenth column) and an unversioned accession; it now has
    #	the same component id and column layout as the chrM record above.
    echo -e "NC_005089\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
	allcontig.agp
    gzip allcontig.agp
    #	Insert three chrM records (start, contig, end) ahead of line 6991
    #	of seq_contig.md.
    zcat ncbi/seq_contig.md.gz | sed -e "6991i\
10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\
10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\
10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md
    #	The line number 6991 was found by checking the contents of
    #	ncbi/seq_contig.md.gz and it was the line starting with:
    #	10090^IUn|NT_039766^I1^I4412^
    #	The chrM information is wanted before that line.  An attempt to
    #	do this with a pattern match and insert did not work:
    #	sed -e "#NT_039769#i\
    #		....
    #	In sed a leading '#' begins a comment, so that whole expression
    #	is ignored.  A regex address with a delimiter other than '/' must
    #	be introduced by a backslash (\#NT_039769#i\), or simply written
    #	as /NT_039769/i\
    #	NOTE(review): the contig named in the attempted pattern
    #	(NT_039769) differs from the one quoted above (NT_039766);
    #	confirm which one is on line 6991 before switching to a pattern.
    #  And even more curiously, this command cut and paste did NOT work
    #	on hgwdev in my login.  Only worked on kksilo.  Presumably the
    #	login shell differs between the two hosts: the backslash-newline
    #	sequences inside the double quotes are not treated the same way
    #	by csh/tcsh as by sh/bash - TODO confirm.
    gzip seq_contig.md
#   summarize sequence counts
    mkdir faCounts
    #	base counts for the chrom-level and the contig-level fasta,
    #	both started in the background
    for SET in chrfasta contigfasta
    do
	time faCount ncbi/${SET}/chr*.fa.gz > faCounts/${SET}.faCount 2>&1 &
    done
    #	about 3 minutes each for the above two faCounts
    #	fasta header lines of the same two file sets
    for SET in chrfasta contigfasta
    do
	time zcat ncbi/${SET}/chr*.fa.gz | grep "^>" > \
	    faCounts/${SET}.headers 2>&1 &
    done
    #	about 2 minutes each for the above two zcat/greps

#############################################################################
#  BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS
#			(DONE - 2005-03-09 - Hiram)
    ssh kksilo
    cd /cluster/data/mm6
    #	For each NCBI chrom fasta: extract that chrom's AGP records into
    #	<chrom>/chr<chrom>.agp, then split the sequence into contigs
    #	under /cluster/data/mm6 with splitFaIntoContigs.
    for F in ncbi/chrfasta/chr*.fa.gz
    do
	#	chrom name: strip the directory, the ".fa.gz" suffix and the
	#	"chr" prefix.  Anchored expansions are used instead of the
	#	former unanchored sed patterns whose dots matched any
	#	character.
	CHR=${F##*/}
	CHR=${CHR%.fa.gz}
	CHR=${CHR#chr}
	echo ${CHR} ${F}
	mkdir -p "${CHR}"
	zcat allrefcontig.chr.agp.gz | \
	    perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \
		${CHR}/chr${CHR}.agp
	#	reduce ">lcl|chrN.fa ..." header lines to ">chrN"
	zcat ncbi/chrfasta/chr${CHR}.fa.gz | \
	    perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \
		splitFaIntoContigs ${CHR}/chr${CHR}.agp \
		    stdin /cluster/data/mm6 -nSize=5000000
    done
    #	The above loop takes about 5 minutes

#############################################################################
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2005-03-09 - Hiram)
    ssh kksilo
    mkdir /cluster/data/mm6/jkStuff
    cd /cluster/data/mm6
    mkdir Un tmp
    #	The helper scripts come from the mm5 build.  ncbiFixAgp is run as
    #	a filter over the chrM-augmented AGP to produce
    #	allrefcontig.chr.ordered.agp.gz
    cp -p /cluster/data/mm5/jkStuff/ncbiFixAgp ./jkStuff
    zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin | gzip > \
	allrefcontig.chr.ordered.agp.gz
    #	Set the appropriate release number here, this one is 34.
    #	The mm5 copy of the script is edited on the fly from build 33.
    sed -e "s/buildNum = 33/buildNum = 34/" \
	/cluster/data/mm5/jkStuff/ncbiToRandomAgps > \
	    jkStuff/ncbiToRandomAgps
    chmod +x jkStuff/ncbiToRandomAgps
    #	the script is given uncompressed inputs; they are gzipped again
    #	at the end of this section
    gunzip seq_contig.md.gz allrefcontig.chr.ordered.agp.gz
    ./jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \
                        ncbi/contig.idmap .
    #  The chrUn_random.agp created by this is too large with the 5000
    #  gaps.  It will work with 1000 gaps, so redo the chrUn_random
    #  agp with the shorter gap length:
    ./jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
	seq_contig.md allrefcontig.chr.ordered.agp ncbi/contig.idmap .
    #	For every chrom directory that has a non-empty _random contig
    #	AGP, build the chrN_random fasta from the NCBI contig fasta.
    #	tmp.fa is that chrom's contig fasta with each ">lcl|Mm... ..."
    #	header reduced to the bare Mm... identifier.
    for C in ? ??
    do
	if [ -s ${C}/chr${C}_random.ctg.agp ]; then
	    echo "building ${C}/chr${C}_random.fa"
	    rm -f ./tmp.fa
	    zcat ncbi/contigfasta/chr${C}.fa.gz | \
		perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa
	    $HOME/bin/i386/agpToFa -verbose=2 -simpleMulti \
		${C}/chr${C}_random.ctg.agp chr${C}_random \
		    ${C}/chr${C}_random.fa ./tmp.fa
	    rm -f ./tmp.fa
	fi
    done > tmp/agpToFa.out 2>&1
    #	the above loop takes about 6 minutes, examine the tmp/agpToFa.out
    #	record for any errors
    # Clean these up to avoid confusion later... they're easily rebuilt
    #   with the ncbiToRandomAgps script above
    rm ?/*_random.ctg.agp ??/*_random.ctg.agp
    gzip seq_contig.md allrefcontig.chr.ordered.agp

#############################################################################
# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
#					(DONE 2005-03-09 - Hiram)
    ssh kksilo
    cd /cluster/data/mm6
    #	splitFaIntoContigs leaves its results under ./${C}_random/.
    #	Its lift files are renamed (ordered.* -> random.*, oOut.lst ->
    #	rOut.lst) and moved, together with the contig directories, into
    #	the chrom directory ${C}/.  The emptied ${C}_random directory is
    #	then removed.
    for C in ? ??
    do
	if [ -s ${C}/chr${C}_random.fa ]; then
	    splitFaIntoContigs  -nSize=5000000 ${C}/chr${C}_random.agp \
		${C}/chr${C}_random.fa .
	    mkdir -p ${C}/lift
	    #	clear results of any earlier run before moving new ones in
	    rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst
	    mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst
	    mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft
	    mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst
	    rmdir ${C}_random/lift
	    #	these two files already exist in ${C}/ (they were the inputs)
	    rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa
	    rm -rf ${C}/chr${C}_random_*
	    mv ${C}_random/chr${C}_random_* ${C}
	    rmdir ${C}_random
	fi
    done > tmp/split.out 2>&1
    #	the above loop takes less than a minute
    #	scan the tmp/split.out file for possible errors

#############################################################################
# MAKE LIFTALL.LFT (DONE - 2005-03-10 - Hiram)
    ssh kksilo
    cd /cluster/data/mm6
    #	Concatenate every lift file found in the one- and two-character
    #	chrom directories into a single lift file.
    cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft

#############################################################################
# CREATING DATABASE (DONE - 2005-03-10 - Hiram)
    ssh kksilo
    cd /cluster/data/mm6
    #	One 2bit file holding all chrom and _random sequences.
    faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
	mm6.2bit
    #	chrom.sizes: twoBitInfo output sorted on the second column,
    #	largest first.  "-k2" is the current spelling of the obsolete
    #	"+1" key syntax, which present-day GNU sort rejects; the
    #	resulting order is identical.
    twoBitInfo mm6.2bit stdout | sort -rn -k2 > chrom.sizes
    #	chrom.lst: non-random chrom names without the "chr" prefix
    grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
    #	chromInfo.tab: name, size, and the /gbdb path of the 2bit file
    twoBitInfo mm6.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/mm6/mm6.2bit\n", $1,$2}' > chromInfo.tab

    ssh hgwdev
    cd /cluster/data/mm6
    hgsql -e "create database mm6;" mysql
    #	Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
    df -h | grep mysql
    #	/dev/sda1             472G  227G  222G  51% /var/lib/mysql2
    #	/dev/sdc1             1.8T  728G  933G  44% /var/lib/mysql

    # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2005-03-10 - Hiram)
    #   Use any of the newest databases to ensure that the organization
    #   of the grp table is up to date.  Here it is copied from hg17.
    ssh hgwdev
    hgsql mm6 -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp"
    #	chromInfo: create the table from the kent source schema, then
    #	load the chromInfo.tab file made together with the 2bit file
    hgsql mm6 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql mm6 -e 'load data local infile "chromInfo.tab" into table chromInfo;'

    # Enter mm6 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("mm6", "March 2005", "/gbdb/mm6", "Mouse", \
        "chr6:28912411-28925620", 1, 24, "Mouse", \
        "Mus musculus", "/gbdb/mm6/html/description.html", 0, 0, \
        "NCBI Build 34");' -h localhost hgcentraltest
    #	do this defaultDb entry later after there is something to see
    #	on this browser.
    #	NOTE(review): the command is nevertheless recorded right here;
    #	confirm whether it was run at this point or only later.
    hgsql -e 'INSERT INTO defaultDb (name, genome) VALUES("mm6", "Mouse")' \
        -h localhost hgcentraltest
    # start a new entry in the trackDb hierarchy
    cd $HOME/kent/src/hg/makeDb/trackDb/mouse
    mkdir mm6
    cvs add mm6
    cd mm6
    #	start from the mm5 description page; the vi line below is a
    #	manual editing step, not a literal command line
    cp ../mm5/description.html .
    vi description.html - fixup text for this assembly
    cvs add description.html
    cvs commit
    #	back up to the trackDb directory (manual edit of trackDb.ra)
    cd ../..
    vi trackDb.ra - add mm6 to the list
    #	/gbdb/mm6 holds symlinks back into /cluster/data/mm6
    mkdir /cluster/data/mm6/html
    mkdir /gbdb/mm6
    ln -s /cluster/data/mm6/html /gbdb/mm6/html
    ln -s /cluster/data/mm6/mm6.2bit /gbdb/mm6/mm6.2bit
    cp -p mouse/mm6/description.html /gbdb/mm6/html
    make DBS=mm6 ZOO_DBS=""

#############################################################################
#  GOLD GAP tracks (DONE - 2005-03-10 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm6
    #	make sure these tmp contig agp files are gone, easily generated
    #	as above with jkStuff/ncbiToRandomAgps
    rm -f */chr*.ctg.agp
    #	ffa/sequence.inf is an uncompressed copy of the NCBI file
    #	(presumably read by hgGoldGapGl - TODO confirm)
    mkdir ffa
    zcat ncbi/sequence.inf.gz > ffa/sequence.inf
    hgGoldGapGl -chromLst=chrom.lst mm6 /cluster/data/mm6 .
    #	compare gold coverage with the two previous mouse assemblies
    featureBits mm6 gold
    #	2597150411 bases of 2597150411 (100.000%) in intersection
    featureBits mm5 gold
    #	2615483787 bases of 2615483787 (100.000%) in intersection
    featureBits mm4 gold
    #	2627444668 bases of 2627444668 (100.000%) in intersection

    #	and the same comparison for the gap table
    featureBits mm6 gap
    #	482483041 bases of 2597150411 (18.577%) in intersection
    featureBits mm5 gap
    #	549468286 bases of 2615483787 (21.008%) in intersection
    featureBits mm4 gap
    #	325167539 bases of 2627444668 (12.376%) in intersection

#############################################################################
#  DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS
    ssh kksilo
    mkdir /cluster/bluearc/mm6
    cd /cluster/data/mm6
    mkdir /cluster/bluearc/mm6/chrom
    cp -p */*.fa /cluster/bluearc/mm6/chrom
    #	(these were removed later)

    # break up into 500,000 sized chunks for repeat masker runs
    #	TOP is the assembly directory; every iteration returns to it.
    #	A glob pattern that matches nothing stays a literal string, so
    #	anything that is not a directory is skipped.
TOP=`pwd`
export TOP
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
do
    [ -d "${CTG_DIR}" ] || continue
    ctg=`basename ${CTG_DIR}`
    cd ${CTG_DIR}
    faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000
    cd ${TOP}
done > tmp/ctg_split.out 2>&1
    #	about 3 minutes, check the tmp/ctg_split.out for anything unusual

    #	make a list of the contigs
    #	Paths are relative to the assembly directory, one per line, in
    #	the form rsync --files-from expects below.  The shell glob is
    #	used directly instead of parsing the output of ls.
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
do
    [ -d "${CTG_DIR}" ] || continue
    ctg=`basename ${CTG_DIR}`
    for F in ${CTG_DIR}/${ctg}_*
    do
        if [ -e "${F}" ]; then
            echo ${F}
        fi
    done
done > contig500K.lst
    #	count 'em
    wc contig500K.lst
    #	6678   6678 176765 contig500K.lst

    mkdir -p /panasas/store/mm6/contigs
    rsync -a --progress --files-from=contig500K.lst . \
        /panasas/store/mm6/contigs/

    ssh kkr1u00
    #	The chrom fasta files go into a chrom/ subdirectory: the simple
    #	repeat run lists /iscratch/i/mm6/chrom/chr*.fa as its input.
    mkdir -p /iscratch/i/mm6/chrom
    cd /iscratch/i/mm6/chrom
    cp -p /cluster/bluearc/mm6/chrom/* .
    /cluster/bin/iSync
    #	verify the contig copy above functioned OK
    find /panasas/store/mm6/contigs -type f | wc
    #	6678    6678  443885

#############################################################################
# SIMPLE REPEAT TRACK (DONE - 2005-03-10 Hiram)
    # TRF can be run in parallel with RepeatMasker on the file server
    #   since it doesn't require masked input sequence.
    #   Run this on the rack 9 cluster     
    ssh kk9 
    mkdir /cluster/data/mm6/bed/simpleRepeat
    cd /cluster/data/mm6/bed/simpleRepeat
    mkdir trf
    #	runTrf <input fasta> <output bed>: a csh job script.  It copies
    #	the input into a private directory under /tmp, runs trfBig
    #	there, copies the bed result to the output path and cleans up.
    cat << '_EOF_' > runTrf
#!/bin/csh -fe 
#
set path1 = $1
set inputFN = $1:t  
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << keep emacs coloring happy
    chmod +x runTrf

    #	gensub2 template: one runTrf job per line of genome.lst, with
    #	the result written to trf/<root of input name>.bed
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy

    #	input list, largest files first (ls -S)
    ls -1S /iscratch/i/mm6/chrom/chr*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    #	be gentle on the start up of these things since each starting
    #	job is a copy of the .fa file, a 'para try' starts 10 jobs
    #	there are only 40 total jobs
    para try
    sleep 30
    para check
    para try
    sleep 30
    para check
    para try
    sleep 30
    para check
    para try
    para check ... all 40 are running at this point, some are already done
    #	the lines after "para time" are its recorded output
    para time
Completed: 40 of 40 jobs
CPU time in finished jobs:      20946s     349.11m     5.82h    0.24d  0.001 y
IO & Wait Time:                  5543s      92.38m     1.54h    0.06d  0.000 y
Average job time:                 662s      11.04m     0.18h    0.01d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            1934s      32.23m     0.54h    0.02d
Submission to last job:          1934s      32.23m     0.54h    0.02d

    # Load into the database
    ssh hgwdev
    cd /cluster/data/mm6/bed/simpleRepeat
    #	one bed file for the whole genome from the per-chrom trf results
    cat trf/chr*.bed > simpleRepeat.bed
    #	NOTE(review): the schema path here is $HOME/src/hg/lib while the
    #	other kent source references in this file use $HOME/kent/src;
    #	presumably a personal ~/src symlink - confirm before re-running.
    hgLoadBed mm6 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    #	Loaded 1152810 elements of size 16

    #	compare coverage with the previous mouse assemblies
    featureBits mm6 simpleRepeat
    #	83220723 bases of 2597150411 (3.204%) in intersection
    featureBits mm5 simpleRepeat
    # 81414259 bases of 2615483787 (3.113%) in intersection
    featureBits mm4 simpleRepeat
    # 82600648 bases of 2627444668 (3.144%) in intersection
    featureBits mm3 simpleRepeat
    # 75457193 bases of 2505900260 (3.011%) in intersection

#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-03-14 - Hiram)

    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/mm6/bed/simpleRepeat
    mkdir trfMask
    #	copy each per-chrom bed file into trfMask/, keeping only the
    #	lines whose fifth column (the period) is 12 or less
    for BED in trf/chr*.bed
    do
	NAME=${BED##*/}
	echo "${BED} -> ${NAME}"
	awk '$5 <= 12' ${BED} > trfMask/${NAME}
    done


#############################################################################
# REPEATMASKER RUN (after contigs have been distributed to panasas FS)
#	(DONE - 2005-03-10 - 2005-03-14 - Hiram)
#  RM Version: RepBase Update 9.11, RM database version 20050112
#	/cluster/bluearc/RepeatMasker050112
    ssh kk

    #- Make the run directory and job list:
    cd /cluster/data/mm6
    #	RMMouse <contig dir> <split fasta file>: a csh job script.  It
    #	copies the file from the panasas contig tree into a private
    #	/tmp/mm6 directory, runs RepeatMasker there, then copies the
    #	.out file (and .align/.tbl/.cat when present) back into
    #	/cluster/data/mm6/<contig dir> and cleans up /tmp.
    cat << '_EOF_' > jkStuff/RMMouse
#!/bin/csh -fe

cd /cluster/data/mm6/$1
pushd .
/bin/mkdir -p /tmp/mm6/$2
/bin/cp /panasas/store/mm6/contigs/$1/$2 /tmp/mm6/$2
cd /tmp/mm6/$2
/cluster/bluearc/RepeatMasker050112/RepeatMasker -ali -s -species mus $2
popd
/bin/cp /tmp/mm6/$2/$2.out ./
if (-e /tmp/mm6/$2/$2.align) /bin/cp /tmp/mm6/$2/$2.align ./
if (-e /tmp/mm6/$2/$2.tbl) /bin/cp /tmp/mm6/$2/$2.tbl ./
if (-e /tmp/mm6/$2/$2.cat) /bin/cp /tmp/mm6/$2/$2.cat ./
/bin/rm -fr /tmp/mm6/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm6/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm6
'_EOF_'
    chmod +x jkStuff/RMMouse

    #	One job line per entry of contig500K.lst.  Each line names the
    #	expected .out file inside a {check out line+ ...} clause.
    mkdir -p RMRun
    rm -f RMRun/RMJobs
    cat contig500K.lst | while read C
    do
	D=`dirname ${C}`
	F=`basename ${C}`
	echo /cluster/data/mm6/jkStuff/RMMouse ${D} ${F} \
		'{'check out line+ /cluster/data/mm6/${D}/${F}.out'}'
    done >> RMRun/RMJobs

    #- Do the run
    ssh kk
    cd /cluster/data/mm6/RMRun
    para create RMJobs
    #	the next line is shorthand for the usual sequence of para
    #	commands, not a literal command line
    para try, para check, para check, para push, para check,...
# Completed: 6678 of 6678 jobs
# CPU time in finished jobs:   45303442s  755057.37m 12584.29h  524.35d  1.437 y
# IO & Wait Time:                100211s    1670.18m    27.84h    1.16d  0.003 y
# Average job time:                6799s     113.32m     1.89h    0.08d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           10760s     179.33m     2.99h    0.12d
# Submission to last job:        121602s    2026.70m    33.78h    1.41d
    #	had cluster contention with other jobs

    #- Lift up the split-contig .out's to contig-level .out's
    ssh kksilo
    cd /cluster/data/mm6
    #	first level: the .out files of the 500K pieces are lifted to one
    #	.out file per contig, using the lift file in each contig directory
    for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
	??/chr??_random_[0-9]*
    do
	CONTIG=`basename ${D}`
	liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft warn \
		${D}/${CONTIG}_[0-9]*.fa.out
    done > tmp/RM.lift.outs 2>&1

    #	second level: contig .out files are lifted to chrom .out files.
    #	The generated script must be run from the assembly directory.
    cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh
#!/bin/sh
# Lift the contig-level RepeatMasker .out files listed in lift/oOut.lst
# and lift/rOut.lst up to chrN.fa.out and chrN_random.fa.out in every
# one- or two-character chrom directory below the current directory.
for C in ? ??
do
    # skip anything that is not a directory; this also covers a glob
    # pattern that matched nothing
    [ -d "${C}" ] || continue
    echo "lifting ${C}"
    # work in a subshell so that a failed cd can never leave the loop
    # in the wrong directory
    (
    cd "${C}" || exit 1
    if [ -s lift/ordered.lft ]; then
	liftUp chr${C}.fa.out lift/ordered.lft warn `cat lift/oOut.lst`
    else
	echo "WARNING: Can not find ${C}/lift/ordered.lft"
    fi
    if [ -s lift/random.lft ]; then
	liftUp chr${C}_random.fa.out lift/random.lft warn `cat lift/rOut.lst`
    fi
    )
done
'_EOF_'
    # << keep emacs coloring happy
    chmod +x jkStuff/liftRM_out_to_chr.sh
    ./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1
    #	scan the results tmp/liftRM_out_to_chr.out
    #	there is a single: WARNING: Can not find Un/lift/ordered.lft
    #	which is OK
    #	List the final .out files, nothing should be size 0:
    ls -og */*.fa.out

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/mm6
    #	load the chrom-level RepeatMasker .out files, chroms and randoms
    hgLoadOut mm6 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \
	??/chr??_random.fa.out
    #	about 7 minutes, there are always a few of these errors
    #	(the lines below are the recorded hgLoadOut output):
Strange perc. field -0.1 line 179923 of 1/chr1.fa.out
Strange perc. field -0.1 line 190937 of 1/chr1.fa.out
Strange perc. field -0.1 line 83366 of 5/chr5.fa.out
Strange perc. field -4.5 line 57734 of 7/chr7.fa.out
Strange perc. field -3.1 line 110634 of 10/chr10.fa.out
Strange perc. field -9.2 line 110634 of 10/chr10.fa.out
Strange perc. field -0.7 line 44931 of 14/chr14.fa.out
Strange perc. field -0.1 line 952 of 9/chr9_random.fa.out
Loading up table chrUn_random_rmsk
note: 394 records dropped due to repStart > repEnd
      run with -verbose=2 for details

    #	verify everything seems normal compared with previous builds
    featureBits mm6 rmsk
    #	1110222842 bases of 2597150411 (42.748%) in intersection
    featureBits mm5 rmsk
    #	1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits mm4 rmsk
    #	1130883581 bases of 2627444668 (43.041%) in intersection
    featureBits mm3 rmsk
    #	1080265553 bases of 2505900260 (43.109%) in intersection

    #	the same comparison with gaps counted in the denominator
    featureBits -countGaps mm6 rmsk
    #	1110222842 bases of 3079633452 (36.050%) in intersection
    featureBits -countGaps mm5 rmsk
    #	1137310280 bases of 3164952073 (35.935%) in intersection
    featureBits -countGaps mm4 rmsk
    #	1130883581 bases of 2952612207 (38.301%) in intersection
    featureBits -countGaps mm3 rmsk
    #	1080265553 bases of 2708220133 (39.888%) in intersection


#############################################################################
# GC5BASE (DONE - 2005-03-10 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm6/bed/gc5Base
    cd /cluster/data/mm6/bed/gc5Base
    #	GC percent in 5-base windows, piped straight into wigEncode,
    #	which writes the gc5Base.wig and gc5Base.wib pair
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm6 \
        /cluster/data/mm6 | wigEncode stdin gc5Base.wig gc5Base.wib
    #       Calculating gcPercent with window size 5
    #       Using twoBit: /cluster/data/mm6/mm6.2bit
    #       File stdout created
    #	Converted stdin, upper limit 100.00, lower limit 0.00

    #	runs for about 50 minutes

    #	the .wib file is reached through a symlink in /gbdb/mm6/wib;
    #	the .wig file is loaded into the gc5Base table
    mkdir /gbdb/mm6/wib
    ln -s `pwd`/gc5Base.wib /gbdb/mm6/wib
    hgLoadWiggle -pathPrefix=/gbdb/mm6/wib mm6 gc5Base gc5Base.wig

    #	verify index is correct:
    hgsql mm6 -e "show index from gc5Base;"
    #	should see good numbers in the Cardinality column; presumably
    #	when it shows NULL instead, the analyze below is what fixes it:
    hgsql mm6 -e "analyze table gc5Base;"

#############################################################################
# PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
#		(DONE - 2005-03-14 - Hiram)
    ssh kksilo
    cd /cluster/data/mm6
    #	Soft-mask every chrom and _random fasta in place: first with the
    #	RepeatMasker .out file, then adding the filtered simple repeats.
    for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	#	C is the sequence name: the path without directory and .fa
	FA=${CHR#*\/}
	C=${FA%.fa}
	echo -n "repeat masking ${C} ... "
	/cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
	echo -n "adding simpleRepeats ... "
	/cluster/bin/i386/maskOutFa -softAdd ${CHR} \
		bed/simpleRepeat/trfMask/${C}.bed ${CHR}
	echo "done - ${CHR}"
    done > tmp/addRM_and_Simple.out 2>&1

    # you will note the usual warnings about troublesome coordinates
    # in the repeat masker outputs - even more than when they were lifted.

    #	and make the hard masked sequences from these soft masked sequences
    #	(the log goes to the assembly tmp/ directory, next to the log of
    #	the loop above, rather than to the system /tmp)
    for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	echo "maskOutFa ${CHR} hard ${CHR}.masked"
	/cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked
    done > tmp/hardMask.out 2>&1

    #	rebuild the 2bit file from the soft masked sequence
    faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
	mm6Soft.2bit
    #	verify the sequence is still the same size as before.  The NEW
    #	file must be the one examined: chrom.sizes was itself made from
    #	the old mm6.2bit, so checking mm6.2bit would always agree.
    #	"sort -rn -k2" gives the same order as the obsolete "sort -rn +1"
    #	that produced chrom.sizes.
    twoBitInfo mm6Soft.2bit stdout | sort -rn -k2 | sum -r
    #	62443     1
    sum -r chrom.sizes
    #	62443     1
    #	replace the former unmasked 2bit file with this new one:
    rm mm6.2bit
    mv mm6Soft.2bit mm6.2bit
    #	check the browser, make sure it is functioning OK

    #	Copy to panasas unit for cluster runs
    cp -p mm6.2bit /panasas/store/mm6/mm6.2bit
    mkdir /panasas/store/mm6/fasta
    time cp -p */*.fa */*.fa.masked /panasas/store/mm6/fasta

#############################################################################
# PREPARE "bigZips" files for public release
#		(DONE through mrna.fa - 2005-03-15 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm6/bigZips
    cd /usr/local/apache/htdocs/goldenPath/mm6/bigZips
    #	start from the mm5 README
    cp -p /usr/local/apache/htdocs/goldenPath/mm5/bigZips/README.txt .
    # edit README.txt to indicate proper version of sequence and
    #	RepeatMasker
    #	One tarball per file type, built from the chrom directories:
    #	AGP, soft-masked fasta, hard-masked fasta, RepeatMasker .out
    cd /cluster/data/mm6
    tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromAgp.tar.gz \
	?/chr*.agp ??/chr*.agp
    tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromFa.tar.gz \
	?/chr*.fa ??/chr*.fa
    tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromFaMasked.tar.gz \
	?/chr*.fa.masked ??/chr*.fa.masked
    tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromOut.tar.gz \
	?/chr*.fa.out ??/chr*.fa.out
    #	chromTrf holds the filtered (trfMask) simple repeat bed files
    cd /cluster/data/mm6/bed/simpleRepeat
    tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromTrf.tar.gz \
	./trfMask
    # get GenBank native mRNAs
    cd /cluster/data/genbank
    ./bin/i386/gbGetSeqs -db=mm6 -native GenBank mrna \
	/usr/local/apache/htdocs/goldenPath/mm6/bigZips/mrna.fa
    cd /usr/local/apache/htdocs/goldenPath/mm6/bigZips
    gzip mrna.fa

    # add upstreams file (Heather, Sept. 2005)
    #	sequence upstream of refGene items, for 1000, 2000 and 5000 bases
    cd /usr/local/apache/htdocs/goldenPath/mm6/bigZips
    nice featureBits mm6 refGene:upstream:1000 -fa=upstream1000.fa
    nice gzip upstream1000.fa
    nice featureBits mm6 refGene:upstream:2000 -fa=upstream2000.fa
    nice gzip upstream2000.fa
    nice featureBits mm6 refGene:upstream:5000 -fa=upstream5000.fa
    nice gzip upstream5000.fa
    
    #	checksums of every .gz file in bigZips
    md5sum *.gz > md5sum.txt

#############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2005-03-14 - Hiram)

    ssh kksilo
    mkdir /panasas/store/mm6/rmsk
    cd /cluster/data/mm6
    cp -p */chr*.fa.out /panasas/store/mm6/rmsk
    #	DateRepeats is run in a separate directory, on symlinks to the
    #	.out files copied above
    mkdir /panasas/store/mm6/rmsk.spec
    cd /panasas/store/mm6/rmsk.spec
    ln -s ../rmsk/*.out .

    for FN in chr*.fa.out
    do
	echo ${FN}
	/cluster/bluearc/RepeatMasker050112/DateRepeats \
	    ${FN} -query mouse -comp human -comp rat -comp dog -comp cow
    done
    #	takes about 30 minutes

    cd /panasas/store/mm6
    mkdir linSpecRep.notInHuman
    mkdir linSpecRep.notInRat
    mkdir linSpecRep.notInDog
    mkdir linSpecRep.notInCow
    #	One .out.spec file per chrom and comparison species.  The
    #	numeric argument to extractRepeats presumably selects the
    #	species in the order of the -comp options above (1 human,
    #	2 rat, 3 dog, 4 cow) - TODO confirm against extractRepeats.
    for F in rmsk.spec/chr*.out_homo-sapiens*
    do
	#	B is the chrom name: F without the directory and without
	#	everything from ".fa.out" onward
	B=${F/rmsk.spec\/}
	B=${B/.fa.out*/}
	echo $B 
        /cluster/bin/scripts/extractRepeats 1 ${F} > \
		linSpecRep.notInHuman/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 2 ${F} > \
		linSpecRep.notInRat/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 3 ${F} > \
		linSpecRep.notInDog/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 4 ${F} > \
		linSpecRep.notInCow/${B}.out.spec
    done
    #	the notInHuman, notInDog, and notInCow ended up being identical

#############################################################################
# NIBS for BLASTZ (DONE - 2005-03-15 - Hiram)
    #	turns out not all the details are worked out with the blastz
    #	script to allow it to use a 2bit file for target.  So, we will
    #	need nib files until then.  Eventually this requirement should
    #	be eliminated.
    ssh kksilo
    cd /cluster/data/mm6
    mkdir nib
    #	one soft-masked nib file per chrom or _random fasta, named after
    #	the fasta file without its directory and .fa suffix
    for FA in ?/chr?*.fa ??/chr??*.fa
    do
	NIB=nib/`basename ${FA} .fa`.nib
	echo faToNib -softMask ${FA} ${NIB}
	rm -f ${NIB}
	faToNib -softMask ${FA} ${NIB}
    done
    mkdir /panasas/store/mm6/nib
    cp -p nib/* /panasas/store/mm6/nib

#########################################################################
# CONTIG SPLIT UP - (DONE - 2005-03-24 - Hiram)
    #	A first attempt was made to allow the genbank alignments to work
    #	from just the 2bit file, but that leads to some large job
    #	situations and things are not split up as best as they could be.
    #	A survey was taken of the gaps and longest stretch of unbroken
    #	sequence (see /cluster/data/mm6/gapAnalysis/ )
    #	and there are only 30 stretches of sequence longer than 5 Mb
    #	Going to try an faSplit in a 10 Mb basis (this used to be 5
    #	Mb in the past, split on contigs) and see how that goes.

#############################################################################
# BREAK UP SEQUENCE INTO 10 MB CHUNKS AT GAPS OF AT LEAST 100
    ssh kksilo
    cd /cluster/data/mm6
    mkdir ctgs10Mb
    mkdir ctgs10Mb/lift
    #	For each chrom directory, split the chrom fasta and, when
    #	present, the _random fasta with faSplit in "gap" mode (size
    #	10000000, -minGapSize=100).  The pieces go to ctgs10Mb/<chrom>/
    #	and one lift file per sequence goes to ctgs10Mb/lift/
    for C in ? ??
    do
	mkdir ctgs10Mb/${C}
	if [ -s ${C}/chr${C}.fa ]; then
	    echo -n "working: chr${C} ... "
	    $HOME/bin/i386/faSplit -minGapSize=100 \
		-lift=ctgs10Mb/lift/chr${C}.lft gap \
		    ${C}/chr${C}.fa 10000000 ctgs10Mb/${C}/chr${C}_
	fi
	if [ -s ${C}/chr${C}_random.fa ]; then
	    echo -n "working: chr${C}_random ... "
	    $HOME/bin/i386/faSplit -minGapSize=100 \
		-lift=ctgs10Mb/lift/chr${C}_random.lft gap \
		    ${C}/chr${C}_random.fa 10000000 \
			ctgs10Mb/${C}/chr${C}_random_
	fi
    done
    #	NOTE: this overwrites the jkStuff/liftAll.lft made earlier from
    #	the ?/lift and ??/lift files; from here on liftAll.lft describes
    #	the 10 Mb pieces.  A copy goes to panasas for the cluster.
    cat ctgs10Mb/lift/*.lft > jkStuff/liftAll.lft
    cp -p jkStuff/liftAll.lft /panasas/store/mm6

#########################################################################
# GENBANK auto update started (DONE - 2005-03-15 - 2005-03-29 - Hiram)
    ssh eieio
    cd /cluster/data/genbank
    #	edit etc/genbank.conf, add the following section:
# mm6
mm6.genome = /panasas/store/mm6/mm6.2bit
mm6.lift = /panasas/store/mm6/liftAll.lft
mm6.downloadDir = mm6
mm6.genbank.est.xeno.load = yes
mm6.mgcTables.default = full
mm6.mgcTables.mgc = all

    #	Do the refseq's first, they are the quick ones
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial mm6
    #	var/build/logs/2005.03.25-12:18:01.mm6.initalign.log
    #	real    109m23.547s
    #	user    4m8.057s
    #	sys     1m18.459s
# Completed: 5190 of 5190 jobs
# CPU time in finished jobs:      44385s     739.76m    12.33h    0.51d  0.001 y
# IO & Wait Time:                 20723s     345.38m     5.76h    0.24d  0.001 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             278s       4.63m     0.08h    0.00d
# Submission to last job:          6017s     100.28m     1.67h    0.07d

    # Load the results from the above
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm6
    #	var/dbload/hgwdev/logs/2005.03.25-15:31:22.dbload.log
    #	real    2m28.355s
    #	user    0m19.830s
    #	sys     0m10.180s

    #	check the RefSeq Genes track shows up in the browser
    #	table browser query on RefSeq Genes whole genome, summary stats
    #	indicates 18,397 items covering 713,077,002 (%27.46) bases

    #	To get the genbank started, the above results need to be
    #	moved out of the way.  These things can be removed if there are
    #	no problems to debug
    ssh eieio
    cd /cluster/data/genbank/work
    mv initial.mm6 initial.mm6.refseq.mrna
    # or: rm -fr initial.mm6

    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial mm6
    #	logFile: var/build/logs/2005.03.25-17:13:13.mm6.initalign.log
    #	RUNNING - 2005-03-25 17:30
    #	There was one incredibly long job that occupied most of the time
    #	real    741m1.285s
    #	user    88m31.751s
    #	sys     25m6.943s
# Completed: 27680 of 27680 jobs
# CPU time in finished jobs:    5495665s   91594.42m  1526.57h   63.61d  0.174 y
# IO & Wait Time:                114623s    1910.38m    31.84h    1.33d  0.004 y
# Average job time:                 203s       3.38m     0.06h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           13544s     225.73m     3.76h    0.16d
# Submission to last job:         37663s     627.72m    10.46h    0.44d

    # Load the results from the above
    ssh hgwdev
    cd /cluster/data/genbank
    # There is a lock file present, I believe from the previous load:
    ls -l var/dbload/hgwdev/run
    #	-rw-r--r--    1 hiram    protein        18 Mar 15 10:08 dbload.lock
    # checking that it is actually owned by yourself, it is safe to remove
    #	it.  The next load will not proceed with this lock in place.
    rm var/dbload/hgwdev/run/dbload.lock

    time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm6
    #	var/dbload/hgwdev/logs/2005.03.26-08:47:42.dbload.log
    #	22 minute load time

    #	And finally, the big est run - expect several days on this one
    ssh eieio
    cd /cluster/data/genbank/work
    mv initial.mm6 initial.mm6.genbank.mrna

    #	since it is going to run several days, create a screen for it to
    #	run in.  detach and reattach as necessary to view the progress
    #	of the job
    cd /cluster/data/genbank
    screen
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial mm6
    #	var/build/logs/2005.03.26-09:00:22.mm6.initalign.log
    #	STARTED 2005-03-26 09:13
    #	FINISHED 2005-03-28 11:53
    #	about 50 hours run time

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh eieio
    screen -d -r
# Completed: 159852 of 159852 jobs
# CPU time in finished jobs:  110196174s 1836602.90m 30610.05h 1275.42d  3.494 y
# IO & Wait Time:               1230416s   20506.93m   341.78h   14.24d  0.039 y
# Average job time:                 697s      11.62m     0.19h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           41516s     691.93m    11.53h    0.48d
# Submission to last job:        137193s    2286.55m    38.11h    1.59d

    ssh hgwdev
    cd /cluster/data/genbank
    time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm6
    #	many tables are loaded by this load:
    #	refFlat, refGene, refLink, refSeqAli, refSeqStatus, refSeqSummary
    #	mgcGenes mgcFullMrna mgcFullStatus intronEst all_est chr*est
    #	estOrientInfo xenoEst
    #	var/dbload/hgwdev/logs/2005.03.28-13:38:13.dbload.log
    #	LOADING - 2005-03-28 - 13:35
    #	FINISHED - 2005-03-29 - 00:43
    #	real    665m1.596s
    #	user    105m19.790s
    #	sys     33m10.390s
    #	Note, that is an 11 hour load time.
    #	Measurements:  (these are interesting in the table browser too)
    #	RefSeq Genes
    featureBits mm6 refGene
    #	41752877 bases of 2597150411 (1.608%) in intersection
    #	MGC Genes
    featureBits mm6 refFlat
    #	41752877 bases of 2597150411 (1.608%) in intersection
    featureBits mm6 refSeqAli
    #	41738603 bases of 2597150411 (1.607%) in intersection
    #	additionally created tables by the genbank process that can not
    #	be measured with featureBits
    #	refLink, refSeqStatus, refSeqSummary
    featureBits mm6 mgcGenes
    #	27174785 bases of 2597150411 (1.046%) in intersection
    #	Mouse mRNAs
    featureBits mm6 all_mrna
    #	112068807 bases of 2597150411 (4.315%) in intersection
    #	Spliced ESTs
    featureBits mm6 intronEst
    #	52812872 bases of 2597150411 (2.033%) in intersection
    #	Mouse ESTs
    featureBits mm6 est
    #	236687034 bases of 2597150411 (9.113%) in intersection
    #	Non-Mouse mRNAs
    featureBits mm6 xenoMrna
    #	52119099 bases of 2597150411 (2.007%) in intersection
    #	Non-Mouse ESTs
    featureBits mm6 xenoEst

#########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE 2005-03-14 - 2005-03-17 Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm6/bed/genscan
    cd /cluster/data/mm6/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/mm6/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    ls -1S /panasas/store/mm6/fasta/*.masked > genome.list

    # XXX There is an error in the following template, note the extra
    # space between the - and par=
    #	It turns out the default for the -par argument is this same
    #	matrix so the extra space had no effect on the end result.
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
    #	Had two jobs crash:
# Completed: 38 of 40 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:     343416s    5723.60m    95.39h    3.97d  0.011 y
# IO & Wait Time:                  3443s      57.38m     0.96h    0.04d  0.000 y
# Average job time:                9128s     152.13m     2.54h    0.11d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           27423s     457.05m     7.62h    0.32d
# Submission to last job:         34524s     575.40m     9.59h    0.40d


    # If there are crashes, diagnose with "para problems".  
    para problems > problems.0
    #	Two of them needed to be rerun, adjust window down to 2000000 to
    #	get them to complete.  Lower that number if the error persists.
    ssh kolossus
    cd /cluster/data/mm6/bed/genscan
    # XXX There is an error in the following commands, note the extra
    # space between the - and par=
    #	It turns out the default for the -par argument is this same
    #	matrix so the extra space had no effect on the end result.
    /cluster/bin/x86_64/gsBig /panasas/store/mm6/fasta/chr2.fa.masked gtf/chr2.fa.gtf -trans=pep/chr2.fa.pep -subopt=subopt/chr2.fa.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000
    /cluster/bin/x86_64/gsBig /panasas/store/mm6/fasta/chr14.fa.masked gtf/chr14.fa.gtf -trans=pep/chr14.fa.pep -subopt=subopt/chr14.fa.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000

    # cat the results into single files
    ssh kksilo
    cd /cluster/data/mm6/bed/genscan
    cat gtf/chr?.fa.gtf gtf/chr??.fa.gtf gtf/chr?_random.fa.gtf \
	gtf/chr??_random.fa.gtf > genscan.gtf
    cat subopt/chr?.fa.bed subopt/chr??.fa.bed subopt/chr?_random.fa.bed \
	subopt/chr??_random.fa.bed > genscanSubopt.bed
    cat pep/chr?.fa.pep pep/chr??.fa.pep pep/chr?_random.fa.pep \
	pep/chr??_random.fa.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/mm6/bed/genscan
    ldHgGene mm6 -gtf genscan genscan.gtf
    hgPepPred mm6 generic genscanPep genscan.pep
    hgLoadBed mm6 genscanSubopt genscanSubopt.bed

    #	check the numbers
    featureBits mm6 genscan
    #	54894283 bases of 2597150411 (2.114%) in intersection
    featureBits mm5 genscan
    #	55024722 bases of 2615483787 (2.104%) in intersection
    featureBits mm4 genscan
    #	56164126 bases of 2627444668 (2.138%) in intersection
    featureBits mm3 genscan
    #	51697165 bases of 2505900260 (2.063%) in intersection

    featureBits mm6 genscanSubopt
    #	57856316 bases of 2597150411 (2.228%) in intersection
    featureBits mm5 genscanSubopt
    #	58474899 bases of 2615483787 (2.236%) in intersection
    featureBits mm4 genscanSubopt
    #	59601009 bases of 2627444668 (2.268%) in intersection
    featureBits mm3 genscanSubopt
    #	56085184 bases of 2505900260 (2.238%) in intersection


#########################################################################
#	BLASTZ NOTE:  with the advent of Angie's script to run the
#	blastz process through to chains and nets loaded into the
#	database and download files prepared, it is now a juggling act
#	to see which klusters are available.  The particular options to
#	the script to make it go to one kluster or another are to be
#	determined at run-time.  The typical run-times listed here will
#	be a factor in your choice of kluster to operate on.
#########################################################################
# BLASTZ HUMAN Hg17 (DONE - 2005-03-14 - 2005-03-18 - Hiram)
    ssh kk
    mkdir /cluster/data/mm6/bed/blastzHg17.2005_03_14
    cd /cluster/data/mm6/bed/blastzHg17.2005_03_14
    cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse
SEQ1_DIR=/panasas/store/mm6/nib
# not used
SEQ1_RMSK=/panasas/store/mm6/rmsk
# not used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Human
SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs
# RMSK not currently used
SEQ2_RMSK=
# FLAG not currently used
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzHg17.2005_03_14

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/hg17/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \
	blast.run.out 2>&1 &
    #	real    993m28.547s
    #	user    0m0.198s
    #	sys     0m0.171s
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	STARTED - 2005-03-17 21:25
    #	FINISHED - 2005-03-18 14:00
# Completed: 45347 of 45347 jobs
# CPU time in finished jobs:   16921981s  282033.02m  4700.55h  195.86d  0.537 y
# IO & Wait Time:               2381711s   39695.18m   661.59h   27.57d  0.076 y
# Average job time:                 426s       7.09m     0.12h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            9568s     159.47m     2.66h    0.11d
# Submission to last job:         58695s     978.25m    16.30h    0.68d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:        272s       4.54m     0.08h    0.00d  0.000 y
# IO & Wait Time:                  1145s      19.08m     0.32h    0.01d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest job:                       24s       0.40m     0.01h    0.00d
# Submission to last job:           265s       4.42m     0.07h    0.00d

    #	The kki batch doChainRun.csh appears to have failed
    #	due to underlying changes in the location of hg17 items
    #	fixup the symlinks which are in a state of flux today, then,
    #	to recover:
    ssh kki
    cd /cluster/data/mm6/bed/blastzHg17.2005_03_14/axtChain/run
    rm -fr chain
    time ./doChainRun.csh
    #	real    22m47.917s
    #	user    0m0.380s
    #	sys     0m0.630s
# Completed: 40 of 40 jobs
# CPU time in finished jobs:       6373s     106.22m     1.77h    0.07d  0.000 y
# IO & Wait Time:                   552s       9.20m     0.15h    0.01d  0.000 y
# Average job time:                 173s       2.89m     0.05h    0.00d
# Longest job:                      662s      11.03m     0.18h    0.01d
# Submission to last job:          1200s      20.00m     0.33h    0.01d

    #	That was the last part of the chainRun step, can now continue:
    ssh kksilo
    cd /cluster/data/mm6/bed/blastzHg17.2005_03_14
    time /cluster/bin/scripts/doBlastzChainNet.pl -continue chainMerge `pwd`/DEF > chainMerge.run.out 2>&1 &
    # STARTED - 2005-03-18 15:00
    # FINISHED 2005-03-18 16:33

    #	checking the numbers for sanity:
    ssh hgwdev
    #	expect ~ 2m30 seconds for chain measurement
    time featureBits mm6 chainHg17
    #	2596946329 bases of 2597150411 (99.992%) in intersection
    time featureBits mm5 chainHg17
    #	2507720521 bases of 2615483787 (95.880%) in intersection


    #	expect ~ 2m30s for net measurement
    time featureBits mm6 netHg17
    #	2579747741 bases of 2597150411 (99.330%) in intersection
    time featureBits mm5 netHg17
    #	2504056038 bases of 2615483787 (95.740%) in intersection

    ssh kolossus
    #	expect ~ 20-22 minutes for the chainLink measurement
    HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
	featureBits mm6 chainHg17Link
    #	966916309 bases of 2597150411 (37.230%) in intersection
    HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
	featureBits mm5 chainHg17Link
    #	1025750185 bases of 2615483787 (39.218%) in intersection

    #	swap results to place mm6 alignments onto Hg17
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
	swap.run.out 2>&1 &
    #	STARTED - 2005-03-29 - 15:58
    #	FINI - 2005-03-29 - 18:48
    #	real    171m26.172s
    #	user    0m2.270s
    #	sys     0m0.870s

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm6Link
    #	969459954 bases of 2866216770 (33.824%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm5Link
    #	1020106336 bases of 2866216770 (35.591%) in intersection

    #	A measurement script to do all featureBits combinations:
    cd /cluster/data/mm6/jkStuff
    cat << '_EOF_' >  netChainCheck.sh
#!/bin/sh

# Print the command-line synopsis for netChainCheck.sh on stdout.
# printf is used instead of "echo -e", which is not portable under
# #!/bin/sh (some sh implementations print the "-e" literally).
usage()
{
printf '%s\n' \
    "usage: netChainCheck.sh <db0> <db1> <targetDb>" \
    "    does: featureBits <db0> net<targetDb>" \
    "          featureBits <db1> net<targetDb>" \
    "    as well as the chain and chainLink tables," \
    "    and on the targetDb:" \
    "          featureBits <targetDb> net<db0>" \
    "          featureBits <targetDb> net<db1>" \
    "    and the chain and chainLink tables."
printf '\t%s\n' "example: netChainCheck.sh mm6 mm5 fr1"
}

# Run one timed featureBits measurement.
#   $1 - database name
#   $2 - table name
# Prints the command being run, then a "    #<tab>" prefix with no
# trailing newline, so that whatever is written next continues on a
# line that starts with a comment marker.
# printf replaces "echo -en", which is not portable under #!/bin/sh;
# the arguments are quoted so they reach featureBits unsplit.
doOne()
{
db=$1
tbl=$2
printf '    featureBits %s %s\n' "$db" "$tbl"
printf '    #\t'
time featureBits "$db" "$tbl"
}

# Print $1 on stdout with its first character upper-cased
# (e.g. "fr1" -> "Fr1"); the remainder of the string is left untouched.
# Uses POSIX parameter expansion to split the string, so only the single
# tr call is spawned, and printf so that option-like input such as "-n"
# is not swallowed the way echo would swallow it.
ucFirstLetter()
{
ucString="$1"
# everything after the first character
rest=${ucString#?}
# the first character: the string with that remainder stripped off
fc=${ucString%"$rest"}
FC=$(printf '%s' "${fc}" | tr 'a-z' 'A-Z')
printf '%s\n' "${FC}${rest}"
}

# Exactly three arguments are required; anything else prints the usage
# text and exits with status 255.
if [ "$#" -ne 3 ]; then
    usage
    exit 255
fi

db0=$1
db1=$2
targetDb=$3

# Table names embed the other database's name with its first letter
# capitalized (net<Db>, chain<Db>, chain<Db>Link), so derive those forms.
targetDB=$(ucFirstLetter "${targetDb}")
DB0=$(ucFirstLetter "${db0}")
DB1=$(ucFirstLetter "${db1}")

export db0 db1 targetDb targetDB DB0 DB1
# echo "${db0} ${db1} ${targetDb} ${targetDB} ${DB0} ${DB1}"

# Tables about targetDb, measured on each of db0 and db1
doOne "${db0}" "net${targetDB}"
doOne "${db1}" "net${targetDB}"
doOne "${db0}" "chain${targetDB}"
doOne "${db1}" "chain${targetDB}"
doOne "${db0}" "chain${targetDB}Link"
doOne "${db1}" "chain${targetDB}Link"
# The reverse direction: tables about db0 and db1, measured on targetDb
doOne "${targetDb}" "net${DB0}"
doOne "${targetDb}" "net${DB1}"
doOne "${targetDb}" "chain${DB0}"
doOne "${targetDb}" "chain${DB1}"
doOne "${targetDb}" "chain${DB0}Link"
doOne "${targetDb}" "chain${DB1}Link"
'_EOF_'
    # << keep emacs coloring happy

#########################################################################
# BLASTZ RAT Rn3 (FIRST TRY - 2005-03-15 - 2005-03-17 - Hiram)
#	THESE ARE THE CORRECT PARAMETERS - the second try was not used
#		it was too restrictive and cuts out too many alignments
    ssh kksilo
    #	the work directory must agree with BASE= in the DEF file below
    mkdir /cluster/data/mm6/bed/blastzRn3.2005_03_15
    cd /cluster/data/mm6/bed/blastzRn3.2005_03_15
    cat << '_EOF_' > DEF
# mouse vs. rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Mouse
SEQ1_DIR=/panasas/store/mm6/nib
# not used
SEQ1_RMSK=/panasas/store/mm6/rmsk
# not used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# RMSK not currently used
SEQ2_RMSK=
# FLAG not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzRn3.2005_03_15

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/rn3/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \
	blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
# Completed: 40713 of 40713 jobs
# CPU time in finished jobs:   18170174s  302836.24m  5047.27h  210.30d  0.576 y
# IO & Wait Time:               1770530s   29508.83m   491.81h   20.49d  0.056 y
# Average job time:                 490s       8.16m     0.14h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           28252s     470.87m     7.85h    0.33d
# Submission to last job:         69864s    1164.40m    19.41h    0.81d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:       1168s      19.46m     0.32h    0.01d  0.000 y
# IO & Wait Time:                  3047s      50.79m     0.85h    0.04d  0.000 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest job:                      119s       1.98m     0.03h    0.00d
# Submission to last job:           359s       5.98m     0.10h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:      12274s     204.56m     3.41h    0.14d  0.000 y
# IO & Wait Time:                  1719s      28.66m     0.48h    0.02d  0.000 y
# Average job time:                 350s       5.83m     0.10h    0.00d
# Longest job:                     1016s      16.93m     0.28h    0.01d
# Submission to last job:          1482s      24.70m     0.41h    0.02d

    #	After this was complete, realized that it needs a minScore
    #	filter on the chaining step.   Also, we need some pslChrom files
    #	for Gill's work:
    ssh kksilo
    cd /cluster/data/mm6/bed/blastzRn3.2005_03_15
    cat << '_EOF_' > mkPslChrom.sh
#!/bin/sh

# Rebuild pslChrom/ in the current directory: one uncompressed .psl file
# per sequence name, concatenated from the compressed pieces in pslParts/.

# An existing pslChrom is renamed first so that its (slow) removal can
# run in the background while the new directory is being filled.
if [ -d pslChrom ]; then
    mv pslChrom pslChrom.0
    rm -fr pslChrom.0 &
fi

mkdir pslChrom

# The sequence name is the part-file name up to its ".nib" component.
ls pslParts | sed -e 's/\.nib.*//' | sort -u | while read -r C
do
    printf 'working: %s ... ' "${C}"
    # Order the pieces by their first ':'-separated field, then numerically
    # by the third, before concatenating.  The command substitution is left
    # unquoted on purpose: it must split into one argument per file name.
    zcat $(ls "pslParts/${C}".nib* | sort --field-separator=':' -k1,1 -k3,3n) \
            > "pslChrom/${C}.psl"
    echo "done"
done
'_EOF_'
    # << keep emacs coloring happy
    chmod +x mkPslChrom.sh
    ./mkPslChrom.sh

    #	After the experiment of 2005-03-22
    #	RELOADING these chains and nets
    ssh hgwdev
    hgsql mm6 -e "drop table netRn3;"
    hgsql mm6 -e "show tables;" | grep chainRn3 | while read T
    do
	hgsql mm6 -e "drop table ${T};"
	echo ${T}
    done
    #	kksilo currently off-limits to logins due to hardware difficulties
    ssh kolossus
    cd /cluster/data/mm6/bed/blastzRn3.2005_03_15/axtChain
    chainSplit chain mm6.rn3.all.chain.gz
    ssh hgwdev
    cat << '_EOF_' > reLoad.csh
#!/bin/csh -ef
# Load chains:
#   each *.chain file found in the chain/ directory (produced by the
#   chainSplit step above) is loaded into a table named after the file.
cd /cluster/data/mm6/bed/blastzRn3.2005_03_15/axtChain/chain
foreach f (*.chain)
    # $f:r is the file name with its extension removed; it becomes the
    # table-name prefix, giving <root>_chainRn3
    set c = $f:r
    # echo the command first so progress is visible in the output
    echo "hgLoadChain mm6 ${c}_chainRn3 $f"
    hgLoadChain mm6 ${c}_chainRn3 $f
end

cd /cluster/data/mm6/bed/blastzRn3.2005_03_15/axtChain
# Load nets:
#   the net file is passed through netFilter with -minGap=10 and the
#   result is piped on stdin to hgLoadNet, which loads table netRn3
netFilter -minGap=10 mm6.rn3.net.gz | hgLoadNet -verbose=0 mm6 netRn3 stdin
'_EOF_'
    # << keep emacs coloring happy

    chmod +x reLoad.csh
    time ./reLoad.csh
    #	~ 188m == 3 hours 8 min

    #	Measurements:
    time featureBits mm6 netRn3
    #	expect ~ 2m 12s
    #	2720144602 bases of 2597150411 (104.736%) in intersection
    time featureBits mm5 netRn3
    #	2638255333 bases of 2615483787 (100.871%) in intersection

    time featureBits mm6 chainRn3
    #	expect ~ 10m 30s to 13m 25s
    #	2768422449 bases of 2597150411 (106.595%) in intersection
    time featureBits mm5 chainRn3
    #	2646682349 bases of 2615483787 (101.193%) in intersection

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainRn3Link
    #	1802980225 bases of 2597150411 (69.421%) in intersection
    #	real    94m48.021s
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm5 chainRn3Link
    #	1798705001 bases of 2615483787 (68.771%) in intersection
    #	real    76m44.580s

    #	Prepare for a re-run of the swap, move the 2005_03_22 swap
    #	results out of the way
    ssh hgwdev
    cd /cluster/data/rn3/bed
    mv blastz.mm6.swap blastz.mm6.swap.2005_03_22

    cd /cluster/data/mm6/bed/blastzRn3.2005_03_15
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
	swap.run_1.out 2>&1 &
    #	STARTED - 2005-03-29 15:55
    #	FINI - 2005-03-30 05:21
    #	real    807m3.833s
    #	user    0m2.200s
    #	sys     0m1.150s

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm6Link
    #	1812992492 bases of 2571104688 (70.514%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm5Link
    #	1673171206 bases of 2571104688 (65.076%) in intersection


#########################################################################
# BLASTZ RAT REDONE 2005-03-22 
#	THIS WAS AN EXPERIMENT - THESE RESULTS WERE DROPPED FROM THE DB
#	 more stringent BLASTZ parameters and chain filtering
#   COMPLETE 2005-03-23 - swap to place chainMm6 and netMm6 on rn3 browser
    ssh kksilo
    mkdir /cluster/data/mm6/bed/blastzRn3.2005_03_22
    cd /cluster/data/mm6/bed/blastzRn3.2005_03_22
### XXXX - 2005-03-31 - THERE IS AN ERROR IN THIS DEF FILE SPECIFYING
### the SEQ2_LEN but it didn't seem to matter.  The blastz run appears
### to have used SEQ2 correctly despite this incorrect specification.
    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/bin/scripts:/cluster/home/angie/schwartzbin/

# mouse vs. rat
#	more stringent parameters to reduce output
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=50000
BLASTZ_T=2
# scoring matrix
BLASTZ_Q=/cluster/data/blastz/mus_rat.q 
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse (mm6)
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat (rn3)
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes
#  XXXXXXXXXXXXXXX   ^^^^^^^  wrong file ! XXXXXXXXXXXXXX

BASE=/cluster/data/mm6/bed/blastzRn3.2005_03_22

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/rn3/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore 5000 \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	These more strict blastz parameters make this run much faster;
# Completed: 40713 of 40713 jobs
# CPU time in finished jobs:    4813023s   80217.06m  1336.95h   55.71d  0.153 y
# IO & Wait Time:               1788355s   29805.91m   496.77h   20.70d  0.057 y
# Average job time:                 162s       2.70m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            2543s      42.38m     0.71h    0.03d
# Submission to last job:         10945s     182.42m     3.04h    0.13d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:        146s       2.43m     0.04h    0.00d  0.000 y
# IO & Wait Time:                   840s      14.00m     0.23h    0.01d  0.000 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest job:                        7s       0.12m     0.00h    0.00d
# Submission to last job:            66s       1.10m     0.02h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:       3870s      64.50m     1.08h    0.04d  0.000 y
# IO & Wait Time:                   364s       6.06m     0.10h    0.00d  0.000 y
# Average job time:                 106s       1.76m     0.03h    0.00d
# Longest job:                      224s       3.73m     0.06h    0.00d
# Submission to last job:           406s       6.77m     0.11h    0.00d

    #	FINISHED - 2005-03-22 15:44

    #	it is helpful to time these commands to make sure everything
    #	is sane.  The times should also be similar, as are the numbers.

    #	check the numbers
    time featureBits mm6 chainRn3
    #	2705309999 bases of 2597150411 (104.165%) in intersection
    time featureBits mm5 chainRn3
    #	2646682349 bases of 2615483787 (101.193%) in intersection
    #	These chainRn3 tables on mm6 are much smaller than mm5
    #	There was an attempt during mm5 to run the blastz on rn3 with
    #	the stringent parameters used here, but that ran into
    #	difficulties as there were other tracks depending upon the
    #	older bulky alignments and it was necessary to bring the bulky
    #	alignments back.  There is a vast difference in the number of
    #	chains:  mm5.chr1_chainRn3: 1,865,181 rows, mm6.chr1_chainRn3: 16466
    #	mm5.chr1_chainRn3Link: 18,252,937 rows, mm6.chr1_chainRn3Link: 2,340,447

    #	trying to do the chainLink's requires kolossus, big memory
    ssh kolossus
    #	specify a .hg.conf file with read-only passwords:
    #	these take about 15 and 90 minutes (the mm5's are much larger)
    HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
	featureBits mm6 chainRn3Link
    #	1652692239 bases of 2597150411 (63.635%) in intersection
    #	real 864.72
    #	user 211.05
    #	sys 66.95

    #	1802980225 bases of 2597150411 (69.421%) in intersection
    HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
	featureBits mm5 chainRn3Link
    #	1798705001 bases of 2615483787 (68.771%) in intersection


    #	the netRn3 measurements take about 2m30s
    time featureBits mm6 netRn3
    #	2705309999 bases of 2597150411 (104.165%) in intersection
    time featureBits mm5 netRn3
    #	2638255333 bases of 2615483787 (100.871%) in intersection


    #	And then the swap of that:
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
	swap.run.out 2>&1 &
    #	STARTED - 2005-03-22 16:15
    #	FINISHED - 2005-03-22 17:41

    #	check the numbers
    featureBits rn3 chainMm6
    #	2819351420 bases of 2571104688 (109.655%) in intersection
    featureBits rn3 chainMm5
    #	2786666162 bases of 2571104688 (108.384%) in intersection

    featureBits rn3 netMm6
    #	2808675438 bases of 2571104688 (109.240%) in intersection
    featureBits rn3 netMm5
    #	2778454647 bases of 2571104688 (108.065%) in intersection

#########################################################################
# BLASTZ Zebrafish danRer2 (DONE - 2005-03-17 - 2005-03-18 - Hiram)
    ssh kksilo
    mkdir /cluster/data/mm6/bed/blastzDanRer2.2005_03_17
    cd /cluster/data/mm6/bed/blastzDanRer2.2005_03_17

    cat << '_EOF_' > DEF
# mouse (mm6) vs zebrafish (danRer2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from hg16-fr1, danRer-hg17 and mm6-danRer1
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Zebrafish (danRer2)
SEQ2_DIR=/iscratch/i/danRer2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzDanRer2.2005_03_17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/danRer2/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \
	blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	STARTED - 2005-03-17 10:00
    #	FINISHED - 2005-03-18 01:01
    #	real    494m43.717s
    #	user    0m0.322s
    #	sys     0m0.184s

# Completed: 57263 of 57263 jobs
# CPU time in finished jobs:   14680881s  244681.36m  4078.02h  169.92d  0.466 y
# IO & Wait Time:                320049s    5334.14m    88.90h    3.70d  0.010 y
# Average job time:                 262s       4.37m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1308s      21.80m     0.36h    0.02d
# Submission to last job:         51993s     866.55m    14.44h    0.60d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:         87s       1.46m     0.02h    0.00d  0.000 y
# IO & Wait Time:                   869s      14.48m     0.24h    0.01d  0.000 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest job:                        8s       0.13m     0.00h    0.00d
# Submission to last job:           161s       2.68m     0.04h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:       2496s      41.60m     0.69h    0.03d  0.000 y
# IO & Wait Time:                   295s       4.92m     0.08h    0.00d  0.000 y
# Average job time:                  70s       1.16m     0.02h    0.00d
# Longest job:                      139s       2.32m     0.04h    0.00d
# Submission to last job:           470s       7.83m     0.13h    0.01d

    #	swap results to place mm6 alignments onto danRer2
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzDanRer2.2005_03_17
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-swap `pwd`/DEF > swap.run.out 2>&1 &
    #	~ 27 minutes
    cat << '_EOF_' > measurements.sh
#!/bin/sh

# Run one timed featureBits measurement.
#   $1 - database name
#   $2 - table name
# Prints the command being run, then a "    #<tab>" prefix with no
# trailing newline, so that whatever is written next continues on a
# line that starts with a comment marker.
# printf replaces "echo -en", which is not portable under #!/bin/sh;
# the arguments are quoted so they reach featureBits unsplit.
doOne()
{
db=$1
tbl=$2
printf '    featureBits %s %s\n' "$db" "$tbl"
printf '    #\t'
time featureBits "$db" "$tbl"
}

# Mouse side: the danRer2 net, chain and chainLink tables, each measured
# on mm6 and then on mm5.
for table in netDanRer2 chainDanRer2 chainDanRer2Link
do
    for assembly in mm6 mm5
    do
        doOne $assembly $table
    done
done

# Zebrafish side: the matching tables on danRer2, for Mm6 and then Mm5.
# Each pattern is "<prefix>:<suffix>" wrapped around the mouse db name.
for pattern in net: chain: chain:Link
do
    prefix=${pattern%%:*}
    suffix=${pattern#*:}
    for mouse in Mm6 Mm5
    do
        doOne danRer2 ${prefix}${mouse}${suffix}
    done
done
'_EOF_'
    # << keep emacs happy
    chmod +x measurements.sh
    time ./measurements.sh > measures.out 2>&1 &

    featureBits mm6 netDanRer2
    #   686375730 bases of 2597150411 (26.428%) in intersection
    featureBits mm5 netDanRer2
    #   553450442 bases of 2615483787 (21.161%) in intersection
    featureBits mm6 chainDanRer2
    #   782392894 bases of 2597150411 (30.125%) in intersection
    featureBits mm5 chainDanRer2
    #   598864029 bases of 2615483787 (22.897%) in intersection
    featureBits mm6 chainDanRer2Link
    #   162226493 bases of 2597150411 (6.246%) in intersection
    featureBits mm5 chainDanRer2Link
    #   59978861 bases of 2615483787 (2.293%) in intersection
    featureBits danRer2 netMm6
    #   576283947 bases of 1560497282 (36.930%) in intersection
    featureBits danRer2 netMm5
    #   476966014 bases of 1560497282 (30.565%) in intersection
    featureBits danRer2 chainMm6
    #   641696461 bases of 1560497282 (41.121%) in intersection
    featureBits danRer2 chainMm5
    #   505097651 bases of 1560497282 (32.368%) in intersection
    featureBits danRer2 chainMm6Link
    #   176391894 bases of 1560497282 (11.304%) in intersection
    featureBits danRer2 chainMm5Link
    #   68003819 bases of 1560497282 (4.358%) in intersection

#########################################################################
# BLASTZ FUGU fr1 (DONE - 2005-03-17 - 2005-03-19 - Hiram)
    ssh kksilo
    mkdir /cluster/data/mm6/bed/blastzFr1.2005_04_01
    cd /cluster/data/mm6/bed/blastzFr1.2005_04_01

    cat << '_EOF_' > DEF
# mouse vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzFr1.2005_04_01

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/fr1/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kki \
	-fileServer eieio -chainMinScore 5000 `pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED - 2005-04-01 16:30
    time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \
	-continue cat -fileServer eieio -chainMinScore 5000 \
	`pwd`/DEF > cat.run.out 2>&1 &
    #	STARTED - 2005-04-01 16:30

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	STARTED - 2005-03-17 11:00
    #	FINISHED - 2005-03-19 00:14
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-chainMinScore 5000 -swap `pwd`/DEF > swap.run.out 2>&1 &

    #	measurements
    featureBits mm6 netFr1
    #   618129802 bases of 2597150411 (23.800%) in intersection
    featureBits mm6 chainFr1
    #   666835089 bases of 2597150411 (25.676%) in intersection
     featureBits mm6 chainFr1Link
    #   55355465 bases of 2597150411 (2.131%) in intersection

    featureBits fr1 netMm6
    #   146828640 bases of 315518167 (46.536%) in intersection
    featureBits fr1 chainMm6
    #   160874127 bases of 315518167 (50.987%) in intersection
    featureBits fr1 chainMm6Link
    #   46266090 bases of 315518167 (14.664%) in intersection

#########################################################################
# BLASTZ TETRAODON tetNig1 (TBD - 2005-03-17 - Hiram)
    ssh kksilo
    mkdir /cluster/data/mm6/bed/blastzTetNig1.2005_03_17
    cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17
    # use same parameters as for danRer1-mm5
    #	NOTE: The BLASTZ_Q score matrix should have been the Tuned.gap
    #	one which is recreated below during the re-score
    cat << '_EOF_' > DEF
# mouse (mm6) vs Tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from hg16-fr1 and danRer1-hg17.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Tetraodon (tetNig1)
SEQ2_DIR=/iscratch/i/tetNig1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzTetNig1.2005_03_17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy


    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/tetNig1/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	STARTED 2005-03-17 21:30
    #	FINISHED 2005-03-18 05:10
    #	real    461m56.901s
    #	user    0m0.426s
    #	sys     0m0.310s
# Completed: 18867 of 18867 jobs
# CPU time in finished jobs:    2396227s   39937.11m   665.62h   27.73d  0.076 y
# IO & Wait Time:                 53160s     886.00m    14.77h    0.62d  0.002 y
# Average job time:                 130s       2.16m     0.04h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             591s       9.85m     0.16h    0.01d
# Submission to last job:         26573s     442.88m     7.38h    0.31d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:         27s       0.45m     0.01h    0.00d  0.000 y
# IO & Wait Time:                   798s      13.30m     0.22h    0.01d  0.000 y
# Average job time:                   2s       0.04m     0.00h    0.00d
# Longest job:                        6s       0.10m     0.00h    0.00d
# Submission to last job:           200s       3.33m     0.06h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:        688s      11.47m     0.19h    0.01d  0.000 y
# IO & Wait Time:                   169s       2.82m     0.05h    0.00d  0.000 y
# Average job time:                  21s       0.36m     0.01h    0.00d
# Longest job:                       55s       0.92m     0.02h    0.00d
# Submission to last job:           212s       3.53m     0.06h    0.00d

    #	Re-do the chains with different scoring matrix
    ssh kki
    cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17/axtChain/run

    # Reuse gap penalties from chicken run.
    #	It turns out this is unnecessary.  This scoring matrix is
    #	actually the default scoring matrix used in axtChain
    #	The processing below does not use this file.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../Tuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line keeps emacs coloring happy
    rm batch
    para make jobList
    para check
    para time
# Completed: 40 of 40 jobs
# CPU time in finished jobs:        692s      11.54m     0.19h    0.01d  0.000 y
# IO & Wait Time:                   295s       4.91m     0.08h    0.00d  0.000 y
# Average job time:                  25s       0.41m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              60s       1.00m     0.02h    0.00d
# Submission to last job:            87s       1.45m     0.02h    0.00d

    ssh kolossus
    cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17
    mv mafNet mafNet.0
    mv axtNet axtNet.0
    cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17/axtChain
    mv mm6.tetNig1.all.chain.gz mm6.tetNig1.all.chain.gz.0
    mv mm6.tetNig1.over.chain.gz mm6.tetNig1.over.chain.gz.0
    mv mm6.tetNig1.net.gz mm6.tetNig1.net.gz.0
    chainMergeSort run/chain/*.chain | nice gzip -c >  mm6.tetNig1.all.chain.gz
    chainSplit chain mm6.tetNig1.all.chain.gz
    time ./netChains.csh
    #	~ 3m17s
    
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17/axtChain
    time ./loadUp.csh
    #	~ 7m40s
    gzip mm6.tetNig1.net
    cd /usr/local/apache/htdocs/goldenPath/mm6/vsTetNig1
    md5sum *.gz axtNet/*.gz > md5sum.txt

    #	swap results to place mm6 alignments onto TetNig1
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17
    rm -fr /usr/local/apache/htdocs/goldenPath/tetNig1/vsMm6
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-swap `pwd`/DEF > swap.run.out 2>&1 &
    
    #	Measurements:
    #	Generate measurements.sh: it runs featureBits on each net/chain
    #	table for mm6 and mm5, and on the swapped tables in tetNig1.
    #	Each command line is echoed, followed by a "#<tab>" prefix with no
    #	newline, so the output has the same layout as the results recorded
    #	below.
    cat << '_EOF_' > measurements.sh
#!/bin/bash
# bash rather than sh: this script relies on the "time" keyword.

# doOne <db> <table>
#	Print the featureBits command line for <db> <table>, then a
#	"    #<tab>" prefix without a newline, then run the command timed.
doOne()
{
local db=$1
local tbl=$2
echo "    featureBits $db $tbl"
# printf instead of "echo -en": echo option handling differs between shells
printf '    #\t'
time featureBits "$db" "$tbl"
}

doOne mm6 netTetNig1
doOne mm5 netTetNig1
doOne mm6 chainTetNig1
doOne mm5 chainTetNig1
doOne mm6 chainTetNig1Link
doOne mm5 chainTetNig1Link
doOne tetNig1 netMm6
doOne tetNig1 netMm5
doOne tetNig1 chainMm6
doOne tetNig1 chainMm5
doOne tetNig1 chainMm6Link
doOne tetNig1 chainMm5Link
'_EOF_'
    # << keep emacs happy
    chmod +x measurements.sh
    time ./measurements.sh

    featureBits mm6 netTetNig1
    #   720943295 bases of 2597150411 (27.759%) in intersection
    featureBits mm5 netTetNig1
    #   618111072 bases of 2615483787 (23.633%) in intersection
    featureBits mm6 chainTetNig1
    #   771732145 bases of 2597150411 (29.715%) in intersection
    featureBits mm5 chainTetNig1
    #   652622662 bases of 2615483787 (24.952%) in intersection
    featureBits mm6 chainTetNig1Link
    #   62346107 bases of 2597150411 (2.401%) in intersection
    featureBits mm5 chainTetNig1Link
    #   43905129 bases of 2615483787 (1.679%) in intersection
    featureBits tetNig1 netMm6
    #   176451958 bases of 342403326 (51.533%) in intersection
    featureBits tetNig1 netMm5
    #   152232538 bases of 342403326 (44.460%) in intersection
    featureBits tetNig1 chainMm6
    #   197657323 bases of 342403326 (57.726%) in intersection
    featureBits tetNig1 chainMm5
    #   163683179 bases of 342403326 (47.804%) in intersection
    featureBits tetNig1 chainMm6Link
    #   55282376 bases of 342403326 (16.145%) in intersection
    featureBits tetNig1 chainMm5Link
    #   41736750 bases of 342403326 (12.189%) in intersection
 
#########################################################################
# CPGISLANDS (DONE - 2005-03-17 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm6/bed/cpgIsland
    cd /cluster/data/mm6/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    mv cpglh.exe ../..
    
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    ssh kksilo
    cd /cluster/data/mm6/bed/cpgIsland
    #	Run cpglh.exe on every hard-masked chromosome file, writing one
    #	<chrom>.cpg file per input into the current directory.  The whole
    #	loop runs in the background; the echoed progress lines and any
    #	warnings on stderr are collected in cpglh.out.
    for F in ../../*/chr*.fa.masked
    do
	# if the glob matched nothing, F is the literal pattern: skip it
	[ -e "${F}" ] || continue
	# strip the directory part, e.g. <dir>/chr1.fa.masked -> chr1.fa.masked
	FA=${F##*/}
	# strip the suffix (anchored at the end), chr1.fa.masked -> chr1
	C=${FA%.fa.masked}
	# note: the logged command shows the bare file name, the real
	# command below is given the relative path in F
	echo "./cpglh.exe ${FA} > ${C}.cpg"
	./cpglh.exe "${F}" > "${C}.cpg"
    done > cpglh.out 2>&1 &

    #	three warnings:
    #	Bad char 0x52 = 'R' at line 164245, base 8212187, sequence chr14
    #	Bad char 0x53 = 'S' at line 167424, base 8371114, sequence chr14
    #	Bad char 0x53 = 'S' at line 167426, base 8371198, sequence chr14
    #	Several chroms have 0 results:
    #	-rw-rw-r--  1     0 Mar 17 12:13 chr10_random.cpg
    #	-rw-rw-r--  1     0 Mar 17 12:18 chr9_random.cpg
    #	-rw-rw-r--  1     0 Mar 17 12:18 chrM.cpg
    #	-rw-rw-r--  1     0 Mar 17 12:18 chrY.cpg
    #	-rw-rw-r--  1     0 Mar 17 12:18 chrY_random.cpg
# XXX - it is interesting that neither chrY nor chrY_random has any results.
#	the previous mm5 release did have some on chrY
#	Evidently the new chrY is too short - this chrY is being
#	reconstructed and only a small part of it is known in this
#	assembly.  The bulk of chrY from previous assemblies is now in
#	chrY_random

    # Transform cpglh output to bed +
    #	filter.awk rewrites each cpglh output line as ten tab-separated
    #	columns:
    #	  1-3  $1, $2 - 1, $3  (the start is decremented by one; presumably
    #	       a 1-based to 0-based conversion for bed - confirm)
    #	  4    "$5 $6" joined by a single space
    #	  5    width = $3 - (decremented $2)
    #	  6    $6
    #	  7    width * $7 * 0.01, rounded to an integer (%0.0f)
    #	  8    100 * 2 * $6 / width, one decimal place (%0.1f)
    #	  9-10 $7 and $9
    #	NOTE(review): the column order presumably matches cpgIslandExt.sql
    #	used by the hgLoadBed step - confirm against that schema.
    cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    #	apply the filter to all per-chromosome .cpg files at once
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/mm6/bed/cpgIsland
    hgLoadBed mm6 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Reading cpgIsland.bed
    #	Loaded 16100 elements of size 10
    #	Sorted
    #	Saving bed.tab
    #	Loading mm6
    featureBits mm6 cpgIslandExt
    #	10432360 bases of 2597150411 (0.402%) in intersection
    featureBits mm5 cpgIslandExt
    #	10422989 bases of 2615483787 (0.399%) in intersection
    featureBits mm4 cpgIsland
    #	11109692 bases of 2627444668 (0.423%) in intersection
    featureBits mm3 cpgIsland
    #	10102968 bases of 2505900260 (0.403%) in intersection

#########################################################################
# BLASTZ Dog canFam1 (DONE - 2005-03-18 - 2005-04-03 - Hiram)
    ssh kksilo
    mkdir /cluster/data/mm6/bed/blastzCanFam1.2005_03_18
    cd /cluster/data/mm6/bed/blastzCanFam1.2005_03_18

    cat << '_EOF_' > DEF
# mouse vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse (mm6)
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog (canFam1)
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzCanFam1.2005_03_18

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/canFam1/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \
	-fileServer eieio -chainMinScore 5000 `pwd`/DEF > blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	STARTED - 2005-03-18 09:57
    #	BROKEN - 2005-03-20 11:00 - due to rearrangements on /scratch/
    #		need to regenerate the linSpec not in mouse for canFam1
    #	RESTARTED - 2005-03-30 16:25
    #	COMPLETELY RESTARTED 2005-04-01 11:40
    time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \
	-continue cat -fileServer eieio -chainMinScore 5000 \
	`pwd`/DEF > cat.run.out 2>&1 &
    #	Check measurements at this point:
    featureBits mm6 netCanFam1
    #   2544343230 bases of 2597150411 (97.967%) in intersection
    featureBits mm5 netCanFam1
    #   2456773441 bases of 2615483787 (93.932%) in intersection
    featureBits mm6 chainCanFam1
    #   2562947250 bases of 2597150411 (98.683%) in intersection
    featureBits mm5 chainCanFam1
    #   2464497454 bases of 2615483787 (94.227%) in intersection
    featureBits mm6 chainCanFam1Link
    #   798637320 bases of 2597150411 (30.751%) in intersection
    featureBits mm5 chainCanFam1Link
    #   859275338 bases of 2615483787 (32.853%) in intersection

    #	Those are looking pretty good, so now do the swap:
    ssh eieio
    cd /cluster/data/mm6/bed/blastzCanFam1.2005_03_18
    time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \
	-swap -fileServer eieio -chainMinScore 5000 \
	`pwd`/DEF > swap.run.out 2>&1 &
    #	125 minutes
    featureBits canFam1 netMm6
    #   2305458923 bases of 2359845093 (97.695%) in intersection
    featureBits canFam1 netMm5
    #   2255138517 bases of 2359845093 (95.563%) in intersection
    featureBits canFam1 chainMm6
    #   2310615069 bases of 2359845093 (97.914%) in intersection
    featureBits canFam1 chainMm5
    #   2257403477 bases of 2359845093 (95.659%) in intersection
    featureBits canFam1 chainMm6Link
    #   783631188 bases of 2359845093 (33.207%) in intersection
    featureBits canFam1 chainMm5Link
    #   837236252 bases of 2359845093 (35.478%) in intersection

#########################################################################
# BLASTZ Cow bosTau1 (DONE - 2005-03-18 - 2005-04-08 - Hiram)
    ssh kksilo
    mkdir /cluster/data/mm6/bed/blastzBosTau1.2005_03_18
    cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18

    cat << '_EOF_' > DEF
# mouse vs. cow
# TARGET: Mouse (mm6)
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/mm6/chrom.sizes

# QUERY: Cow (bosTau1)
SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes

BASE=/cluster/data/mm6/bed/blastzBosTau1.2005_03_18
'_EOF_'
    # << keep emacs coloring happy

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	STARTED - 2005-03-18 13:20
    #	BROKEN - 2005-03-20 - 22:03 - power failure to all machines
    #	RESTARTED - 2005-03-30 14:35
    #	After several reruns of the batch, believe it may be finished.
    #	establish check point marker in the run.time file:
    para time > run.time
    #	Now to the rest of the story:
    ssh eieio
    cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-continue cat `pwd`/DEF > cat.run.out 2>&1 &
# Completed: 40 of 40 jobs
# CPU time in finished jobs:        834s      13.90m     0.23h    0.01d  0.000 y
# IO & Wait Time:                  2421s      40.35m     0.67h    0.03d  0.000 y
# Average job time:                  81s       1.36m     0.02h    0.00d
# Longest job:                      334s       5.57m     0.09h    0.00d
# Submission to last job:           365s       6.08m     0.10h    0.00d

    #	measurements:
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18
    time ../../jkStuff/netChainCheck.sh mm6 mm5 bosTau1 >measurements.out 2>&1 &
    featureBits mm6 netBosTau1
    #   1483158691 bases of 2597150411 (57.107%) in intersection
    featureBits mm5 netBosTau1
    #   1491250043 bases of 2615483787 (57.016%) in intersection
    featureBits mm6 chainBosTau1
    #   1551920940 bases of 2597150411 (59.755%) in intersection
    featureBits mm5 chainBosTau1
    #   1557897465 bases of 2615483787 (59.564%) in intersection
    featureBits mm6 chainBosTau1Link
    #   603091864 bases of 2597150411 (23.221%) in intersection
    featureBits mm5 chainBosTau1Link
    #   606973993 bases of 2615483787 (23.207%) in intersection

    #	Looking OK, so do the swap
    ssh eieio
    cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-swap `pwd`/DEF > swap.run.out 2>&1 &
    #	308 m = 5h 8m
    #	failed on kolossus due to NFS problems
    ssh kolossus
    cd /cluster/data/bosTau1/bed/blastz.mm6.swap/axtChain
    #	extract the unfinished portion of netChains.csh into
    #	finiChains.csh and run it:
    time ./finiChains.csh
    #	STARTED - 2005-04-06
    #	13h 50m
    #	continuing
    ssh eieio
    cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-swap -continue load `pwd`/DEF > load.run.out 2>&1 &
    #	5h 6min load time
    #	checking measurements:
    featureBits bosTau1 netMm6
    #	1317934269 bases of 2261116798 (58.287%) in intersection
    featureBits bosTau1 netMm5
    #   1317539731 bases of 2261116798 (58.269%) in intersection
    featureBits bosTau1 chainMm6
    # 1325743373 bases of 2261116798 (58.632%) in intersection
    featureBits bosTau1 chainMm5
    #   1325445280 bases of 2261116798 (58.619%) in intersection
    featureBits bosTau1 chainMm6Link
    # 589779558 bases of 2261116798 (26.084%) in intersection
    featureBits bosTau1 chainMm5Link
    #   588460684 bases of 2261116798 (26.025%) in intersection

    #	looks good, done.

#############################################################################
# BLASTZ SELF (DONE - 2005-03-31 - 2005-04-08 - Hiram)
    # The procedure for lineage spec business with self is to simply
    # use the actual repeat masker output for this mouse assembly as
    # the lineage specific repeats for itself.  Thus, merely make
    # symlinks to the repeat masker out files and name them as expected
    # for blastz.  In this case they are called notInMouse but they
    # really mean InMouse.  Yes, it is confusing, but that's just the
    # nature of the game in this case.

    ssh eieio
    mkdir /panasas/store/mm6/linSpecRep.notInMouse
    cd /panasas/store/mm6/linSpecRep.notInMouse
    #	(csh syntax) For every RepeatMasker file ../rmsk/<name>.fa.out make
    #	a symlink <name>.out.spec in the current directory.
    #	$f:t:r:r takes the tail (file name) of $f and removes two
    #	extensions, so <name>.fa.out becomes <name>.
    foreach f (../rmsk/*.fa.out)
	set base = $f:t:r:r
	echo $base.out.spec
	ln -s $f $base.out.spec
    end
    
    mkdir /cluster/data/mm6/bed/blastzSelf
    cd /cluster/data/mm6/bed/blastzSelf
    cat << '_EOF_' > DEF
# mouse vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm6
SEQ1_DIR=/panasas/store/mm6/nib
# RMSK not currently used
SEQ1_RMSK=/panasas/store/mm6/rmsk
# FLAG not currently used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Mouse Mm6
SEQ2_DIR=/panasas/store/mm6/nib
# RMSK not currently used
SEQ2_RMSK=/panasas/store/mm6/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/panasas/store/mm6/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzSelf

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    cp /cluster/data/mm6/chrom.sizes ./S2.len
    #	establish a screen to control this job
    screen
    #	kksilo was off-limits to logins as this started, use eieio
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED - 2005-03-31 - 13:53
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh eieio
    screen -d -r
    #	The job had difficulties due to failing nodes on the KK
    #	recover the job by going to kk machine and
    #	directory /cluster/data/mm6/bed/blastzSelf
    #	to get it to complete with para recover etc...
    #	One job seemed to take forever, it appears the section:
    #	chrUn_random:50000000-60010000 with the same piece, 40 hours.
# Completed: 8861 of 8861 jobs
# CPU time in finished jobs:    3519718s   58661.97m   977.70h   40.74d  0.112 y
# IO & Wait Time:                460422s    7673.70m   127.89h    5.33d  0.015 y
# Average job time:                 449s       7.49m     0.12h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:          145760s    2429.33m    40.49h    1.69d
# Submission to last job:        263627s    4393.78m    73.23h    3.05d
    #	After it was complete, create the run.time file:
    ssh kk
    cd /cluster/data/mm6/bed/blastzSelf/run.blastz
    para time > run.time
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 &
    # STARTED - 2005-04-04 11:40
    #	three jobs failed, go to kolossus and try them there:
    ssh kolossus
    cd /cluster/data/mm6/bed/blastzSelf/axtChain/run
    ./chain.csh chrUn_random.nib:chrUn_random: \
	chain/chrUn_random.nib:chrUn_random:.chain
    #	chrUn failed too on kolossus
    ./chain.csh chrY_random.nib:chrY_random: \
	chain/chrY_random.nib:chrY_random:.chain
    #	chrY worked in 28 minutes
    ./chain.csh chrX.nib:chrX: chain/chrX.nib:chrX:.chain
    #	chrX worked in 44 minutes
    #	The chrUn business, in fact any of the chr*_random's shouldn't
    #	be chained anyway since they aren't contiguous sequence anyhow.
    #	So, simply leave chrUn out of the chain and net stuff.
    #	to continue:
    ssh kk
    cd /cluster/data/mm6/bed/blastzSelf/axtChain/run
    para time > run.time
    ssh eieio
    cd /cluster/data/mm6/bed/blastzSelf
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-continue chainMerge -fileServer eieio `pwd`/DEF > merge.run.out 2>&1 &
    #	385 min = 6h 25m (originally noted as 3h 25m, which is not 385 min)
    #	that finished OK, checking the measurements:
    #	this self alignment only appears to be present on mm3 as mouseChain
    time featureBits mm6 netSelf
    # 2336281173 bases of 2597150411 (89.956%) in intersection
    time featureBits mm6 chainSelf
    #  2579948751 bases of 2597150411 (99.338%) in intersection
    time featureBits mm3 mouseChain
    # 889252994 bases of 2505900260 (35.486%) in intersection

    #	the chainLink measurements need kolossus:
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainSelfLink
    #	417927047 bases of 2597150411 (16.092%) in intersection
    #	244 minutes
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm3 mouseChainLink
    #	383345536 bases of 2505900260 (15.298%) in intersection

    #	Gill likes to see the blastzSelf track:
    ssh eieio
    cd /cluster/data/mm6/bed/blastzSelf
    #	cat the pslParts together, per-chrom, and in chromStart order:
    #	The output directory is not created anywhere above; make sure it
    #	exists before redirecting into it.
    mkdir -p pslChrom
    #	Derive the list of chrom names from the part file names (everything
    #	before ".nib"), then for each chrom concatenate its parts, ordered
    #	by the numeric value of the third ':'-separated field, into a
    #	single gzipped psl file.
    ls pslParts | sed -e 's/\.nib.*//' | sort -u | while read -r C
    do
	echo -n "working: ${C} ... "
	# the command substitution is deliberately unquoted: it must split
	# into one argument per part file for zcat
	zcat $(printf '%s\n' pslParts/"${C}".nib* \
		| sort --field-separator=':' -k1,1 -k3,3n) \
            | gzip > "pslChrom/${C}_blastzSelf.psl.gz"
	echo "done"
    done

    # Load blastzSelf
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzSelf/pslChrom
    #	Load each per-chrom psl file into mm6 with hgLoadPsl.  A failed
    #	load is reported on stderr instead of being logged as "done", and
    #	the loop carries on with the remaining files.
    for I in *.psl.gz
    do
	if "$HOME/bin/i386/hgLoadPsl" -noTNameIx mm6 "${I}"
	then
	    echo "done: ${I}"
	else
	    echo "FAILED: ${I}" 1>&2
	fi
    done
    #	STARTED - 2005-04-06 15:24
    #	4h 24m load time - chrUn_random failed to load
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 blastzSelf
    #	8h 34m job
    #	471978757 bases of 2597150411 (18.173%) in intersection

#############################################################################
# BLASTZ CHICKEN - (DONE - 2005-03-21 - 2005-04-08 - Hiram)

    # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN
    # In an email 2/13/04, Arian said we could treat all human repeats as 
    # lineage-specific for human-chicken blastz.  Do the same for mouse.  
    # Scripts expect *.out.spec filenames, so set that up:
    ssh eieio
    mkdir /panasas/store/mm6/linSpecRep.notInChicken
    cd /panasas/store/mm6/linSpecRep.notInChicken
    #	(csh syntax) For every RepeatMasker file ../rmsk/<name>.fa.out make
    #	a symlink <name>.out.spec in the current directory.
    #	$f:t:r:r takes the tail (file name) of $f and removes two
    #	extensions, so <name>.fa.out becomes <name>.
    foreach f (../rmsk/*.fa.out)
	set base = $f:t:r:r
	echo $base.out.spec
	ln -s $f $base.out.spec
    end
    
    mkdir /cluster/data/mm6/bed/blastzGalGal2.2005_03_31
    cd /cluster/data/mm6/bed/blastzGalGal2.2005_03_31
    cat << '_EOF_' > DEF
# mouse vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm6
SEQ1_DIR=/panasas/store/mm6/nib
# RMSK not currently used
SEQ1_RMSK=/panasas/store/mm6/rmsk
# FLAG not currently used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInChicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal2
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzGalGal2.2005_03_31

DEF=$BASE/DEF
RAW=$BASE/raw
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    ssh eieio
    cd /cluster/data/mm6/bed/blastzGalGal2.2005_03_31
    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/galGal2/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-fileServer eieio `pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED 2005-04-01 11:30
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	CRASHED due to kksilo problems, finished batch manually, then
    #	continuing:
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 &
    #	measurements are looking good:
    featureBits mm6 netGalGal2
    #   1937053597 bases of 2597150411 (74.584%) in intersection
    featureBits mm5 netGalGal2
    #   1958796258 bases of 2615483787 (74.892%) in intersection
    featureBits mm6 chainGalGal2
    #   1969505681 bases of 2597150411 (75.833%) in intersection
    featureBits mm5 chainGalGal2
    #   1990102297 bases of 2615483787 (76.089%) in intersection
    featureBits mm6 chainGalGal2Link
    #   82018349 bases of 2597150411 (3.158%) in intersection
    featureBits mm5 chainGalGal2Link
    #   78951466 bases of 2615483787 (3.019%) in intersection

    #	Since those are OK, now do the swap:
    ssh eieio
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap -chainMinScore=5000 \
	-fileServer eieio `pwd`/DEF > swap.run.out 2>&1 &

    #	and measure:
    featureBits galGal2 netMm6
    #   832583709 bases of 1054197620 (78.978%) in intersection
    featureBits galGal2 netMm5
    #   835277984 bases of 1054197620 (79.234%) in intersection
    featureBits galGal2 chainMm6
    #   843746491 bases of 1054197620 (80.037%) in intersection
    featureBits galGal2 chainMm5
    #   846905330 bases of 1054197620 (80.336%) in intersection
    featureBits galGal2 chainMm6Link
    #   72687426 bases of 1054197620 (6.895%) in intersection
    featureBits galGal2 chainMm5Link
    #   70542788 bases of 1054197620 (6.692%) in intersection

#############################################################################
# BLASTZ OPOSSUM (DONE - 2005-04-01 - 2005-04-08 - Hiram)
    ssh eieio
    mkdir /cluster/data/mm6/bed/blastzMonDom1.2005_04_01
    cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01

    cat << '_EOF_' > DEF
# mouse vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
#	Using parameters from monDom1<->mm5 alignments, see notes there.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse Mm6
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum MonDom1
SEQ2_DIR=/iscratch/i/monDom1/monDom1.2bit
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzMonDom1.2005_04_01

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
  # << this line keeps emacs coloring happy

    ssh eieio
    cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01
    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/monDom1/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-fileServer eieio `pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED 2005-04-01 11:30
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    # CRASHED due to kksilo problems, finished batch manually and
    #	created the run.time file
    #	continuing
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 &
    #	STARTED 2005-04-03 11:45
    #	during the load of the tables, this command failed, perhaps due
    #	to kksilo crashes:
    netClass -verbose=0 -noAr noClass.net mm6 monDom1 mm6.monDom1.net
    #	So, trying to recover: 2005-04-04 15:15
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01/axtChain
    time netClass -verbose=0 -noAr noClass.net mm6 monDom1 mm6.monDom1.net
    #	Fails with the same error:
    #	Can't start query:
    #	select genoName,genoStart,genoEnd,repName,repClass,repFamily from rmsk order by genoName,genoStart
    #
    #	mySQL error 3: Error writing file '/tmp/MYUAGF4h' (Errcode: 28)

    #	I wonder if it is due to tmp space:
    #	Filesystem            Size  Used Avail Use% Mounted on
    #	/dev/sdb3             267G  244G  9.1G  97% /

    #	Yes, it was that, I removed some garbage from /scratch to make
    #	more space:
    #	Filesystem            Size  Used Avail Use% Mounted on
    #	/dev/sdb3             267G  242G   12G  96% /
    #	And the command finished.  I don't know where it was keeping its
    #	files as I was watching for something large to show up in /tmp
    #	during this 65 minute command, but I never saw a large file
    #	there but I did see available space get down to only a couple Gb
    #	free.  Now, to finish the load of the nets:
    netFilter -minGap=10 mm6.monDom1.net | hgLoadNet \
	-verbose=0 mm6 netMonDom1 stdin
    #	5 minute load time
    #	continuing 2005-04-05 16:00
    ssh eieio
    cd  /cluster/data/mm6/bed/blastzMonDom1.2005_04_01
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-continue download -fileServer eieio `pwd`/DEF > download.run.out 2>&1 &

    #	And measurements:
    ssh hgwdev
    cd  /cluster/data/mm6/bed/blastzMonDom1.2005_04_01
    ../../jkStuff/netChainCheck.sh mm6 mm5 monDom1 > measurements.out 2>&1 &
    
    featureBits mm6 netMonDom1
    #   2082064216 bases of 2597150411 (80.167%) in intersection
    featureBits mm5 netMonDom1
    #   2094316044 bases of 2615483787 (80.074%) in intersection
    featureBits mm6 chainMonDom1
    #   2109438148 bases of 2597150411 (81.221%) in intersection
    featureBits mm5 chainMonDom1
    #   2121448151 bases of 2615483787 (81.111%) in intersection
    featureBits mm6 chainMonDom1Link
    #   249576105 bases of 2597150411 (9.610%) in intersection
    featureBits mm5 chainMonDom1Link
    #   248180346 bases of 2615483787 (9.489%) in intersection

    # looks OK, so to the swap:
    ssh eieio
    cd  /cluster/data/mm6/bed/blastzMonDom1.2005_04_01
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 &
    #
    #	measurements:
     featureBits monDom1 netMm6
    # 2884735370 bases of 3492108230 (82.607%) in intersection
    featureBits monDom1 netMm5
    #   2889580530 bases of 3492108230 (82.746%) in intersection
    featureBits monDom1 chainMm6
    # 2908045004 bases of 3492108230 (83.275%) in intersection
    featureBits monDom1 chainMm5
    #   2913812625 bases of 3492108230 (83.440%) in intersection
    featureBits monDom1 chainMm6Link
    # 253105698 bases of 3492108230 (7.248%) in intersection
    featureBits monDom1 chainMm5Link
    #   249594220 bases of 3492108230 (7.147%) in intersection

    #	looks OK, done

##############################################################################
# BLASTZ FROG Xenopus tropicalis (DONE - 2005-04-05 - 2005-04-08 - Hiram)
    ssh eieio
    mkdir /cluster/data/mm6/bed/blastzXenTro1.2005_04_05
    cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05

    cat << '_EOF_' > DEF
# mouse vs. frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between chicken and fish,
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: mouse mm6
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Frog xenTro1
SEQ2_DIR=/iscratch/i/xenTro1/xenTro1.2bit
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzXenTro1.2005_04_05

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy
    cp /cluster/data/mm6/chrom.sizes ./S1.len
    #	order by the second column, numerically, descending.
    #	-k2 is the supported spelling of the obsolete "+1" key syntax,
    #	which current sort implementations treat as a file name.
    sort -rn -k2 /cluster/data/xenTro1/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-fileServer eieio `pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED 2005-04-05 10:30 - new machine serving this filesystem today
    #	635 min = 10h 35m
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
    #	had a failure on many of the jobs here.  Clean up:
    ssh kk9
    cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05/run.blastz
    para time > time.0
    para problems > probs.1
    para recover jobList recoverJobList
    ssh kk
    cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05/run.blastz
    para create recoverJobList
    para try
    para push ... check ... etc ...
    #	with that successfully completed:
    para time > run.time
    ssh eieio
    cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 &
    #	measurements
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05
    time ../../jkStuff/netChainCheck.sh mm6 mm5 xenTro1 > measures.out 2>&1 &
    featureBits mm6 netXenTro1
    #   1033071781 bases of 2597150411 (39.777%) in intersection
    featureBits mm5 netXenTro1
    #   1042210258 bases of 2615483787 (39.848%) in intersection
    featureBits mm6 chainXenTro1
    #   1063392793 bases of 2597150411 (40.945%) in intersection
    featureBits mm5 chainXenTro1
    #   1078618413 bases of 2615483787 (41.240%) in intersection
    featureBits mm6 chainXenTro1Link
    #   67119684 bases of 2597150411 (2.584%) in intersection
    featureBits mm5 chainXenTro1Link
    #   73115446 bases of 2615483787 (2.795%) in intersection

    #	Those are looking good, now to the swap:
    ssh eieio
    cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05
    time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \
	-swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 &
    #	70 minutes
    #	Measurements:

    featureBits xenTro1 netMm6
    #   683225633 bases of 1381238994 (49.465%) in intersection
    featureBits xenTro1 netMm5
    #   697384254 bases of 1381238994 (50.490%) in intersection
    featureBits xenTro1 chainMm6
    #   700638086 bases of 1381238994 (50.725%) in intersection
    featureBits xenTro1 chainMm5
    #   721494705 bases of 1381238994 (52.235%) in intersection
    featureBits xenTro1 chainMm6Link
    #   64584213 bases of 1381238994 (4.676%) in intersection
    featureBits xenTro1 chainMm5Link
    #   76415718 bases of 1381238994 (5.532%) in intersection

#############################################################################
# BLASTZ CHIMP PanTro1 (DONE - 2005-04-05 - 2005-04-15 - Hiram)
    ssh eieio
    mkdir /cluster/data/mm6/bed/blastzPanTro1.2005_04_08
    cd /cluster/data/mm6/bed/blastzPanTro1.2005_04_08

    #	same parameters as Human alignment, except for the use of the
    #	SMSK linSpecRepeats - in this case, using none.  Should be an
    #	interesting comparison if the lineage specific repeats make much
    #	difference in the result.
    cat << '_EOF_' > DEF
# mouse vs. chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse mm6
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chimp panTro1
SEQ2_DIR=/scratch/chimp/panTro1/nib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzPanTro1.2005_04_08

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    #	order by the second column, numerically, descending.
    #	-k2 is the supported spelling of the obsolete "+1" key syntax,
    #	which current sort implementations treat as a file name.
    sort -rn -k2 /cluster/data/panTro1/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-fileServer eieio `pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED 2005-04-06 10:40
    #	489 minutes = 8h 09m
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kksilo
    screen -d -r
Completed: 155570 of 155570 jobs
CPU time in finished jobs:   14707939s  245132.32m  4085.54h  170.23d  0.466 y
IO & Wait Time:                609798s   10163.29m   169.39h    7.06d  0.019 y
Average job time:                  98s       1.64m     0.03h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            5146s      85.77m     1.43h    0.06d
Submission to last job:         20972s     349.53m     5.83h    0.24d

Completed: 331 of 331 jobs
CPU time in finished jobs:        260s       4.33m     0.07h    0.00d  0.000 y
IO & Wait Time:                  1135s      18.92m     0.32h    0.01d  0.000 y
Average job time:                   4s       0.07m     0.00h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:              16s       0.27m     0.00h    0.00d
Submission to last job:           234s       3.90m     0.07h    0.00d

Completed: 40 of 40 jobs
CPU time in finished jobs:       7229s     120.48m     2.01h    0.08d  0.000 y
IO & Wait Time:                   207s       3.46m     0.06h    0.00d  0.000 y
Average job time:                 186s       3.10m     0.05h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             597s       9.95m     0.17h    0.01d
Submission to last job:          1287s      21.45m     0.36h    0.01d

    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzPanTro1.2005_04_08
    featureBits mm6 netPanTro1
    #   2569701404 bases of 2597150411 (98.943%) in intersection
    time featureBits mm6 netHg17
    #	2579747741 bases of 2597150411 (99.330%) in intersection
    featureBits mm6 chainPanTro1
    #   2585896564 bases of 2597150411 (99.567%) in intersection
    time featureBits mm6 chainHg17
    #	2596946329 bases of 2597150411 (99.992%) in intersection
    featureBits mm6 chainPanTro1Link
    #   924893452 bases of 2597150411 (35.612%) in intersection
    featureBits mm6 chainHg17Link (on kolossus)
    #	966916309 bases of 2597150411 (37.230%) in intersection

    # Looks about correct, now for the swap
    ssh eieio
    cd /cluster/data/mm6/bed/blastzPanTro1.2005_04_08
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 &
    #	107 minutes
    featureBits panTro1 netMm6
    #	3306360710 bases of 2733948177 (120.937%) in intersection
    featureBits panTro1 chainMm6
    #	3363239156 bases of 2733948177 (123.018%) in intersection
    featureBits panTro1 chainMm6Link
    #	922583825 bases of 2733948177 (33.745%) in intersection
    featureBits -countGaps panTro1 netMm6
    #	3306360710 bases of 4420375440 (74.798%) in intersection
    featureBits -countGaps panTro1 netHg16
    #	4015411490 bases of 4420375440 (90.839%) in intersection
    featureBits -countGaps panTro1 chainMm6
    #	3363239156 bases of 4420375440 (76.085%) in intersection
    featureBits -countGaps panTro1 chainHg16
    #	4056193816 bases of 4420375440 (91.761%) in intersection
    #	on kolossus:
    HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainHg16Link
    #	2611490291 bases of 4420375440 (59.078%) in intersection
    HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainMm6Link
    #	922583825 bases of 4420375440 (20.871%) in intersection

    #	Appears to be reasonable, check the genome-test browser on both
    #	the Mm6 assembly and the PanTro1 assembly to see if the net and
    #	chain tracks appear and are in the proper order.

#############################################################################
# STS MARKERS DATA DOWNLOAD (DONE - 2005-04-06 - 2005-04-18 - Hiram)
#	Applied a filter to primers.psl - 2005-10-20 - Hiram
#       Removed 404 rows from all_sts_primers where qName had bad format - 2005-11-02 - Jen
    ssh eieio
    mkdir -p /cluster/data/mm6/bed/STSmarkers/downloads
    cd /cluster/data/mm6/bed/STSmarkers/downloads
    # these files appear to be new almost every day
    #	AND, they were incorrect when I fetched them on April 6th,
    #	they were corrected on April 8th
    #	HOWEVER, they still appear to be incorrect.  There are IDs in
    #	the UniSTS_mouse.sts file that do not appear in the aliases file
    #	2005-04-14 Further information on this says that some IDs do not
    #	have aliases, thus need no entries in the aliases file.
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases

    # these map files used to be static for some years, now they appear
    # to be new
    wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*
    #	These files used to be unchanging.  This time they seem to be
    #	new files:
    #	   396858 Jan 28 19:15 10090.MGI.txt
    #	   173344 Mar 16 19:45 10090.WI-Genetic.txt
    #	   240688 Mar 16 19:45 10090.WI-YAC.txt
    #	   390139 Mar 16 20:16 10090.WI_MRC_RH.txt
    #  Will have to watch below to see how these figure into the construction.

    # these reports from jax.org appear to be changing daily
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt


    # back to our work area, update the bed file
    #	to do this we need a new UniSTS_mouse.alias file
    # it is created by a combination of information from several
    # of the above files ! AND ! the previous stsInfoMouse.bed file

    cp -p /cluster/data/mm5/bed/STSmarkers/downloads/*.sh .
    cp -p /cluster/data/mm5/bed/STSmarkers/downloads/*.pl .
    #	There is a line in the fetchAllAliases.sh script that needs to
    #	be updated, it must point to the previous bed file:
    #   BEDFile=/cluster/data/mm5/bed/STSmarkers/stsInfoMouse.bed
    #	Next time, this should read:
    #   BEDFile=/cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed

    #	The perl scripts were reworked, updated, cleaned up, and fixed
    #	to handle a new type of format found in the UniSTS.aliases file.
    #	*!*! ACTUALLY there was an error in the UniSTS.aliases file
    #	format, it was not correct.  Upon submitting a query to Deanna
    #	Church, I got the following answer:
    #	Dear Hiram,

    #	Thanks for reporting formatting problems in UniSTS.aliases file.

    #	It was fixed in the program and correct file was put to ftp site.

    #	-Wonhee Jang
    #	---------------------------------------------------
    #	Wonhee Jang, Ph.D
    #	National Center for Biotechnology Information/NIH
    #	Building 45, Room 5AS43D-49, Bethesda, MD 20894
    #	jang@ncbi.nlm.nih.gov  phone)301-402-9307
    #	fax) 301-480-2484
    #	---------------------------------------------------


    #	This process has been captured in the script:
    #	/cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
    # which uses a couple of perl scripts in that same directory.
    # briefly it is:
    
    # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0
    # grep MGI: UniSTS.aliases > MGI.aliases
    # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \
    #	stsInfoAliases.txt
    # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases
    # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \
    #    | sort -n > UniSTS_mouse.alias

    time ./fetchAllAliases.sh

    # with that, we can create a new stsInfoMouse.bed file:
    cd /cluster/data/mm6/bed/STSmarkers
    /cluster/store5/mouseMarker/code/updateBed.pl \
	/cluster/data/mm5/bed/STSmarkers/stsInfoMouse.bed \
	downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \
	downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \
	downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile

    # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04
    /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed
	
    # copy the stsInfoMouse.bed file from working dir to the marker info storage folder.
    # added 2 new steps by Yontao	
    mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
    cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed

    # comparing to Mm5
    #	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
    #	/cluster/store5/mouseMarker/stsInfoMouse.bed
    58493   778055  6524821 stsInfoMouse.bed_mm5
    58980   784786  6690105 stsInfoMouse.bed

    # and from that, create new primer fa, epcr, etc:
    /cluster/store5/mouseMarker/code/luConvertPrimerToFa \
	stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
    # the mouseC.fa file will be empty
    wc mouse?.*
    #	     0       0       0 mouseC.fa
    #	293305  293251 6624638 mouseP.fa
    #	 32890  164528 2087271 mouseP.info
    #	326195  457779 8711909 total
    #	the equivalent Mm5 files:
    #	     0       0       0 mouseC.fa
    #	286740  286686 6474893 mouseP.fa
    #	 32232  161234 2044810 mouseP.info
    #	318972  447920 8519703 total

    #	copy the primers over to the panasas for the kluster run
    mkdir /panasas/store/mm6/STSmarkers
    cp -p mouseP.fa /panasas/store/mm6/STSmarkers
    cp -p mouseP.info /panasas/store/mm6/STSmarkers

    #  CLUSTER RUN FOR THE STS PRIMERS

    ssh kk9
    mkdir -p /cluster/data/mm6/bed/STSmarkers/primer
    mkdir -p /cluster/data/mm6/bed/STSmarkers/ePCR
    cd /cluster/data/mm6/bed/STSmarkers/primer

    # the mouseP.fa comes from above

    # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 

cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat.2 $(path1) /panasas/store/mm6/STSmarkers/mouseP.fa -ooc=/scratch/hg/h/mouse11.ooc  -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl}
#ENDLOOP
'_EOF_'
    mkdir primers.out

    ls -1S /panasas/store/mm6/fasta/chr*.fa > contig.lst
    gensub2 contig.lst single template jobList
    para create jobList
    para try
    para check
    para push
    #	STARTED - 2005-04-14 15:40 
# Completed: 40 of 40 jobs
# CPU time in finished jobs:     445070s    7417.83m   123.63h    5.15d  0.014 y
# IO & Wait Time:                   463s       7.72m     0.13h    0.01d  0.000 y
# Average job time:               11138s     185.64m     3.09h    0.13d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           53053s     884.22m    14.74h    0.61d
# Submission to last job:         53053s     884.22m    14.74h    0.61d

    # on the file server
    ssh kksilo
    cd /cluster/data/mm6/bed/STSmarkers/primer
    /cluster/bin/i386/pslSort dirs primers.psl temp primers.out
    #	2005-10-20: keep only alignments whose query span (qEnd - qStart,
    #	columns 13 and 12) and target span (tEnd - tStart, columns 17 and
    #	16) differ by fewer than 100 bases in either direction.
    #	This filters out about 973,365 alignments, or
    #	%17.0 = 100.0 * 973365 / 5724127
    pslSort dirs stdout temp primers.out \
	| awk -F"\t" \
	    '{ d = ($13 - $12) - ($17 - $16); if (d > -100 && d < 100) print }' \
	> primers.psl.100
    rmdir temp

    # a rough comparison with previous results:
    wc primers.psl.100  (after applying filter to primers.psl)
    #	4750762   99765920  495766873 primers.psl.100
    wc primers.psl  (before applying filter to primers.psl)
    #	5724127 120206606 615248041 primers.psl
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl
    #	5719969 120119288 590806241
    wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl
    #	5745617 120657896 592135728 

    # another kluster run for the ePCR
    ssh kk9
    cd /cluster/data/mm6/bed/STSmarkers/ePCR
    ls -1S /panasas/store/mm6/fasta/chr*.fa > contig.lst
    mkdir epcr.out
    cat << '_EOF_' > runPCR.csh
#!/bin/csh -fe
/cluster/bin/i386/e-PCR $1 $2 N=1 M=50 W=5 > $3
'_EOF_'
    # emacs happy ?
    chmod +x runPCR.csh

    cat << '_EOF_' > template
#LOOP
./runPCR.csh /panasas/store/mm6/STSmarkers/mouseP.info $(path1) {check out line+ epcr.out/$(num1).epcr}
#ENDLOOP
'_EOF_'
    # the mouseP.info was created above
    gensub2 contig.lst single template jobList
    para create jobList
    para try
    para check
    para push
    ... etc ...
# Completed: 40 of 40 jobs
# CPU time in finished jobs:      77676s    1294.60m    21.58h    0.90d  0.002 y
# IO & Wait Time:                   370s       6.17m     0.10h    0.00d  0.000 y
# Average job time:                1951s      32.52m     0.54h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6173s     102.88m     1.71h    0.07d
# Submission to last job:          6173s     102.88m     1.71h    0.07d

    ssh hgwdev
    cd /cluster/data/mm6/bed/STSmarkers/ePCR
    # all those results become all.epcr
    cat epcr.out/*.epcr > all.epcr

    # comparing to previous results:
    wc all.epcr
    #	55871  223484 3086148 all.epcr
    wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
    #	55677  222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
    wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
    #	74705  298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
    #	Mm4 seems to be out of whack

    cd /cluster/data/mm6/bed/STSmarkers/primer

    /cluster/bin/scripts/filterSTSPrimers \
    -mouse ../stsInfoMouse.bed primers.psl.100 \
        ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat

    #  The output should show an increasing count:
    #	Reading name info
    #	Reading primer info
    #	Processing file
    #	100000
    #	200000
    #	300000
    #	...
    #	5700000
    #	Determining ePCR not found
    #
    wc primers.psl.filter.blat  (after applying filter to primers.psl above)
    #	33128  695688 3542978 primers.psl.filter.blat
    wc primers.psl.filter.blat  (before applying filter to primers.psl above)
    #	33662  706902 3605847 primers.psl.filter.blat
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat
    #	33476  702996 3442402 
    wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl.filter.blat
    #	32729  687309 3331894
    #	It appears Mm4 became sane after the filter

    # create accession_info.rdb
    touch empty_sequence.inf
    /cluster/bin/scripts/compileAccInfo -mouse \
	/cluster/data/mm6 empty_sequence.inf
    # works with errors on missing randoms, etc...:
    # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory
    # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory
    mv accession_info.rdb accession_info.rdb.tmp
    /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \
	accession_info.rdb
    rm accession_info.rdb.tmp
    # comparing results to previous
    wc accession_info.rdb
    #	93052 1023576 6824900 accession_info.rdb
    wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb
    #	131845 1450299 9681940
    wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb
    #	86935  956289 6374930 

    # creates epcr.not.found.nomatch and epcr.not.found.psl
    #	/cluster/bin/scripts/epcrToPsl
    #	Fixed this script to make it not look for contigs in the usual
    #	manner, we don't have those for this assembly	
    ./epcrToPsl -mouse \
	epcr.not.found ../mouseP.info \
	accession_info.rdb /cluster/data/mm6

    # Comparing results to previous:
    wc epcr*
    #	 467    1868   17135 epcr.not.found
    #	  63     756    6041 epcr.not.found.nomatch
    #	 404    8484   40254 epcr.not.found.psl
    #	 158     535    4308 epcrToPsl
    #	1092   11643   67738 total

    # Mm5 wc epcr*
    wc /cluster/data/mm5/bed/STSmarkers/primer/epcr*
    #	 463    1852   17080 epcr.not.found
    #	  61     732    5845 epcr.not.found.nomatch
    #	 398    8358   38591 epcr.not.found.psl
    #	 402    8442   39011 epcr.not.found.psl.orig
    #	1324   19384  100527 total

    # Mm4 wc epcr*
    wc /cluster/data/mm4/bed/STSmarkers/primer/epcr*
    #	328    1312   12011 epcr.not.found
    #	 57     684    5474 epcr.not.found.nomatch
    #	266    5586   25711 epcr.not.found.psl
    #	163     552    4370 epcrToPsl
    #	814    8134   47566 total

    cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
    wc primers.psl.filter  (after filter applied above to primers.psl)
    #	33532  704172 3583232 primers.psl.filter
    wc primers.psl.filter  (before filter applied above to primers.psl)
    #	34066  715386 3646101 primers.psl.filter

    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
    # 33691  707511 3601164 primers.psl.filter.lifted

    # create primers.psl.filter.lifted.initial
    PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \
	primers.psl.filter
    wc primers.psl.filter.initial (after filter applied above to primers.psl)
    #	33514  201084 1786769 primers.psl.filter.initial
    wc primers.psl.filter.initial (before filter applied above to primers.psl)
    #	34048  204288 1815222 primers.psl.filter.initial
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
    # 33689  202134 1799016 primers.psl.filter.lifted.initial

    # create primers.psl.filter.lifted.initial.acc
    /cluster/bin/scripts/findAccession -agp \
	-mouse primers.psl.filter.initial /cluster/data/mm6
    #	it complains about missing _random items, it is OK
    wc primers.psl.filter.initial.acc (after filter applied above to primers.psl)
    #	33514  234598 2120939 primers.psl.filter.initial.acc
    wc primers.psl.filter.initial.acc (before filter applied above to primers.psl)
    #	34048  238336 2154798 primers.psl.filter.initial.acc

    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial.acc
    # 33689  235823 2158029 primers.psl.filter.lifted.initial.acc

    # this needs to be -rat as that specifies how to scan the
    # stsInfoMouse.bed file and it does not work if you use -mouse
    /cluster/bin/scripts/getStsId -rat \
	../stsInfoMouse.bed  primers.psl.filter.initial.acc \
	> primers.initial.acc.trans
    #	No id for 61645_RH126840
    #	No id for 4187_D10MIT171.2
    #	No id for 63449_RH125771
    #	No id for 67188_PMC99911P4
    #	No id for 8839_D6MIT360.1
    #	No id for 62732_RH126829
    #	No id for 63746_RH127126

    wc primers.initial.acc.trans  (after filter applied to primers.psl above)
    #	33507  234549 1800766 primers.initial.acc.trans
    wc primers.initial.acc.trans  (before filter applied to primers.psl above)
    #	34041  238287 1829724 primers.initial.acc.trans

    sort -k 4n primers.initial.acc.trans > primers.final
    wc primers.final  (after filter applied to primers.psl above)
    #	33507  234549 1800766 primers.final
    wc primers.final  (before filter applied to primers.psl above)
    #	34041  238287 1829724 primers.final
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.final 
    #	33689  235823 1834889 /cluster/data/mm5/bed/STSmarkers/primer/primers.final

    #	primers.final now exists, remove the intermediate files.  There is
    #	no lift step for this assembly, so the findAccession output is
    #	primers.psl.filter.initial.acc (no ".lifted" in the name).
    rm primers.psl.filter.initial.acc primers.initial.acc.trans

    cd /cluster/data/mm6/bed/STSmarkers
    # stsMarkers.final is empty for mouse
    touch stsMarkers.final dummy
    PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \
	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
    wc stsMarkers_pos.rdb  (after filter applied to primers.psl above)
    #	31889  223223 1881886 stsMarkers_pos.rdb
    wc stsMarkers_pos.rdb  (before filter applied to primers.psl above)
    #	32350  226450 1909506 stsMarkers_pos.rdb
    wc /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
    # 32085  224595 1862816 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
    wc /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb
    # 31270  218890 1869417 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb

    /projects/cc/hg/ytlu/bin/script/perl/createStsBed \
	stsInfoMouse.bed  stsMarkers_pos.rdb 500 > stsMapMouse.bed
    wc stsMapMouse.bed   (after filter applied to primers.psl above)
    #	28713  298319 2072647 stsMapMouse.bed
    wc stsMapMouse.bed   (before filter applied to primers.psl above)
    #	29079  301678 2097544 stsMapMouse.bed

    wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed
    #	29069  301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed

    #  loading STS markers tables
    ssh hgwdev
    cd /cluster/data/mm6/bed/STSmarkers
    cp -p /cluster/store6/mm5/bed/STSmarkers/ucscAlias.pl .
    ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
    #	this does leave messages in ucscStsAlias.warnings but they seem
    #	to be the same as they were in Mm5
     
    wc ucscStsAlias.tab
    #	141585  424725 3284106 ucscStsAlias.tab

    wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
    # 126624  379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
     
    hgsql -e "drop table stsAlias;" mm6
    hgsql mm6 < ~/kent/src/hg/lib/stsAlias.sql
    hgsql -e \
	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm6
    hgsql -e "drop table stsMapMouseNew;" mm6
    hgsql mm6 < ~/kent/src/hg/lib/stsMapMouseNew.sql
    hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm6
    hgsql -e "drop table stsInfoMouseNew;" mm6
    hgsql mm6 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
    hgsql -e \
     'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm6

    hgLoadPsl -nobin -table=all_sts_primer mm6 primer/primers.psl.filter
    #	load of all_sts_primer did not go as planned: 33532 record(s),
    #	0 row(s) skipped, 14 warning(s) loading primer/primers.psl.filter

    #	load of all_sts_primer did not go as planned: 34066 record(s),
    #	    0 row(s) skipped, 14 warning(s) loading primer/primers.psl.filter

    # load primer sequences	
    mkdir /gbdb/mm6/stsMarker
    ln -s /cluster/data/mm6/bed/STSmarkers/mouseP.fa \
	/gbdb/mm6/stsMarker/mouseP.fa
    # PLEASE NOTE: if you are going to reload this business, use the
    #	-replace option on this hgLoadSeq:
    #	hgLoadSeq -replace mm6 /gbdb/mm6/stsMarker/mouseP.fa
    # otherwise the seq and extFile tables will be out of sync.
    hgLoadSeq mm6 /gbdb/mm6/stsMarker/mouseP.fa
    #  Adding /gbdb/mm6/stsMarker/mouseP.fa
    #  32890 sequences

    #	After applying filter to primers.psl above
    featureBits mm6 all_sts_primer
    #	3706406 bases of 2597150411 (0.143%) in intersection
    featureBits mm6 stsMapMouseNew
    #	4638338 bases of 2597150411 (0.179%) in intersection
    featureBits mm7 stsMapMouseNew
    #	4805958 bases of 2583394090 (0.186%) in intersection

    #	Before applying filter to primers.psl above
    featureBits mm6 all_sts_primer
    #	3735649 bases of 2597150411 (0.144%) in intersection
    featureBits mm5 all_sts_primer
    #	3727268 bases of 2615483787 (0.143%) in intersection
    featureBits mm6 stsMapMouseNew
    #	4736039 bases of 2597150411 (0.182%) in intersection
    featureBits mm5 stsMapMouseNew
    #	4719679 bases of 2615483787 (0.180%) in intersection

    hgsql -N mm6 -e "select count(*) from stsAlias;"
    #	137738
    hgsql -N mm5 -e "select count(*) from stsAlias;"
    #	122944
    hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
    #	58980
    hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
    #	58493

    #	compare old and new name lists:
    awk '{print $4}' /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed | \
	sort -u > mm5.nameList
    awk '{print $4}' stsMapMouse.bed | sort -u > mm6.nameList
    #	After applying filter to primers.psl above
    comm -12 mm?.nameList | wc
    #	27109   27109  264019		<- 27,109 names in common
    comm -23 mm5.nameList mm6.nameList | wc
    #	527     527    4617		<- 527 unique to mm5 list
    comm -13 mm5.nameList mm6.nameList | wc
    #	399     399    3646		<- 399 unique to mm6 list

    #	Before applying filter to primers.psl above
    comm -12 mm?.nameList | wc
    #	27454   27454  266951		<- 27,454 names in common
    comm -23 mm5.nameList mm6.nameList | wc
    #	182     182    1685		<- 182 unique to mm5 list
    comm -13 mm5.nameList mm6.nameList | wc
    #	1625    1625   15090		<- 1,625 unique to mm6 list

#############################################################################
# HGCENTRAL DEFAULTDB UPDATE (DONE - 2005-04-08 - Hiram)
    #	May as well make this assembly be the default on hgwdev
    ssh hgwdev
    hgsql hgcentraltest \
	-e 'update defaultDb set name="mm6" where genome="Mouse";'

#############################################################################
# 10-WAY VAR_MULTIZ - SECOND TIME, PERHAPS CORRECTLY (WORKING 2005-11-30)
#		- Hiram
    ssh kkstore01

    mkdir /cluster/data/mm6/bed/multiz10way.2005-11-30
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30

    mkdir mafLinks
    mkdir mafLinks/rn3
    mkdir mafLinks/hg17
    mkdir mafLinks/canFam2
    mkdir mafLinks/bosTau1
    mkdir mafLinks/monDom1
    mkdir mafLinks/galGal2
    mkdir mafLinks/xenTro1
    mkdir mafLinks/danRer3
    mkdir mafLinks/tetNig1

    export H=/cluster/data/mm6/bed
    ln -s ${H}/blastzRn3.2005-11-30/mafNet/*.maf.gz ./mafLinks/rn3
    ln -s ${H}/blastzHg17.2005-11-30/mafNet/*.maf.gz ./mafLinks/hg17
    ln -s ${H}/blastzCanFam2.2005-12-02/mafNet/*.maf.gz ./mafLinks/canFam2
    ln -s ${H}/blastzBosTau1.2005_03_18/mafNet/*.maf.gz ./mafLinks/bosTau1
    ln -s ${H}/blastzMonDom1.2005_04_01/mafNet/*.maf.gz ./mafLinks/monDom1
    ln -s ${H}/blastzGalGal2.2005-11-30/mafNet/*.maf.gz ./mafLinks/galGal2
    ln -s ${H}/blastzXenTro1.2005_04_05/mafNet/*.maf.gz ./mafLinks/xenTro1
    ln -s ${H}/blastzDanRer3.2005-08-05/mafNet/*.maf.gz ./mafLinks/danRer3
    ln -s ${H}/blastzTetNig1.2005_03_17/mafNet/*.maf.gz ./mafLinks/tetNig1

    
    #	Copy MAFs to san for pk kluster run
    mkdir /san/sanvol1/scratch/mm6/multiz10way.2005-11-30
    cd /san/sanvol1/scratch/mm6/multiz10way.2005-11-30
    mkdir mafs
    rsync -a --copy-links --progress \
	/cluster/data/mm6/bed/multiz10way.2005-11-30/mafLinks/ ./mafs/
    #	3.3 Gb of data to copy, about 10 minutes or so
    mkdir penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn

    #	And for the kluster run
    ssh pk
    #	this directory was already created on kkstore01 above, so use -p
    #	to avoid a "File exists" error here
    mkdir -p /cluster/data/mm6/bed/multiz10way.2005-11-30
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30
    #	maf = final per-chrom results, run = cluster job directory
    mkdir -p maf run
    cd run

    # create scripts to run var_multiz on cluster

cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
    # oneMultiz.csh - run one multiz step for a single chromosome.
    #
    # usage:
    #	oneMultiz.csh <chrom>
    #	    cleanup: remove the scratch directory for <chrom>
    #	oneMultiz.csh <chrom> <dir> <outFile>
    #	    save: copy <chrom>.maf from scratch subdirectory <dir> to <outFile>
    #	oneMultiz.csh <chrom> <s1> <s2> <flag>
    #	    align: combine inputs <s1> and <s2>, leaving the result in
    #	    scratch subdirectory <s1><s2>.  <flag> is handed to multiz as is.
    set c = $1
    set multi = /scratch/mm6/multiz10way.$c
    set pairs = /san/sanvol1/scratch/mm6/multiz10way.2005-11-30/mafs

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
        rm -fr $multi
        exit
    endif

    # special mode --
    # with 3 args, saves an alignment file
    if ($#argv == 3) then
        cp $multi/$2/$c.maf $3
        exit
    endif

    set s1 = $2
    set s2 = $3
    set flag = $4

    # locate input files -- in pairwise dir, or multiple dir
    # A name that exists as a directory under $pairs is read as a gzipped
    # maf from there; any other name is read as an uncompressed maf left
    # under $multi by an earlier step.  Either way the input is staged as
    # an uncompressed copy in /tmp.
    set d1 = $multi
    set d2 = $multi
    if (-d $pairs/$s1) then
        set d1 = $pairs
	set f1 = $d1/$s1/$c.maf.gz
	set t1 = /tmp/$s1.$c.maf
	zcat $f1 > $t1
    else
	set f1 = $d1/$s1/$c.maf
	set t1 = /tmp/$s1.$c.maf
	cp -p $f1 $t1
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
	set f2 = $d2/$s2/$c.maf.gz
	set t2 = /tmp/$s2.$c.maf
	zcat $f2 > $t2
    else
	set f2 = $d2/$s2/$c.maf
	set t2 = /tmp/$s2.$c.maf
	cp -p $f2 $t2
    endif
    # write to output dir
    set out = $multi/${s1}${s2}
    mkdir -p $out

    # check for empty input file
    # Both inputs non-empty: align them.  Only one non-empty: it becomes the
    # result unchanged.  NOTE(review): when both are empty no $c.maf is
    # written to $out at all -- confirm later steps tolerate that.
    if (-s $t1 && -s $t2) then
        echo "Aligning $f1 $f2 $flag"
	# multiz stdout is kept as full.maf; the two extra file arguments
	# are collected as unused1/unused2 and appended after it
	/san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/multiz \
	    $t1 $t2 $flag $out/$c.unused1.maf \
		$out/$c.unused2.maf > $out/$c.full.maf
        cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                $out/$c.tmp.maf
        echo "Ordering $c.maf"
	/san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/maf_project \
		$out/$c.tmp.maf mm6.$c > $out/$c.maf
    else if (-s $t1) then
        cp -p $t1 $out/$c.maf
    else if (-s $t2) then
        cp -p $t2 $out/$c.maf
    endif
    # Always remove both staged copies, including empty ones, so that no
    # stray files accumulate in /tmp on the cluster nodes.
    rm -f $t1 $t2
'EOF'
# happy emacs
    chmod +x oneMultiz.csh
    cp -p oneMultiz.csh \
	/san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/oneMultiz.csh

cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
    # allMultiz.csh <chrom>
    # Drives every oneMultiz.csh step needed to build the multiple
    # alignment for one chromosome, then saves the result and cleans up.
    # multiple alignment steps:
set c = $1
set s = "/san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/oneMultiz.csh"

# Each four-argument call combines two inputs; oneMultiz.csh names the
# result by concatenating the two input names, which is why the first
# argument grows at each step.  The order of the calls matters: every
# concatenated name must have been produced by an earlier line.
# The trailing 1 or 0 is handed through to multiz unchanged.
# NOTE(review): meaning of that flag is defined by multiz -- confirm
# against the multiz usage message.
$s $c hg17 rn3  1
$s $c bosTau1 canFam2 0
$s $c hg17rn3  bosTau1canFam2 1
$s $c hg17rn3bosTau1canFam2 monDom1 1
$s $c hg17rn3bosTau1canFam2monDom1 galGal2 1
$s $c hg17rn3bosTau1canFam2monDom1galGal2 xenTro1 1
$s $c danRer3 tetNig1 0
$s $c hg17rn3bosTau1canFam2monDom1galGal2xenTro1 danRer3tetNig1 1
# get final alignment file
# (three-argument form: copy the last step's result to the maf directory)
$s $c hg17rn3bosTau1canFam2monDom1galGal2xenTro1danRer3tetNig1 \
    /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/$c.maf
#cleanup
# (one-argument form: remove the per-chromosome scratch directory)
$s $c
'EOF'
# happy emacs
    chmod +x allMultiz.csh

cat  << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/$(root1).maf}
#ENDLOOP
'EOF'

    awk '{print $1}' ../../../chrom.sizes > chrom.lst

    gensub2 chrom.lst single template jobList
    para create jobList
    para try; para check
    para push
#	XXXX - running 2005-12-05 16:30
# Completed: 40 of 40 jobs
# CPU time in finished jobs:     124610s    2076.83m    34.61h    1.44d 0.004 y
# IO & Wait Time:                  1331s      22.18m     0.37h    0.02d 0.000 y
# Average job time:                3149s      52.48m     0.87h    0.04d
# Longest finished job:           12711s     211.85m     3.53h    0.15d
# Submission to last job:         12711s     211.85m     3.53h    0.15d


    ssh kkstore01
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30
    catDir maf > multiz10wayU1.maf
    #	~ 3.5 minutes

    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30
    mkdir /gbdb/mm6/multiz10wayU1
    ln -s /cluster/data/mm6/bed/multiz10way.2005-11-30/multiz10wayU1.maf \
	/gbdb/mm6/multiz10wayU1/multiz10wayU1.maf
    time hgLoadMaf mm6 multiz10wayU1
    #	real    18m22.810s
    time hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 mm6 \
	multiz10wayU1Summary multiz10wayU1.maf
    #	real    20m45.326s

############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS Second time with new multiz10way
#		(WORKING - 2005-12-06 - Hiram)

# Estimate phastCons parameters
    ssh kkstore01
    mkdir /cluster/data/mm6/bed/multiz10way.2005-11-30/cons
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30/cons

    # Create a starting-tree.mod based on chr2 (the biggest maf)
    /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \
	--refseq ../../../2/chr2.fa --in-format MAF \
	--windows 100000000,1000 --out-format SS \
	--between-blocks 5000 --out-root s1
    #	~4 minutes

    # Fit the starting tree model to the chr2 SS windows made by msa_split,
    # using the 10-way species topology.
    # NOTE(review): --out-root starting-tree should name the output
    # starting-tree.mod, yet the grep below (and makeTree.csh later on) read
    # phyloFit.mod -- confirm which file name was actually produced.
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
	--tree "((((((hg17,(mm6,rn3)),(canFam2,bosTau1)),monDom1),galGal2),xenTro1),(tetNig1,danRer3))" \
    --out-root starting-tree
    #	about 45 minutes
    rm s1.*.ss
    # add up the C and G:
    # (sums fields 3 and 4 of the BACKGROUND line of the .mod file)
    grep BACKGROUND phyloFit.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.408
    #	This 0.408 is used in the --gc argument below

    # Create big bad bloated SS files on san filesystem (takes ~ 1h 05m)
    ssh kkstore01
    mkdir -p  /san/sanvol1/scratch/mm6/cons/ss
    cd /san/sanvol1/scratch/mm6/cons/ss
    for C in `awk '{print $1}' /cluster/data/mm6/chrom.sizes`
    do
      if [ -s /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/${C}.maf ]; then
	mkdir ${C}
	echo msa_split $C
	chrN=${C/chr/}
	chrN=${chrN/_random/}
	/cluster/bin/phast/$MACHTYPE/msa_split \
	    /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/${C}.maf \
	    --refseq /cluster/data/mm6/${chrN}/${C}.fa \
	    --in-format MAF --windows 1000000,0 --between-blocks 5000 \
	    --out-format SS --out-root ${C}/${C}
      fi
    done
    #	real     63m41.485s

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    ls -1l chr*/chr*.ss | grep -v random | \
	awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh pk
    mkdir /san/sanvol1/scratch/mm6/cons/treeRun1
    cd /san/sanvol1/scratch/mm6/cons/treeRun1
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate
    # Create little script that calls phastCons with right arguments
    #	--target-coverage of 0.20 is about right for mouse as a starting
    #	point; the script below uses 0.17, the value marked further down
    #	as the one finally used.
    #	makeTree.csh takes one argument: a <chrom>/<file>.ss path relative
    #	to ../ss, as listed in randomSs.list.  The directory part of that
    #	path ($1:h) names the log/ and tree/ subdirectories it creates.
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
    /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
      /cluster/data/mm6/bed/multiz10way.2005-11-30/cons/phyloFit.mod \
      --gc 0.408 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-lengths 12 --target-coverage 0.17 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #	emacs happy
    chmod a+x makeTree.csh

    # Create gensub file
    cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kkstore01
    cd /san/sanvol1/scratch/mm6/cons/treeRun1
    ls -1 tree/chr*/*.cons.mod > cons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ../ave.cons.mod > cons_summary.txt
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ../ave.noncons.mod > noncons_summary.txt
    cd ..
    cp -p ave.*.mod /cluster/data/mm6/bed/multiz10way.2005-11-30/cons

    #	measuring entropy
    #	consEntropy <target coverage> <expected lengths>
    #		 ave.cons.mod ave.noncons.mod --NH 9.78
    #	never stops with the --NH argument
    /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
                        ave.cons.mod ave.noncons.mod
### !!! ***  This one with .17 and 12 is the one that was finally used
#Transition parameters:gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
#Relative entropy: H=1.461641 bits/site
#Expected min. length: L_min=6.838719 sites
#Expected max. length: L_max=5.059638 sites
#Phylogenetic information threshold: PIT=L_min*H=9.995752 bits
    #	We are aiming for PIT to be near 10 (aka total entropy)
    #	This is good enough.  Tuning wasn't necessary this time because
    #	the tuning that was done the first time evidently was still good
    #	for this one.

    ssh pk
    # Create cluster dir to do main phastCons run
    mkdir /san/sanvol1/scratch/mm6/cons/consRun1
    cd /san/sanvol1/scratch/mm6/cons/consRun1
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    #	doPhast.csh <chrom> <ssRoot>
    #	  <chrom>  - chromosome name; selects ../ss/<chrom>/ as input and
    #	             ppRaw/<chrom>/ and bed/<chrom>/ as output directories
    #	  <ssRoot> - name of the SS file without its .ss suffix
    #	Copies the SS file and both averaged models to node-local
    #	/scratch/tmp/<ssRoot>, runs phastCons there, moves the .pp and
    #	.bed results back into this run directory and removes the
    #	scratch copies.
    cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
# -p so that a job retried on the same node does not abort (csh -e) on a
# scratch directory left behind by an earlier failed attempt
mkdir -p /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
   --expected-length 12 --target-coverage 0.17 --quiet \
	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
# remove the scratch copies so that the rmdir below succeeds
rm /scratch/tmp/${2}/ave.*cons.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # emacs happy
    chmod a+x doPhast.csh

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
# Completed: 3098 of 3098 jobs
# CPU time in finished jobs:      11377s     189.61m     3.16h    0.13d  0.000 y
# IO & Wait Time:                 88563s    1476.06m    24.60h    1.03d  0.003 y
# Average job time:                  32s       0.54m     0.01h    0.00d
# Longest finished job:             292s       4.87m     0.08h    0.00d
# Submission to last job:           627s      10.45m     0.17h    0.01d


    # combine predictions and transform scores to be in 0-1000 interval
    #	it uses a lot of memory, so on kolossus:
    ssh kolossus
    cd /san/sanvol1/scratch/mm6/cons/consRun1
    #	The sed's and the sort get the file names in chrom,start order
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed

    #	~ 1m 12s

    # Figure out how much is actually covered by the bed files as so:
    #	The 2597150151 comes from the non-n genome size,
    #	from faSize on all chroms:
    ssh kkstore01
    cd /cluster/data/mm6
    faSize ?/chr*.fa ??/chr*.fa
    #	3079633452 bases (482483301 N's 2597150151 real 1486015217
    #	upper 1111134934

    cd /san/sanvol1/scratch/mm6/cons/consRun1
    awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/2597150151\n",100.0*sum/2597150151,sum}' \
	mostConserved.bed
    #	-target-coverage 0.17: % 5.29 = 100.0*137323490/2597150151 length 12

    cp -p mostConserved.bed /cluster/data/mm6/bed/multiz10way.2005-11-30

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30
    hgLoadBed -strict mm6 phastConsElementsU1 mostConserved.bed
    #	Loaded 2356669 elements of size 5
    #	~5 minute load time
    featureBits mm6 -enrichment refGene:cds phastConsElementsU1
# -target-coverage 0.17 and expected lengths 12:
# refGene:cds 1.013%, phastConsElementsU1 5.287%, both 0.694%, cover 68.54%,
#	enrich 12.96x

    # Create merged posterior probability file and wiggle track data files
    # the sed business gets the names sorted by chromName, chromStart
    #	so that everything goes in numerical order into wigEncode
    ssh kkstore01
    cd /san/sanvol1/scratch/mm6/cons/consRun1
    find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	    | wigEncode stdin phastCons10U1.wig phastCons10U1.wib
    # about 22 minutes for above
# -rw-rw-r--   1 1975849149 Dec  6 14:50 phastCons10U1.wib
# -rw-rw-r--   1  253234710 Dec  6 14:50 phastCons10U1.wig

    cp -p phastCons10U1.wig phastCons10U1.wib \
	/cluster/data/mm6/bed/multiz10way.2005-11-30

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30
    ln -s `pwd`/phastCons10U1.wib /gbdb/mm6/wib/phastCons10U1.wib
    hgLoadWiggle mm6 phastCons10U1 phastCons10U1.wig
    #  ~ 3 minute load

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30
    time hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=mm6 phastCons10U1 > histogram.data 2>&1
    #	about 23 minutes to scan all data

    #	prepare compressed copy of ascii data values for downloads
    ssh pk
    cd /san/sanvol1/scratch/mm6/cons/consRun1
    #	gzipAscii.sh: for every ppRaw/chr* directory, concatenate its
    #	files in chromStart order (same sed/sort trick as the wigEncode
    #	step above) and gzip the result to
    #	phastCons10Scores/<chrom>.data.gz.  Run from the consRun1 directory.
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

mkdir -p phastCons10Scores

for D in ppRaw/chr*
do
    # strip the leading directory with a POSIX prefix removal; the
    # ${D/ppRaw\/} form is a bash extension and fails under a strict /bin/sh
    C=${D#ppRaw/}
    out=phastCons10Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    # the sed pair turns / . and - into space-separated tokens so that
    # sort can key on the chrom name (field 7) and numeric start (field 9),
    # then restores the original file names
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
	    gzip > ${out}
done
'_EOF_'
    #	happy emacs
    chmod +x gzipAscii.sh
    time ./gzipAscii.sh
    #	takes about 37 minutes, makes 2.9 Gb of data
    #	copy them for downloads
    ssh kkstore01
    mkdir /cluster/data/mm6/bed/multiz10way.2005-11-30/phastCons10Scores
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30/phastCons10Scores
    rsync -a --progress \
	pk:/san/sanvol1/scratch/mm6/cons/consRun1/phastCons10Scores/ .
    #	~5 minute copy

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores
    cd /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores
    ln -s /cluster/data/mm6/bed/multiz10way.2005-11-30/phastCons10Scores/*.gz .

    #	prepare maf downloads
    ssh kkstore01
    cd /cluster/data/mm6/bed/multiz10way.2005-11-30
    mkdir mafDownloads
    for M in maf/chr*.maf
    do
	B=`basename $M`
	cp -p ${M} mafDownloads/${B}
	gzip mafDownloads/${B}
	echo ${B} done
    done
    #	Creates 2.7 gb of data

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/mm6/multiz10way
    cd /usr/local/apache/htdocs/goldenPath/mm6/multiz10way
    ln -s /cluster/data/mm6/bed/multiz10way.2005-11-30/mafDownloads/*.maf.gz .

#############################################################################
# 10-WAY VAR_MULTIZ - ALIGNMENTS (DONE - 2005-04-08 - 2005-04-18 - Hiram)
    ssh eieio
    mkdir /cluster/data/mm6/bed/multiz10way
    cd /cluster/data/mm6/bed/multiz10way
    mkdir mafLinks
    
    
    mkdir mafLinks/rn3
    mkdir mafLinks/hg17
    mkdir mafLinks/canFam1
    mkdir mafLinks/bosTau1
    mkdir mafLinks/monDom1
    mkdir mafLinks/galGal2
    mkdir mafLinks/xenTro1
    mkdir mafLinks/danRer2
    mkdir mafLinks/tetNig1

    export H=/cluster/data/mm6/bed
    ln -s ${H}/blastzRn3.2005_03_22/mafNet/*.maf.gz ./mafLinks/rn3
    ln -s ${H}/blastzHg17.2005_03_14/mafNet/*.maf.gz ./mafLinks/hg17
    ln -s ${H}/blastzCanFam1.2005_03_18/mafNet/*.maf.gz ./mafLinks/canFam1
    ln -s ${H}/blastzBosTau1.2005_03_18/mafNet/*.maf.gz ./mafLinks/bosTau1
    ln -s ${H}/blastzMonDom1.2005_04_01/mafNet/*.maf.gz ./mafLinks/monDom1
    ln -s ${H}/blastzGalGal2.2005_03_31/mafNet/*.maf.gz ./mafLinks/galGal2
    ln -s ${H}/blastzXenTro1.2005_04_05/mafNet/*.maf.gz ./mafLinks/xenTro1
    ln -s ${H}/blastzDanRer2.2005_03_17/mafNet/*.maf.gz ./mafLinks/danRer2
    ln -s ${H}/blastzTetNig1.2005_03_17/mafNet/*.maf.gz ./mafLinks/tetNig1

    #	Copy MAFs to Iservers for kluster run
    ssh kkr1u00
    mkdir /iscratch/i/mm6/multiz10way
    cd /iscratch/i/mm6/multiz10way
    rsync -a --copy-links --progress \
	/cluster/data/mm6/bed/multiz10way/mafLinks/ .
    #	We have about 3.2 Gb of data here, takes ~ 15-20 minutes to copy over
    #	At least it does today, something is fishy with the connection.
    mkdir penn
    cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/maf_project penn
    /cluster/bin/iSync

#       Progressive alignment up the tree w/o stager, 
#       using multiz.v10 (var_multiz)
#       Method: align internal subtrees (using 0 flag to var_multiz)
#               Then, align these to human (using 1 flag to var_multiz)
#       NOTE: must use maf_project after each multiz run, in order
#       to order output.  Single-cov guaranteed by use of net MAF's,
#       so it is not necessary to run single_cov2.

    ssh eieio
    cd /cluster/data/mm6/bed/multiz

    # make output dir and run dir
    ssh kki
    cd /cluster/data/mm6/bed/multiz10way
    mkdir -p maf
    mkdir -p run
    cd run

    # create scripts to run var_multiz on cluster

# oneMultiz.csh: one step of the progressive var_multiz alignment for a
# single chromosome.  The number of arguments selects the mode:
#   oneMultiz.csh <chrom>                      remove the per-chrom work dir
#   oneMultiz.csh <chrom> <name> <outFile>     copy <name>/<chrom>.maf out
#   oneMultiz.csh <chrom> <s1> <s2> <flag>     align <s1> with <s2>
# An input name that is a directory under the pairwise location is read as
# a gzipped pairwise maf; otherwise it is read from the per-chrom work dir
# as the product of an earlier step.  The result is written to
# <work dir>/<s1><s2>/<chrom>.maf.
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
    set c = $1
    set multi = /scratch/mm6/multiz10way.$c
    set pairs = /iscratch/i/mm6/multiz10way

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
        rm -fr $multi
        exit
    endif

    # special mode --
    # with 3 args, saves an alignment file
    if ($#argv == 3) then
        cp $multi/$2/$c.maf $3
        exit
    endif

    set s1 = $2
    set s2 = $3
    set flag = $4

    # locate input files -- in pairwise dir, or multiple dir
    # each input is copied (uncompressed) to /tmp before aligning
    set d1 = $multi
    set d2 = $multi
    if (-d $pairs/$s1) then
        set d1 = $pairs
        set f1 = $d1/$s1/$c.maf.gz
        set t1 = /tmp/$s1.$c.maf
        zcat $f1 > $t1
    else
        set f1 = $d1/$s1/$c.maf
        set t1 = /tmp/$s1.$c.maf
        cp -p $f1 $t1
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
        set f2 = $d2/$s2/$c.maf.gz
        set t2 = /tmp/$s2.$c.maf
        zcat $f2 > $t2
    else
        set f2 = $d2/$s2/$c.maf
        set t2 = /tmp/$s2.$c.maf
        cp -p $f2 $t2
    endif
    # write to output dir
    set out = $multi/${s1}${s2}
    mkdir -p $out

    # check for empty input file
    if (-s $t1 && -s $t2) then
        echo "Aligning $f1 $f2 $flag"
        /iscratch/i/mm6/multiz10way/penn/multiz $t1 $t2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf
        cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                $out/$c.tmp.maf
        echo "Ordering $c.maf"
        /iscratch/i/mm6/multiz10way/penn/maf_project $out/$c.tmp.maf mm6.$c > $out/$c.maf
    else if (-s $t1) then
        # only the first input has data: pass it through unchanged
        cp -p $t1 $out/$c.maf
    else if (-s $t2) then
        # only the second input has data: pass it through unchanged
        cp -p $t2 $out/$c.maf
    endif
    # remove both /tmp copies in every case, so that zero-length copies of
    # empty inputs are not left behind on the cluster node
    rm -f $t1 $t2
'EOF'
# << keep emacs coloring happy
    chmod +x oneMultiz.csh

    #	Copy this script to iscratch
    ssh kkr1u00
    cd /iscratch/i/mm6/multiz10way/penn
    cp -p /cluster/data/mm6/bed/multiz10way/run/oneMultiz.csh .
    /cluster/bin/iSync

    #	back to run the job
    ssh kki
    cd /cluster/data/mm6/bed/multiz10way/run

    #	This tree.nh was used in the distant past for early versions
    #	of phastCons.  Now, this is merely a convenient reference to the
    #	tree under construction.  This is also used to draw a graphic
    #	tree as species.nh, see below.
    cat << '_EOF_' > tree.nh
((((((hg17,(mm6,rn3)),(canFam1,bosTau1)),monDom1),galGal2),xenTro1),(tetNig1,danRer2))
'_EOF_'
    # << this line keeps emacs coloring happy

# allMultiz.csh <chrom>: run the whole progressive alignment for one
# chromosome.  Each oneMultiz.csh call with four arguments aligns two inputs
# and leaves the result under the concatenation of their names, which the
# following calls then use as an input; the last argument (0 or 1) is the
# flag passed on to multiz.  The three-argument call copies the final
# alignment to the maf/ output directory and the one-argument call removes
# the per-chrom work directory.
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
    # multiple alignment steps:
set c = $1
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c hg17 rn3  1
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c bosTau1 canFam1 0
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c bosTau1canFam1 hg17rn3  1
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c monDom1 bosTau1canFam1hg17rn3  1
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c galGal2 monDom1bosTau1canFam1hg17rn3  1
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c xenTro1 galGal2monDom1bosTau1canFam1hg17rn3  1
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c danRer2 tetNig1 0
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c danRer2tetNig1 xenTro1galGal2monDom1bosTau1canFam1hg17rn3  1
# get final alignment file
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c danRer2tetNig1xenTro1galGal2monDom1bosTau1canFam1hg17rn3 /cluster/data/mm6/bed/multiz10way/maf/$c.maf
#cleanup
/iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c
'EOF'
# << keep emacs coloring happy
    chmod +x allMultiz.csh

cat  << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/mm6/bed/multiz10way/maf/$(root1).maf}
#ENDLOOP
'EOF'

    cd /cluster/data/mm6/bed/multiz10way/run
    awk '{print $1}' ../../../chrom.sizes > chrom.lst

    gensub2 chrom.lst single template jobList
    para create jobList
    para try; para check
    para push
# Completed: 40 of 40 jobs
# CPU time in finished jobs:     151565s    2526.08m    42.10h    1.75d  0.005 y
# IO & Wait Time:                 25097s     418.29m     6.97h    0.29d  0.001 y
# Average job time:                4417s      73.61m     1.23h    0.05d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           18222s     303.70m     5.06h    0.21d
# Submission to last job:         18592s     309.87m     5.16h    0.22d

    #	combine results into a single file for loading
    #	it is too large for kksilo, use kolossus
    ssh kolossus
    cd /cluster/data/mm6/bed/multiz10way
    catDir maf | mafFilter stdin -minScore=500 > multiz10way.maf
    #	rejected 1548566 blocks
    #	7m 22s
    #	makes an 8 Gb file:
    #	-rw-rw-r--   1 8443473465 Apr 18 09:57 multiz10way.maf

    #	Create per-chrom individual maf files for downloads
    #		2005-08-02 - Hiram
    ssh kkstore01
    cd /cluster/data/mm6/bed/multiz10way
    mkdir mafDownloads
    for M in maf/chr*.maf
    do
	B=`basename $M`
	echo "cat ${M} | mafFilter stdin -minScore=500 > mafDownloads/${B}"
	cat ${M} | mafFilter stdin -minScore=500 > mafDownloads/${B}
    done
    cd mafDownloads
    gzip chr*.maf
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm6/multiz10way
    ln -s /cluster/data/mm6/bed/multiz10way/mafDownloads/chr*.maf.gz .

    # Load into database  (DONE - 2005-01-04 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way
    mkdir /gbdb/mm6/multiz10way
    ln -s /cluster/data/mm6/bed/multiz10way/multiz10way.maf \
	/gbdb/mm6/multiz10way
    hgLoadMaf mm6 multiz10way
    #	Loaded 6284892 mafs in 1 files from /gbdb/mm6/multiz10way
    #	14 minutes to load
    hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 mm6 \
	multiz10waySummary multiz10way.maf
    # Processed 18067226 components in 6284892 mafs from multiz10way.maf
    #	19m 15s

    # Dropped unused indexes (2006-05-09 kate)
    # NOTE: this is not required in the future, as the loader
    # has been fixed to not generate these indexes
    hgsql mm6 -e "alter table multiz10waySummary drop index chrom_2"
    hgsql mm6 -e "alter table multiz10waySummary drop index chrom_3"

    # create tree image:
    cat << '_EOF_' > species.nh
((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish))
'_EOF_'
    /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
    # photoshop to enhance, reduce the amount of whitespace to make it
    # smaller, then save as jpg
    cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm6_10way.jpg

############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
#		(DONE - 2005-04-18 - 2005-04-20 - Hiram)

# Estimate phastCons parameters
    ssh kksilo
    mkdir /cluster/data/mm6/bed/multiz10way/cons
    cd /cluster/data/mm6/bed/multiz10way/cons

    # Create a starting-tree.mod based on chr1 (the largest one)
    /cluster/bin/phast/msa_split ../maf/chr1.maf \
	--refseq ../../../1/chr1.fa --in-format MAF \
	--windows 100000000,1000 --out-format SS \
	--between-blocks 5000 --out-root s1
    #	5 minutes

    /cluster/bin/phast/phyloFit -i SS s1.*.ss \
	--tree "((((((hg17,(mm6,rn3)),(canFam1,bosTau1)),monDom1),galGal2),xenTro1),(tetNig1,danRer2))" \
    --out-root starting-tree
    #	more than 1h 30m
    rm s1.*.ss
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.403
    #	This 0.403 is used in the --gc argument below

    # Create big bad bloated SS files in bluearc (takes ~45 minutes)
    ssh kksilo
    mkdir -p /cluster/bluearc/mm6/cons/ss
    cd /cluster/bluearc/mm6/cons/ss
    #	this is making over 3000 files in one directory, might be better
    #	to put them into per-chrom hierarchies
    for C in `awk '{print $1}' /cluster/data/mm6/chrom.sizes`
    do
      if [ -s /cluster/data/mm6/bed/multiz10way/maf/${C}.maf ]; then
	echo msa_split $C
	chrN=${C/chr/}
	chrN=${chrN/_random/}
	/cluster/bin/phast/msa_split \
	    /cluster/data/mm6/bed/multiz10way/maf/${C}.maf \
	    --refseq /cluster/data/mm6/${chrN}/${C}.fa \
	    --in-format MAF --windows 1000000,0 --between-blocks 5000 \
	    --out-format SS --out-root ${C}
      fi
    done

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    ls -l | grep -v random | awk '$5 > 4000000 {print $9;}' | \
	randomLines stdin 50 ../randomSs

    # Set up parasol directory to calculate trees on these 50 regions
    ssh kk9
    mkdir /cluster/bluearc/mm6/cons/treeRun1
    cd /cluster/bluearc/mm6/cons/treeRun1
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate 
    # Create little script that calls phastCons with right arguments
    #	--target-coverage of 0.20 is about right for mouse, will be
    #	tuned exactly below
    #	makeTree <ssRoot>: <ssRoot> is an SS file name from ../ss without
    #	the .ss suffix; the phastCons log goes to log/<ssRoot> and the
    #	estimated tree models are written with the prefix tree/<ssRoot>.
    #	The script has no #! line.
    cat > makeTree << '_EOF_'
    /cluster/bin/phast/phastCons ../ss/$1.ss \
      /cluster/data/mm6/bed/multiz10way/cons/starting-tree.mod \
      --gc 0.403 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-lengths 12 --target-coverage 0.17 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #	emacs happy
    chmod a+x makeTree

    # Create gensub file
    cat > template << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs:      83332s    1388.87m    23.15h    0.96d  0.003 y
# IO & Wait Time:                   429s       7.15m     0.12h    0.00d  0.000 y
# Average job time:                1675s      27.92m     0.47h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3533s      58.88m     0.98h    0.04d
# Submission to last job:          3543s      59.05m     0.98h    0.04d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ls tree/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
	--output-average ../ave.cons.mod > cons_summary.txt
    ls tree/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
	--output-average ../ave.noncons.mod > noncons_summary.txt
    cd ..
    cp -p ave.*.mod /cluster/data/mm6/bed/multiz10way/cons

    #	measuring entropy
    #	consEntropy <target coverage> <expected lengths>
    #		 ave.cons.mod ave.noncons.mod --NH 9.78
    #	never stops with the --NH argument
    /cluster/bin/phast/consEntropy .10 12 \
                        ave.cons.mod ave.noncons.mod
#Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.454874 bits/site
# Required length: N=7.596943 sites
# Total entropy: NH=11.052595 bits

# consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1
# Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833
# Relative entropy: H=1.454874 bits/site
# Required length: N=6.629337 sites
# Total entropy: NH=9.644850 bits

# consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2
# Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.527815 bits/site
# Required length: N=7.205526 sites
# Total entropy: NH=11.008713 bits

# consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3
# Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250
# Relative entropy: H=1.654878 bits/site
# Required length: N=5.146793 sites
# Total entropy: NH=8.517313 bits

### !!! ***  This one with .17 and 12 is the one that was finally used
# consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
# Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.478838 bits/site
# Required length: N=6.753382 sites
# Total entropy: NH=9.987159 bits


    ssh kk9
    # Create cluster dir to do main phastCons run
    mkdir  /cluster/bluearc/mm6/cons/consRun4
    cd  /cluster/bluearc/mm6/cons/consRun4
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
    #	This job is I/O intensive in its output files.  To make this
    #	cluster safe, it would be better to do this work somewhere over
    #	in /tmp/... and copy the final result back.  kk9 can do this
    #	run, but kk cannot.
    #	doPhast <ssRoot> <chrom>: <ssRoot> is an SS file name from ../ss
    #	without the .ss suffix, <chrom> is the chromosome name used for
    #	--seqname/--idpref.  The conserved elements go to bed/<ssRoot>.bed
    #	and the scores to ppRaw/<chrom>/<ssRoot>.pp, both written directly
    #	into this run directory.
    cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
   --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \
   --idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
   ppRaw/$2/$1.pp
'_EOF_'
    # emacs happy
    chmod a+x doPhast

    # Create gsub file
    cat > template << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Create parasol batch and run it
    ls -1 ../ss | sed 's/.ss//' > in.lst
    gensub2 in.lst single template jobList
    para create jobList
    para try/check/push/etc.
# Completed: 3098 of 3098 jobs
# CPU time in finished jobs:      28179s     469.65m     7.83h    0.33d  0.001 y
# IO & Wait Time:                204688s    3411.47m    56.86h    2.37d  0.006 y
# Average job time:                  75s       1.25m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             150s       2.50m     0.04h    0.00d
# Submission to last job:          2569s      42.82m     0.71h    0.03d

    # combine predictions and transform scores to be in 0-1000 interval
    #	it uses a lot of memory, so on kolossus:
    ssh kolossus
    cd /cluster/bluearc/mm6/cons/consRun4
    catDir bed | awk '
    	{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
        /cluster/bin/scripts/lodToBedScore /dev/stdin > \
	/cluster/data/mm6/bed/multiz10way/mostConserved.bed
    #	~ 1 minute

    # Figure out how much is actually covered by the bed files as so:
    ssh kkstore01
    cd /cluster/data/mm6/bed/multiz10way
    awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/2597150151\n",100.0*sum/2597150151,sum}' \
	mostConserved.bed
    #	-target-coverage 0.17: % 5.40 = 100.0*140350815/2597150151 length 12
    #	-target-coverage 0.10: % 5.26 = 100.0*136494994/2597150151
    #	-target-coverage 0.15: % 7.34 = 100.0*190616745/2597150151
    #	-target-coverage 0.20: % 7.86 = 100.0*204262705/2597150151
    #	I was mistakenly reading the results below as, for example %2.14
    #	when in reality it was %21.4 - way way much too high.
    #	-target-coverage 0.50: 554319262/2597150151 = 0.214 - no complaints
    #	-target-coverage 0.52: 619851159/2597150151 = 0.239 - no complaints
    #	-target-coverage 0.53: 655016636/2597150151 = 0.252 - seven complaints
    #	-target-coverage 0.55: 729540911/2597150151 = 0.281 - many negative scores
    #	-target-coverage 0.60: 928959674/2597150151 = 0.358 - many negative scores
    #	the non-n genome size, from faSize on all chroms: 2597150151

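    #	Given the above measurements, we are using the 0.52 target
    #	coverage run.
    #	NOTE: this statement appears to be left over from the misread
    #	0.50-0.60 results; the doPhast script above and the consEntropy
    #	entry marked "finally used" both show 0.17 with expected length 12.

    # If the covered bases divided by the non-n genome size (2597150151)
    # aren't around 4%, then do it again, adjusting the target-coverage
    # phastCons parameter.  Beware of negative scores when too high.
    # lodToBedScore will output an error on any negative scores.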
    # -target-coverage 0.17 and expected lengths 12:
    featureBits mm6 -enrichment refGene:cds mostConserved.bed
# refGene:cds 0.980%, mostConserved.bed 5.404%, both 0.679%, cover 69.24%, enrich 12.81x

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way
    hgLoadBed mm6 phastConsElements mostConserved.bed
    #	Loaded 2291164 elements of size 5
    #	5 minute load time
    featureBits mm6 -enrichment refGene:cds phastConsElements
# refGene:cds 0.980%, phastConsElements 5.256%, both 0.688%, cover 70.18%,
#	enrich 13.35x

    # Create merged posterior probability file and wiggle track data files
    ssh kksilo
    cd /cluster/bluearc/mm6/cons/consRun4
    # interesting sort here on the chr name and position.
    # first convert all . and - characters to special strings x/ and x_/
    #	to get a consistent delimiter of / for all fields to be sorted.
    #	Then do the sort on the chrom name and the start position, after
    #	the sort convert the special strings x_/ and x/ back to - and .
    #	respectively.  This gets everything in order by chrom name and
    #	chrom start.
    find ./ppRaw -type f | sed -e "s#\.#x/#g; s#-#x_/#g" | \
	sort -t"/" -k4,4 -k6,6n | sed -e "s#x_/#-#g; s#x/#.#g" | xargs cat | \
	    wigEncode stdin phastCons10.wig phastCons10.wib
    # about 45 minutes for above

    ssh kkstore01
    cd /cluster/bluearc/mm6/cons/consRun4
    cp -p phastCons10.wi? /cluster/data/mm6/bed/multiz10way/cons
    # 2m 30s copy on kkstore01

    #	prepare compressed copy of ascii data values for downloads
    cd /cluster/bluearc/mm6/cons/consRun4
    #	gzipAscii.sh: for each chrom directory under ppRaw, gzip its *.pp
    #	files in chromStart order into phastCons10Scores/<chrom>.gz.
    #	The script text is plain ascii, so it is written with cat (zcat
    #	cannot read an uncompressed here-document).
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons10Scores

ls ppRaw | while read D
do
    out=${TOP}/phastCons10Scores/${D}.gz
    echo -n "$out ... "
    cd ${TOP}/ppRaw/${D}
    # temporarily rewrite "-" as ".x-x." so that the start coordinate is its
    # own "."-delimited field for the numeric sort, then restore the names
    gzip -c `ls *.pp  | sed -e "s#-#.x-x.#g;" | \
        sort -t"." -k1,1 -k2,2n | sed -e "s#.x-x.#-#g;"` > ${out}
    echo "done"
    # no exit here: a leftover "exit 255" used to stop the loop after the
    # first chrom directory
done
'_EOF_'
    #	happy emacs
    chmod +x gzipAscii.sh
    time ./gzipAscii.sh
    #	takes about 40 minutes, makes 2.8 Gb of data
    #	copy them for downloads
    ssh kkstore01
    mkdir /cluster/data/mm6/bed/multiz10way/phastCons10Scores
    cd /cluster/data/mm6/bed/multiz10way/phastCons10Scores
    rsync -a --progress /cluster/bluearc/mm6/cons/consRun4/phastCons10Scores/ .
    #	3 minute copy

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores
    cd /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores
    ln -s /cluster/data/mm6/bed/multiz10way/phastCons10Scores/*.gz .

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way/cons
    ln -s `pwd`/phastCons10.wib /gbdb/mm6/wib/phastCons10.wib
    hgLoadWiggle mm6 phastCons10 phastCons10.wig
    #  ~ 3 minute load

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/mm6/bed/multiz10way/cons
    time hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=mm6 phastCons10 > histogram.data 2>&1
    #	about 23 minutes to scan all data

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm6 Histogram phastCons10 track"
set xlabel " phastCons10 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	happy emacs

    display histo.png &

############################################################################
# BUILD KNOWN GENES TABLES (Started 3/19/05, done 4/13/05. Fan)

# First build protein databases, sp050315 and proteins050315
# See makeProteins050315.doc for details.

# Create working subdirectories and temporary databases

  ssh hgwdev
  cd /cluster/store10/kg
  mkdir kgMm6A  
  ln -s /cluster/store10/kg/kgMm6A /cluster/store6/kgDB/bed/kgMm6A
  ln -s /cluster/store10/kg/kgMm6A /cluster/data/mm6/bed/kgMm6A
   
  hgsql mm6 -e "create database kgMm6ATemp"

  mkdir /cluster/bluearc/kgDB/kgMm6A
  mkdir /cluster/bluearc/kgDB/kgMm6A/protBlat
  ln -s /cluster/bluearc/kgDB/kgMm6A/protBlat /cluster/store10/kg/kgMm6A/protBlat
  cd /cluster/store10/kg/kgMm6A/protBlat

# Get all mouse protein sequences

  hgsql -N sp050315 -e \
  'select proteins050315.spXref3.accession,protein.val from proteins050315.spXref3,protein where division="10090" and acc=accession' \
  |awk '{print ">" $1;print $2}' >mouseProt.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh kk
  cd /cluster/data/mm6/bed/kgMm6A/protBlat
  mkdir prot
  faSplit sequence mouseProt.fa 1000 prot/prot

  ls /cluster/bluearc/kgDB/kgMm6A/protBlat/prot/* > prot.lis
  hgsql mm6 -N -e 'select chrom from chromInfo'   > chrom.lis

  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /panasas/store/mm6/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis prot.lis gsub jobList

  para create jobList
  para check
  para push
  para check ...

# This cluster run takes about two days.  Crashed jobs are due to empty BLAT result.  It is OK.
Completed: 31081 of 39600 jobs
Crashed: 8519 jobs
CPU time in finished jobs:   28671747s  477862.45m  7964.37h  331.85d  0.909 y
IO & Wait Time:               1469964s   24499.40m   408.32h   17.01d  0.047 y
Average job time:                 970s      16.16m     0.27h    0.01d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:           39632s     660.53m    11.01h    0.46d
Submission to last job:        124276s    2071.27m    34.52h    1.44d

# collect BLAT results

   ssh hgwdev
   cd /cluster/data/mm6/bed/kgMm6A/protBlat

   mkdir result2
   mkdir result3

   cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall

   # do1.1: gather the per-protein-chunk BLAT results of one chromosome ($1)
   # from result/ into the single file result2/$1.psl
   cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

   # do1.2: run pslReps on result2/$1.psl and write the filtered alignments
   # to result3/$1.psl (result3 is what gets concatenated into protBlat.psl)
   cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

   chmod +x do*

   cp do1.1 do1
   doall
   cp do1.2 do1
   doall

   cat result3/*.psl >protBlat.psl
   hgLoadPsl mm6 protBlat.psl

# Remember to remove result2 and result3 when KG is built and validated.

   cd /cluster/data/mm6/bed/kgMm6A

# create all_mrna.psl and tight_mrna.psl
   hgsql mm6 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
           all_mrna.psl tight_mrna.psl /dev/null

# Use overlapSelect to get protein and mRNA alignment overlaps   
   overlapSelect  -statsOutput  -dropped=protOut.psl -overlapThreshold=0.90 \
   -selectFmt=psl -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.stat

   overlapSelect  -mergeOutput  -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
   -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
   cut -f 10,31 protMrna.out|sort -u >spMrna.tab
   cut -f 10    protMrna.out|sort -u >protein.lis

# Load spMrna.tab into spMrna table in temp DB.
   hgsql kgMm6ATemp < ~/src/hg/lib/spMrna.sql
   hgsql kgMm6ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
   hgsql kgMm6ATemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm6 \
   -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgMm6ATemp DB.

   hgFaToTab mrna.fa mrnaSeq.tab

   hgsql kgMm6ATemp <~/src/hg/lib/mrnaSeq.sql
   # single quotes on the outside so that the double quotes around the
   # file name are passed through to MySQL
   hgsql kgMm6ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'

# Prepare files for cluster run
   ~/src/hg/protein/KG2.sh kgMm6A mm6 050315

# Perform cluster run of protein/mRNA alignment
   ~/src/hg/protein/KG3.sh kgMm6A mm6 050315

# Collect cluster run results
   cd kgBestMrna

   ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments
   pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
   cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
   wc protMrna.lis

# Load BLAT results into temp DB.
   hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaBlat.sql
   hgsql kgMm6ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
   hgsql kgMm6ATemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgMm6ATemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cut -f 22-30 ../protMrna.out > j1.tmp
   cut -f 32-42 ../protMrna.out > j2.tmp
   cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

# Prepare refGene and all_mrna gp files.

   cd ..
   hgsql mm6 -N -e 'select * from refGene' >ref.gp

   hgsql mm6 -N -e \
   'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

# Align proteins to RefSeq.

   overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp ref.stat
   overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp protRef.gp

   overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.90 -inFmt=psl\
   -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out

   cut -f 10,22 protRef.out | sort -u >spRef.tab
   cut -f 10 protRef.out    | sort -u >protRef.lis

   hgsql kgMm6ATemp <~/src/hg/lib/spRef.sql
   hgsql kgMm6ATemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments

   ~/src/hg/protein/KGRef2.sh kgMm6A mm6 050315
   ~/src/hg/protein/KGRef3.sh kgMm6A mm6 050315

   cd kgBestRef
   ls out | sed -e 's/prot/do1 prot/g' >doall

   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments.
   pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
   cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
   wc protRef.lis

   hgsql kgMm6ATemp < ~/src/hg/lib/protRefBlat.sql
   hgsql kgMm6ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
   hgsql kgMm6ATemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries

   cat ref.gp protMrna.gp all_mrna.gp >kgCandidate0.gp
   gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/store10/mm6/nib kgCandidate0.gp kgCandidate0.check

   hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate0.sql
   hgsql kgMm6ATemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm6ATemp < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm6ATemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to filter out invalid gene candidates

   kgCheck kgMm6ATemp mm6 kgCandidate.tab
   hgsql kgMm6ATemp -e  'drop table kgCandidate'
   hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm6ATemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'

# Update and clean up kgResultBestMrna2.c and then check it in.

# Score protein/mRna and protein/RefSeq alignments

   # both scoring programs take the same three arguments here: the
   # protein database date (050315 throughout this build), the temp DB
   # and the genome DB
   kgResultBestMrna2 050315 kgMm6ATemp mm6|sort -u >protMrnaBlatScore.tab
   kgResultBestRef2  050315 kgMm6ATemp mm6|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm6ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm6ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgSelect to select highest scoring mRNA or RefSeq for each protein.
 
   kgSelect kgMm6ATemp kgCandidate2.gp
   hgsql kgMm6ATemp -e  'drop table kgCandidate2'
   hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate2.sql
   hgsql kgMm6ATemp -e 'load data local infile "kgCandidate2.gp" into table kgCandidate2'

# Create sorted file to get entries with identical CDS regions group together. 
   hgsql kgMm6ATemp -N -e \
   'select name,chrom,cdsStart,cdsEnd,score,proteinID from kgCandidate2,protMrnaScore where proteinID=protAcc and name=mrnaAcc order by name,cdsStart,cdsEnd,score desc,proteinID' \
   >kgSorted.tab

# Run kgUniq to pick the top mRNA/RefSeq with highest score for each CDS structure.

   kgUniq kgMm6ATemp sp050315 kgSorted.tab knownGene.gp dupSpMrna.tab

   hgsql mm6 -e  'drop table dupSpMrna'
   hgsql mm6 <~/src/hg/lib/dupSpMrna.sql
   hgsql mm6 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Build mrnaRefseq table first before loading knownGene table

   cd /cluster/store10/entrez
   mkdir 050401
   ln -s /cluster/store10/entrez/050401 /cluster/data/entrez/050401
   cd /cluster/data/entrez/050401

   # fetch the three Entrez Gene tables; --timestamping skips a file whose
   # local copy is already up to date
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
   gzip -d *.gz

   cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
   cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
   cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
   
   hgsql entrez -e 'drop table entrezRefseq'
   hgsql entrez -e 'drop table entrezMrna'
   hgsql entrez -e 'drop table entrezRefProt'

   hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
   hgsql entrez < ~/src/hg/lib/entrezMrna.sql
   hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

   hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
   hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
   hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

   hgsql entrez -N -e \
   'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
   >mrnaRefseq.tab

   hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql
   hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Sort knownGene table
#	Change to the KG build directory first: the load below reads
#	sortedKnownGene.gp from there, so the sort must write it there
#	(the previous step left us in /cluster/data/entrez/050401).
   cd /cluster/data/kgDB/bed/kgMm6A
   ~/kent/src/hg/protein/sortKg.pl knownGene.gp > sortedKnownGene.gp

# Load knownGene table
   hgsql mm6 -e  'drop table knownGene'
   hgsql mm6 <~/src/hg/lib/knownGene.sql
   hgsql mm6 -e 'load data local infile "sortedKnownGene.gp" into table knownGene'

# Build kgXref table

   kgXref2 mm6 proteins050315 mm6

   hgsql mm6 -e  'drop table kgXref'
   hgsql mm6 <~/src/hg/lib/kgXref.sql
   hgsql mm6 -e 'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table

   hgsql mm6 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab

   hgsql mm6 -e  'drop table spMrna'
   hgsql mm6 <~/src/hg/lib/spMrna.sql
   hgsql mm6 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build knownGenePep table

   hgsql mm6 -N -e \
   'select name, protein.val from knownGene, sp050315.displayId, sp050315.protein where proteinID=displayId.val and displayId.acc=protein.acc' \
   >knownGenePep.tab

   hgsql mm6 -e  'drop table knownGenePep'
   hgsql mm6 <~/src/hg/lib/knownGenePep.sql
   hgsql mm6 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build knownGeneMrna table

   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm6 \
   -gbRoot=/cluster/data/genbank refseq mrna stdout \
   | faToTab stdin refseqSeq.tab

   hgsql kgMm6ATemp -e "drop table refseqSeq"
   hgsql kgMm6ATemp <~/src/hg/lib/refseqSeq.sql
   hgsql kgMm6ATemp -e 'load data local infile "refseqSeq.tab" into table refseqSeq'

   hgsql kgMm6ATemp -N -e \
   'select knownGene.name, seq from refseqSeq, mm6.knownGene where knownGene.name=refseqSeq.name'\
   >j1.tmp

   hgsql kgMm6ATemp -N -e \
   'select knownGene.name, seq from mrnaSeq, mm6.knownGene where knownGene.name=mrnaSeq.name' \
   >j2.tmp
   cat j1.tmp j2.tmp >knownGeneMrna.tab
   rm j1.tmp j2.tmp

   hgsql mm6 -e "drop table mm6.knownGeneMrna"
   hgsql mm6 <~/src/hg/lib/knownGeneMrna.sql
   hgsql mm6 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'

# Build KEGG pathway tables

   ~/src/hg/protein/KGpath.sh kgMm6A mm6 050315

   hgsql kgMm6ATemp -e "drop table keggList"
   hgsql kgMm6ATemp <~/src/hg/lib/keggList.sql
   hgsql kgMm6ATemp -e 'load data local infile "keggList.tab" into table keggList'

   hgsql mm6 -e "drop table keggMapDesc"
   hgsql mm6 -e "drop table keggPathway"
   hgsql mm6 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm6 <~/src/hg/lib/keggPathway.sql
   hgsql mm6 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm6 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables

   ~/src/hg/protein/KGcgap.sh kgMm6A mm6 050315

   hgsql sp050315 -N -e \
   'select name, gene.val from mm6.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \
   | sort -u >kgAliasP.tab

# Build alias tables

#	kgAliasM reads from proteins050315.hugo.symbol, proteins050315.hugo.aliases
#	proteins050315.hugo.withdraws, mm6.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   kgAliasM mm6 proteins050315

#	kgAliasKgXref reads from mm6.knownGene.proteinID,
#	mm6.knownGene.name, mm6.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm6


#	kgAliasRefseq reads from mm6.knownGene.name,
#	mm6.knownGene.proteinID, mm6.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm6

   hgsql sp050315 -N -e \
   'select name, gene.val from mm6.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \
   | sort -u >kgAliasP.tab

   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab | \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm6 
   hgsql mm6 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm6 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm6.knownGene.name,
#	mm6.knownGene.proteinID, mm6.knownGene.alignID,
#	proteins050315.spXref3.accession, proteins050315.spSecondaryID, proteins050315.pdbSP.pdb
#	to create kgProtAlias.tab
#

   kgProtAlias mm6 050315

   hgsql mm6 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

    cat kgProtAliasNCBI.tab kgProtAlias.tab | sort | uniq > kgProtAliasBoth.tab
    rm kgProtAliasNCBI.tab kgProtAlias.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm6 -e "drop table kgProtAlias;"
    hgsql mm6 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm6 -e 'LOAD DATA local INFILE "kgProtAliasBoth.tab" into table kgProtAlias;'  


# MAKING FOLDUTR TABLES (DONE 2005-04-21, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    mkdir -p /cluster/data/mm6/bed/rnaStruct
    cd /cluster/data/mm6/bed/rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm6 knownGene utr3 utr3/utr.fa
    utrFa mm6 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm6/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push

Completed: 19325 of 19325 jobs
CPU time in finished jobs:     421619s    7026.99m   117.12h    4.88d  0.013 y
IO & Wait Time:                 87355s    1455.91m    24.27h    1.01d  0.003 y
Average job time:                  26s       0.44m     0.01h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            7988s     133.13m     2.22h    0.09d
Submission to last job:          8644s     144.07m     2.40h    0.10d

# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push

Completed: 18053 of 18053 jobs
CPU time in finished jobs:      25309s     421.82m     7.03h    0.29d  0.001 y
IO & Wait Time:                 53547s     892.45m    14.87h    0.62d  0.002 y
Average job time:                   4s       0.07m     0.00h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             314s       5.23m     0.09h    0.00d
Submission to last job:           565s       9.42m     0.16h    0.01d

# Load database
    ssh hgwdev
    cd /cluster/data/mm6/bed/rnaStruct/utr5
    hgLoadRnaFold mm6 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm6 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak


# Update default Browser position
# bring up mySQL on genome-testdb and use hgcentraltest DB:

   update dbDb set defaultPos="chr6:29107216-29120872" where name="mm6";

# Create QA Push Queue entry with the following tables:
# cgapAlias
# cgapBiocDesc
# cgapBiocPathway
# dupSpMrna
# keggMapDesc
# keggPathway
# kgAlias
# kgProtAlias
# kgXref
# knownGene
# knownGeneMrna
# knownGenePep
# mrnaRefseq
# spMrna
# foldUtr3
# foldUtr5

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as 
# the kgAlias and kgProtAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/data/mm6/bed/kgMm6B
     mkdir index
     cd index
     hgKgGetText mm6 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/data/mm6/bed/kgMm6B/index/knownGene.ix /gbdb/mm6/knownGene.ix
     ln -s /cluster/data/mm6/bed/kgMm6B/index/knownGene.ixx /gbdb/mm6/knownGene.ixx


############################################################################

# CYTOBAND TRACK (DONE - 2005-04-14 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm6/cytoBand
    cd /cluster/data/mm6/cytoBand
    # Get file from NCBI
    #	This one was a special delivery before it reached the usual
    #	location in mapviewer
    WGETRC=/cluster/data/mm6/ncbi/.wgetrc
    export WGETRC
    wget --timestamping ftp://ftp-private.ncbi.nih.gov/mouse_34/ideogram.gz
    #	I'm guessing when the mapview is done, this file would be here:
    # wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.34/ideogram.gz
    gunzip ideogram
    # Create bed file
    /cluster/bin/scripts/createNcbiCytoBand ideogram
    # Load the bed file
    hgLoadBed -noBin \
	-sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm6 \
	cytoBand cytoBand.bed
    # Make cytoBandIdeo track for ideogram gif on hgTracks page.
    # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
    hgsql mm6 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"

############################################################################

#  BLATSERVERS ENTRY (DONE - 2005-04-11 - Ali)
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm6", "blat6", "17778", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm6", "blat7", "17782", "0", "0");' \
	    hgcentraltest

    #	the untrans server is running with -stepSize=5, so:
    hgsql hgcentraltest \
	-e 'update blatServers set canPcr=1 where db="mm6" and port=17782;'

############################################################################

### MAKE THE affyU74 TRACK - needed for the Gene Sorter 
#                              (DONE - 2005-04-14 - Fan)
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
# target sequences. Recalculate alignments and load data
----------------------------------
# Load up semi-local disk with target sequences for Affy mouse U74 chips.
# ssh kkr1u00
# mkdir -p /iscratch/i/affy
#	This /projects filesystem is not available on kkr1u00
#	but it is on kk
# ssh kk
# cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy

ssh kkr1u00
iSync

# Run cluster job to do alignments
ssh kk
mkdir /cluster/data/mm6/bed/affyU74.2005-04-14
cd /cluster/data/mm6/bed/affyU74.2005-04-14
mkdir run
cd run
mkdir psl
#echo /scratch/mus/mm6/maskedContigs/*.fa | wordLine stdin > genome.lst
echo /panasas/store/mm6/nib/*.nib | wordLine stdin > genome.lst
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
# gensub2 template: one blat job per (genome nib, Affy consensus fasta) pair,
# output to psl/<root1>_<root2>.psl
# NOTE(review): this run uses -ooc=/scratch/hg/h/11.ooc while the GNF1M and
# MOE430 runs later in this file use /scratch/hg/h/mouse11.ooc - confirm
# which overused-tile file was intended for mouse.
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy

gensub2 genome.lst affy.lst gsub jobList
para create jobList
para try
# do usual para check/para push etc. until the job is done. 
# Completed: 120 of 120 jobs
# CPU time in finished jobs:       7197s     119.94m     2.00h    0.08d  0.000 y
# IO & Wait Time:                  1047s      17.46m     0.29h    0.01d  0.000 y
# Average job time:                  69s       1.15m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             208s       3.47m     0.06h    0.00d
# Submission to last job:           751s      12.52m     0.21h    0.01d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
ssh kk
cd /cluster/data/mm6/bed/affyU74.2005-04-14/run
pslSort dirs raw.psl tmp psl

# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
#pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null

# Sort by chromosome and load into database.
ssh hgwdev
cd /cluster/data/mm6/bed/affyU74.2005-04-14
pslSortAcc nohead chrom temp all_affyU74.psl
cat chrom/*.psl > affyU74.psl
# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
hgLoadPsl mm6 affyU74.psl
# rm -fr chrom temp run

##   MAKE THE affyGnfU74 TRACKs (DONE - 2005-04-14 - Fan)
# Make bed files and load consensus sequences for Affy U74 chip set.
# Fix broken symlinks to microarray data after directory structure changed
# (DONE, 2005-05-03, hartera)
----------------------------------
#This needs to be done after affyU74 is already made.
ssh hgwdev
mkdir -p /cluster/data/mm6/bed/affyGnf.2005-04-14
cd /cluster/data/mm6/bed/affyGnf.2005-04-14
#	may need to build this command in src/hg/affyGnf
affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2

# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
mkdir sav
cp *.bed sav -p
cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed
cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed
cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed

# and reload data into table
hgLoadBed mm6 affyGnfU74A affyGnfU74A.bed
hgLoadBed mm6 affyGnfU74B affyGnfU74B.bed
hgLoadBed mm6 affyGnfU74C affyGnfU74C.bed

# Add in sequence data for U74 tracks.
# Copy consensus sequence to /gbdb if it isn't already
# [THE SYM LINKS WERE ALREADY DONE.]
#    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    # fix broken symlinks after directory structure changed
    # /projects/compbiodata ----> /projects/compbio/data
    rm U74*
    # make correct symlinks (hartera, 2005-05-03)
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .

    # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
    # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
    # reload sequences with prefix removed so acc matches name used in
    # other dependent tables
                                                    
    hgLoadSeq -abbr=U74Av2: mm6 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
    hgLoadSeq -abbr=U74Bv2: mm6 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
    hgLoadSeq -abbr=U74Cv2: mm6 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa

### GNF ATLAS 2  [DONE Fan 2005-04-14]
    # Align probes from GNF1M chip.
    ssh kk
    cd /cluster/data/mm6/bed
    mkdir -p geneAtlas2/run/psl
    cd geneAtlas2/run
    #mkdir -p /cluster/bluearc/geneAtlas2
    #cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2
    #ls -1 /scratch/mus/mm6/maskedContigs/ > genome.lst
    echo /panasas/store/mm6/nib/*.nib | wordLine stdin > genome.lst

    ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
    echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    para time

# Completed: 40 of 40 jobs
# CPU time in finished jobs:      56570s     942.84m    15.71h    0.65d  0.002 y
# IO & Wait Time:                   392s       6.53m     0.11h    0.00d  0.000 y
# Average job time:                1424s      23.73m     0.40h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3979s      66.32m     1.11h    0.05d
# Submission to last job:          3993s      66.55m     1.11h    0.05d


    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyGnf1m.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null

    #rm -r contig.psl raw.psl psl

    # Load probes and alignments from GNF1M into database.
    ssh hgwdev
    cd /cluster/data/mm6/bed/geneAtlas2
#    ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
    hgLoadPsl mm6 affyGnf1m.psl
    hgLoadSeq mm6 /gbdb/hgFixed/affyProbes/gnf1m.fa

    # Load up track
    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
    	affyGnf1m.psl
    # Note that the unmapped 5000 records are from all-N sequences.
    hgLoadBed mm6 gnfAtlas2 gnfAtlas2.bed

# MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2005-04-14, Fan)
#    mkdir -p /projects/compbio/data/microarray/affyMouse
    # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
#    unzip MOE430*_consensus.zip

    # check for duplicate probes: there are none, all have unique names
    # check for duplicate probes: 100 from 136745_at to 1367551_a_at
    # remove "consensus:" and ";" from FASTA headers to shorten probeset
    # names for database

#    sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
#    sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
 
#    cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
#       /cluster/bluearc/affy/

    # THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04.

    # Set up cluster job to align MOE430 consensus sequences to mm6
    ssh kkr1u00
    cd /cluster/data/mm6/bed
    mkdir -p affyMOE430
    cd affyMOE430
#    mkdir -p /iscratch/i/affy
#    cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
#    iSync

    ssh kk
    cd /cluster/data/mm6/bed/affyMOE430
    ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
    echo /panasas/store/mm6/nib/*.nib | wordLine stdin > genome.lst

    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    # Actually do the job with usual para try/check/push/time etc.
# Completed: 40 of 40 jobs
# CPU time in finished jobs:       9414s     156.90m     2.61h    0.11d  0.000 y
# IO & Wait Time:                   281s       4.69m     0.08h    0.00d  0.000 y
# Average job time:                 242s       4.04m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             597s       9.95m     0.17h    0.01d
# Submission to last job:           657s      10.95m     0.18h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyMOE430.psl
    pslSort dirs raw.psl tmp psl

    # only use alignments that cover 30% of sequence and have at least
    # 95% identity in aligned region. 
    # low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null

    # Load alignments and sequences into database
    ssh hgwdev
    cd /cluster/data/mm6/bed/affyMOE430
    # shorten names in psl file
    sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
    mv affyMOE430.psl.bak affyMOE430.psl

    # load track into database

    hgLoadPsl mm6 affyMOE430.psl
 
    # Add consensus sequences for MOE430
    # Copy sequences to gbdb if they are not there already
#    mkdir -p /gbdb/hgFixed/affyProbes
#    ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ 
#       /gbdb/hgFixed/affyProbes

    hgLoadSeq -abbr=MOE430 mm6 /gbdb/hgFixed/affyProbes/MOE430_all.fa
    
    # Clean up
#    rm batch.bak contig.psl raw.psl 
    
    # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
    # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
    # add affyMOE430.html file and then do make alpha to add to trackDb table

######## MAKING GENE SORTER TABLES #######  (STARTED - 2005-04-15, DONE 4/18/05 - Fan)
# These are instructions for building the
# Gene Sorter.  Don't start these until
# there is a knownGene track and the affy tracks

# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /tmp
hgClusterGenes mm6 knownGene knownIsoforms knownCanonical
#	You may need to build this binary in src/hg/near/hgClusterGenes
#	Got 24603 clusters, from 41208 genes in 43 chromosomes
#	featureBits mm6 knownCanonical
# 	686054706 bases of 2597150411 (26.416%) in intersection
#	featureBits mm5 knownCanonical
#	853516995 bases of 2615483787 (32.633%) in intersection
#	featureBits mm4 knownCanonical
#	840021165 bases of 2627444668 (31.971%) in intersection
#	featureBits mm3 knownCanonical
#	825943052 bases of 2505900260 (32.960%) in intersection
#	! ! ! Can not do featureBits on knownIsoforms

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
ssh hgwdev
mkdir -p  /cluster/data/mm6/bed/geneSorter/blastp
cd /cluster/data/mm6/bed/geneSorter/blastp
pepPredToFa mm6 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known

# Copy over database to panasas scratch
mkdir /cluster/panasas/home/store/mm6/blastp
cp -p /cluster/data/mm6/bed/geneSorter/blastp/known.* \
/cluster/panasas/home/store/mm6/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm6/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm6/bed/geneSorter/blastp/self
cd /cluster/data/mm6/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
	-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...

# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      31525s     525.42m     8.76h    0.36d  0.001 y
# IO & Wait Time:                 34031s     567.18m     9.45h    0.39d  0.001 y
# Average job time:                   8s       0.14m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              61s       1.02m     0.02h    0.00d
# Submission to last job:           142s       2.37m     0.04h    0.00d

# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/mm6/bed/geneSorter/blastp/self/run/out
hgLoadBlastTab mm6 knownBlastTab *.tab
# Scanning through 7715 files
# Loading database with 1972005 rows

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes an hour.)
# DONE (05-04-15 Fan)

hgMapToGene mm6 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm6 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m

# Create table that maps between known genes and RefSeq
hgMapToGene mm6 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene mm6 affyU74  knownGene knownToU74
hgMapToGene mm6 affyMOE430 knownGene knownToMOE430
hgMapToGene mm6 affyMOE430 -prefix=A: knownGene knownToMOE430A

# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm6/bed/rinnSex
cd !$
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
    ../affyMOE430/affyMOE430.psl
hgLoadBed mm6 rinnSex rinnSex.bed

# Format and load the GNF data
mkdir /cluster/data/mm6/bed/affyGnf95
cd /cluster/data/mm6/bed/affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl \
	/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
	affyGnfU95.tab affyGnfU95Exps.tab -shortOut

#	this .sql load was in preceding instructions, but this .sql file
#	appears to not exist and it doesn't seem to be needed anyway.
#	Everything below this seems to create tables OK.
#  hgsql mm6 < ~/kent/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.  These commands take about 15 minutes each
#	The affyGnfU74?Exps arguments appear to be unused in 
# hgExpDistance
hgExpDistance mm6 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
# Got 7720 unique elements in affyGnfU74A
hgExpDistance mm6 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
# Got 4619 unique elements in affyGnfU74B
hgExpDistance mm6 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
# Got 1406 unique elements in affyGnfU74C

# C.ELEGANS BLASTP FOR GENE SORTER (DONE 4/15/05 Fan)
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh eieio
    mkdir /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version.  Then use that in place of 142 below.
    wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
    formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
    ssh kkr1u00
    if (-e /iscratch/i/ce2/blastp) then
      rm -r /iscratch/i/ce2/blastp
    endif
    mkdir -p /iscratch/i/ce2/blastp
    cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/ce2/run/out
    cd /cluster/data/mm6/bed/blastp/ce2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      29337s     488.96m     8.15h    0.34d  0.001 y
# IO & Wait Time:                 24651s     410.84m     6.85h    0.29d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              40s       0.67m     0.01h    0.00d
# Submission to last job:           206s       3.43m     0.06h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastp/ce2/run/out
    hgLoadBlastTab mm6 ceBlastTab -maxPer=1 *.tab

# HUMAN BLASTP FOR GENE SORTER (DONE 4/18/05 Fan)
    # Make human ortholog column using blastp on human known genes.
    # First make human protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/hg17/bed/blastp
    cd /cluster/data/hg17/bed/blastp
    pepPredToFa hg17 knownGenePep known.faa
    formatdb -i known.faa -t known -n known
# PLEASE NOTE: hg17B IS USED INSTEAD OF hg17 UNDER /iscratch/i
# TO WORK AROUND A SUBDIRECTORY ACCESS-RIGHTS PROBLEM.

    ssh kkr1u00
    if (-e /iscratch/i/hg17B/blastp) then
      rm -r /iscratch/i/hg17B/blastp
    endif
    mkdir -p /iscratch/i/hg17B/blastp
    cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17B/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/hg17/run/out
    cd /cluster/data/mm6/bed/blastp/hg17/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17B/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      67090s    1118.17m    18.64h    0.78d  0.002 y
# IO & Wait Time:                 22543s     375.72m     6.26h    0.26d  0.001 y
# Average job time:                  12s       0.19m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              83s       1.38m     0.02h    0.00d
# Submission to last job:           213s       3.55m     0.06h    0.00d
 
    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastp/hg17/run/out
    hgLoadBlastTab mm6 hgBlastTab -maxPer=1 *.tab

# ZEBRAFISH BLASTP FOR GENE SORTER (DONE 4/15/05 Fan)
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh kkstore
    mkdir /cluster/data/danRer1/bed/blastp
    cd /cluster/data/danRer1/bed/blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz
    
    zcat Dan*.pep.fa.gz > ensembl.faa
    formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    if (-e /iscratch/i/danRer1/blastp) then
      rm -r /iscratch/i/danRer1/blastp
    endif
    mkdir -p /iscratch/i/danRer1/blastp
    cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/danRer1/run/out
    cd /cluster/data/mm6/bed/blastp/danRer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      53430s     890.51m    14.84h    0.62d  0.002 y
# IO & Wait Time:                 24688s     411.46m     6.86h    0.29d  0.001 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              76s       1.27m     0.02h    0.00d
# Submission to last job:           202s       3.37m     0.06h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastp/danRer1/run/out
    hgLoadBlastTab mm6 drBlastTab -maxPer=1 *.tab

# YEAST BLASTP FOR GENE SORTER (DONE 4/15/05 Fan)
    # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on
    # the SGD ORF translations (sgdPep).  First make protein database and
    # copy it to iscratch/i if it doesn't exist already:
    mkdir /cluster/data/sacCer1/bed/blastp
    cd /cluster/data/sacCer1/bed/blastp
    wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
    zcat orf_trans.fasta.gz > sgdPep.faa
    formatdb -i sgdPep.faa -t sgdPep -n sgdPep

    ssh kkr1u00
    # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, 
    # fortunately we won't be looking for homologs there.  :)
    if (-e /iscratch/i/sacCer1/blastp) then
      rm -r /iscratch/i/sacCer1/blastp
    endif
    mkdir -p /iscratch/i/sacCer1/blastp
    cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/sacCer1/run/out
    cd /cluster/data/mm6/bed/blastp/sacCer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:       8741s     145.68m     2.43h    0.10d  0.000 y
# IO & Wait Time:                 20376s     339.60m     5.66h    0.24d  0.001 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              17s       0.28m     0.00h    0.00d
# Submission to last job:           199s       3.32m     0.06h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastp/sacCer1/run/out
    hgLoadBlastTab mm6 scBlastTab -maxPer=1 *.tab

# DM1 BLASTP FOR GENE SORTER (DONE 4/18/05, Fan)
    # Make Drosophila melanogaster ortholog column using blastp on FlyBase.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    # This is already done, see makeMm3.doc for procedure
    # the directory: /cluster/bluearc/dm1/blastp should have data

    # ssh kkr1u00
    # if (-e /iscratch/i/dm1/blastp) then
    #   rm -r /iscratch/i/dm1/blastp
    # endif
    # mkdir -p /iscratch/i/dm1/blastp
    # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
    # iSync
    # THE ABOVE IS ALREADY DONE BY ANGIE

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/dm1/run/out
    cd /cluster/data/mm6/bed/blastp/dm1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...

# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      33260s     554.33m     9.24h    0.38d  0.001 y
# IO & Wait Time:                 24452s     407.54m     6.79h    0.28d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              45s       0.75m     0.01h    0.00d
# Submission to last job:           121s       2.02m     0.03h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastp/dm1/run/out
    hgLoadBlastTab mm6 dmBlastTab -maxPer=1 *.tab

# Create table that maps between known genes and LocusLink (DONE 4/18/05 Fan)
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm6 \
        > refToLl.txt
hgMapToGene mm6 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#       row count is 17480 

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm6 knownGene name proteinID Pfam knownToPfam
# row count is 17132

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene mm6 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create table that maps between known genes and genePix database 
    knownToGenePix mm6

# ENABLE GENE SORTER FOR mm6 IN HGCENTRALTEST (DONE 7/20/04 Fan)
    echo "update dbDb set hgNearOk = 1 where name = 'mm6';" \
      | hgsql -h genome-testdb hgcentraltest

# RAT BLASTP FOR GENE SORTER (DONE 4/20/05 Fan)
    # Make RAT ortholog column using blastp on RAT known genes.
    # First make RAT protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/rn3/bed/blastp
    cd /cluster/data/rn3/bed/blastp
    pepPredToFa rn3 knownGenePep known.faa
    formatdb -i known.faa -t known -n known

    ssh kkr1u00
    if (-e /iscratch/i/rn3/blastp) then
      rm -r /iscratch/i/rn3/blastp
    endif
    mkdir -p /iscratch/i/rn3/blastp
    cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/rn3/run/out
    cd /cluster/data/mm6/bed/blastp/rn3/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/rn3/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      12896s     214.93m     3.58h    0.15d  0.000 y
# IO & Wait Time:                 21725s     362.08m     6.03h    0.25d  0.001 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              22s       0.37m     0.01h    0.00d
# Submission to last job:           246s       4.10m     0.07h    0.00d
 
    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastp/rn3/run/out
    hgLoadBlastTab mm6 rnBlastTab -maxPer=1 *.tab

# END OF GENE SORTER STUFF
#############################################################################

### MM6 PROTEOME BROWSER TABLES BUILD ####  (DONE - 2005-04-20 - Fan)
# These are instructions for building tables 
# needed for the Proteome Browser to be used with mm6.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 050315.

# Create the working directory

   ssh hgwdev
   mkdir /cluster/data/mm6/bed/pb.2005-04-20
   cd /cluster/data/mm6/bed
   ln -s /cluster/data/mm6/bed/pb.2005-04-20 pb
   cd pb

# Define pep* tables in mm6 DB

   cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

# First edit out pepPred table definition, then

   hgsql mm6 < pepAll.sql

# Build the pepMwAa table

  hgsql proteins050315 -e "select info.acc, molWeight, aaSize from sp050315.info, sp050315.accToTaxon where 
accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm6 -e 'load data local infile "pepMwAa.tab" into table mm6.pepMwAa ignore 1 lines;'

# Build the pepPi table

  hgsql proteins050315 -e "select info.acc from sp050315.info, sp050315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp050315 pepPi.tab

hgsql mm6 -e 'load data local infile "pepPi.tab" into table mm6.pepPi;'


# Calculate and load pep distributions

  pbCalDist sp050315 proteins050315 10090 mm6 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql mm6

    load data local infile "pepExonCntDist.tab" into table mm6.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm6.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm6.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm6.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm6.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm6.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm6.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd 050315 10090 mm6

# Create pbAnomLimit and pbResAvgStd tables

   hgsql mm6 < ~/src/hg/lib/pbAnomLimit.sql
   hgsql mm6 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm6 -e 'load data local infile "pbResAvgStd.tab" into table mm6.pbResAvgStd;'
   hgsql mm6 -e 'load data local infile "pbAnomLimit.tab" into table mm6.pbAnomLimit;'

# UPDATE kgSpAlias TABLE TO BE USED BY PB (Done 4/20/05)

    cd /cluster/data/mm6/bed/pb
    hgsql mm6 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm6 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >mm6.kgSpAlias.tab
    rm j.tmp

    hgsql mm6 -e 'drop table kgSpAlias';
    hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias'
    gzip mm6.kgSpAlias.tab

# Create pbStamp table for PB
   
  hgsql mm6 < ~/src/hg/lib/pbStamp.sql
  hgsql mm5 -e 'select * from pbStamp' > pbStamp.tab

  hgsql mm6 -e 'delete from pbStamp'
  hgsql mm6 -e 'load data local infile "pbStamp.tab" into table mm6.pbStamp ignore 1 lines;'

# ENABLE PROTEOME BROWSER FOR mm6 IN HGCENTRALTEST 
    echo "update dbDb set hgPbOk = 1 where name = 'mm6';" \
      | hgsql -h genome-testdb hgcentraltest

# Connect to genome-testdb and use hgcentraltest DB.
# Update the entry in gdbPdb table from mySql prompt:

    delete from gdbPdb where genomeDb='mm6';
    insert into gdbPdb values('mm6', 'proteins050415');
    
# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke Proteome Browser and adjust various drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for mm6, then
# notify QA for formal review.

#####################################################################
# MAP CONTIGS TRACK (DONE - 2005-04-21 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm6/bed/ctgPos
    cd /cluster/data/mm6/bed/ctgPos
    # hgCtgPos uses the lift files... but mouse lift files are for the
    # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs
    # from the assembly.  (In the future, we should go with the NT's!)
    # So... just for this release, go straight from the seq_contig.md
    # to the table def'n: contig, size, chrom, chromStart, chromEnd
    #	This script is an improvement from before, this is now doing the
    #	randoms properly.
    cat << '_EOF_' > seqContigToCtgPos.pl
#!/usr/bin/env perl
#
# seqContigToCtgPos.pl - convert seq_contig.md lines into ctgPos table rows.
#
# Reads lines from the files named on the command line, or stdin when there
# are none, and splits each on whitespace.  For every line it recognizes it
# prints one tab-separated row:
#	contig  size  chrom  chromStart  chromEnd
#
# Two kinds of input line are handled:
#   - field 1 contains a "|" : the contig is placed on chr<prefix>_random,
#     where <prefix> is the part of field 1 before the first "|".  Positions
#     are synthesized by laying contigs end to end with a fixed spacer.
#   - otherwise, field 5 looks like NT_<digits> or NC_<digits> : the contig
#     is placed on chr<field 1> using the coordinates from fields 2 and 3.
# Any other line produces no output.
#
# NOTE(review): fields 2/3 are presumably 1-based inclusive start/end, as the
# -1 / +1 arithmetic below suggests -- confirm against the NCBI
# seq_contig.md format description.

use warnings;
use strict;

# Name prefix of the random chromosome currently being laid out, and the
# next free 0-based offset on it.
my $prevRandom="";
my $randomPosition=0;

while(my $line=<>)
{
chomp($line);
my @a = split('\s+',$line);
# A "|" in field 1 marks a contig that goes on a _random chromosome.
if ($a[1] =~ m/\|/)
    {
    my @b = split('\|',$a[1]);
    # First contig for a new prefix: restart the running offset at zero.
    # This relies on all lines for one prefix being adjacent in the input.
    if ($b[0] ne $prevRandom)
	{
	$randomPosition=0;
	$prevRandom=$b[0];
	}
    my $size = $a[3]-$a[2]+1;
    my $start = $randomPosition;
    my $end = $randomPosition + $size;
    printf "%s\t%d\tchr%s_random\t%d\t%d\n", $a[5],$size,$b[0],$start,$end;
    # Advance past a spacer (1000 for the "Un" prefix, 50000 for every
    # other prefix) and then past this contig itself.
    # NOTE(review): the spacer sizes presumably mirror the gap sizes used
    # when the chr*_random sequences were assembled -- confirm.
    if ($b[0] ne "Un") { $randomPosition += 50000; }
	else { $randomPosition += 1000; }
    $randomPosition += $size;
    }
# Placed contig: only NT_/NC_ names followed by digits are accepted.
elsif ($a[5] =~ m/^N[TC]_\d+$/)
    {
    # Shift the start down by one so that size = end - start.
    my $start = $a[2]-1;
    my $end = $a[3];
    my $size = $end-$start;
    printf "%s\t%d\tchr%s\t%d\t%d\n", $a[5],$size,$a[1],$start,$end;
    }
}
'_EOF_'
    #	emacs happy
    chmod +x seqContigToCtgPos.pl

    # /cluster/data/mm6/ncbi/seq_contig.md contains more than just C57BL/6J.
    # Filter those out with the grep.
    zcat ../../seq_contig.md.gz | grep C57BL | \
	./seqContigToCtgPos.pl > ctgPos.tab

    hgsql mm6 < ~/kent/src/hg/lib/ctgPos.sql
    hgsql mm6 -e 'load data local infile "ctgPos.tab" into table ctgPos;'

    featureBits -countGaps mm6 ctgPos
    #	2638893452 bases of 3079633452 (85.689%) in intersection
    featureBits -countGaps mm5 ctgPos
    #	2557081173 bases of 3164952073 (80.794%) in intersection

#########################################################################
# BLASTZ HUMAN Hg16 (DONE - 2005-04-27 - 2005-04-29 - Hiram)
#	to replace the Mm4 chains and links on Hg16 since Mm4 is being
#	retired with this Mm6 release
    ssh eieio
    mkdir /cluster/data/mm6/bed/blastzHg16.2005_04_27
    cd /cluster/data/mm6/bed
    ln -s blastzHg16.2005_04_27 blastz.hg16
    cd blastzHg16.2005_04_27

    cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm6
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_FLAG=-rodent
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Human Hg16
SEQ2_DIR=/scratch/hg/hg16/bothMaskedNibs
SEQ2_SMSK=/scratch/hg/hg16/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzHg16.2005_04_27

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/hg16/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    cd /cluster/data/mm6/bed/blastzHg16.2005_04_27
    time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \
	blast.run.out 2>&1 &
    #	STARTED - 2005-04-27 12:20
    #	FINISHED - 2005-04-28 00:11
    #	real    712m59.223s
    #	user    0m0.669s
    #	sys     0m0.442s
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh eieio
    screen -d -r
    #	STARTED - 2005-03-17 21:25
    #	FINISHED - 2005-03-18 14:00
# Completed: 44354 of 44354 jobs
# CPU time in finished jobs:   16945019s  282416.99m  4706.95h  196.12d  0.537 y
# IO & Wait Time:               2624756s   43745.93m   729.10h   30.38d  0.083 y
# Average job time:                 441s       7.35m     0.12h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            9471s     157.85m     2.63h    0.11d
# Submission to last job:         35934s     598.90m     9.98h    0.42d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:        274s       4.56m     0.08h    0.00d  0.000 y
# IO & Wait Time:                  1104s      18.40m     0.31h    0.01d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              15s       0.25m     0.00h    0.00d
# Submission to last job:           101s       1.68m     0.03h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:       6328s     105.47m     1.76h    0.07d  0.000 y
# IO & Wait Time:                   551s       9.18m     0.15h    0.01d  0.000 y
# Average job time:                 172s       2.87m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             620s      10.33m     0.17h    0.01d
# Submission to last job:           830s      13.83m     0.23h    0.01d

    #	swap results to place mm6 alignments onto Hg16
    ssh eieio
    cd /cluster/data/mm6/bed/blastzHg16.2005_04_27
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
	swap.run.out 2>&1 &

    featureBits mm6 netHg16
    #   2580637164 bases of 2597150411 (99.364%) in intersection
    featureBits mm6 netHg17
    #	2579747741 bases of 2597150411 (99.330%) in intersection
    featureBits mm6 chainHg16
    #   2597476551 bases of 2597150411 (100.013%) in intersection
    featureBits mm6 chainHg17
    #	2596946329 bases of 2597150411 (99.992%) in intersection
    featureBits hg16 netMm6
    #   2890452713 bases of 2865248791 (100.880%) in intersection
    featureBits hg16 chainMm6
    #   2913361200 bases of 2865248791 (101.679%) in intersection
    HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainHg16Link
    #	966699669 bases of 2597150411 (37.222%) in intersection
    HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainHg17Link
    #	966916309 bases of 2597150411 (37.230%) in intersection
    HGDB_CONF=~/.hg.conf.read-only featureBits hg16 chainMm6Link
    #	969979195 bases of 2865248791 (33.853%) in intersection
    HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm5Link
    #	1020106336 bases of 2866216770 (35.591%) in intersection

####  Blat knownGene proteins to determine exons (braney 2005-05-20 DONE)
    ssh hgwdev
    cd /cluster/data/mm6/bed
    mkdir blat.mm6KG.2005-05-02
    rm blat.mm6KG
    ln -s  blat.mm6KG.2005-05-02 blat.mm6KG
    cd blat.mm6KG
    pepPredToFa mm6 knownGenePep known.fa
    hgPepPred mm6 generic blastKGPep03 known.fa
    grep ">" known.fa | sed "s/>//" > kgName.lst
    ssh kk
    cd /cluster/data/mm6/bed/blat.mm6KG
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /panasas/store/mm6/nib/*.nib > mouse.lst
    mkdir kgfa
    cd kgfa
    faSplit sequence ../known.fa 3000 kg
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    gensub2 mouse.lst kg.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../mouse.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push

# Completed: 115720 of 115720 jobs
# CPU time in finished jobs:   14938417s  248973.62m  4149.56h  172.90d  0.474 y
# IO & Wait Time:               2116275s   35271.25m   587.85h   24.49d  0.067 y
# Average job time:                 147s       2.46m     0.04h    0.00d
# Longest finished job:            9235s     153.92m     2.57h    0.11d
# Submission to last job:         25264s     421.07m     7.02h    0.29d

    ssh eieio
    cd /cluster/data/mm6/bed/blat.mm6KG
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    pslUniq cooked.psl mm6KG.psl
    pslxToFa mm6KG.psl mm6KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft

    ssh hgwdev
    kgName mm6 mm6KG.psl blastKGRef03
    hgsql mm6 < ~/kent/src/hg/lib/blastRef.sql
    echo "rename table blastRef to blastKGRef03" | hgsql mm6
    echo "load data local infile 'blastKGRef03' into table blastKGRef03" | hgsql mm6

# LOAD GENEID GENES (DONE 5/16/05 angie)
    mkdir -p /cluster/data/mm6/bed/geneid/download
    cd /cluster/data/mm6/bed/geneid/download
    foreach chr (`awk '{print $1;}' ../../../chrom.sizes`)
      echo $chr
      wget \
http://genome.imim.es/genepredictions/M.musculus/mmMar2005/geneid_v1.2/$chr.gtf
      wget \
http://genome.imim.es/genepredictions/M.musculus/mmMar2005/geneid_v1.2/$chr.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene -genePredExt -gtf mm6 geneid download/*.gtf
    hgPepPred mm6 generic geneidPep download/*-fixed.prot
    featureBits mm6 -enrichment refGene geneid
#refGene 1.623%, geneid 1.561%, both 0.794%, cover 48.91%, enrich 31.34x

##############################################################################
# CLONE ENDS - BACEND TRACK (started - 2005-05-20 - Heather)

  ssh kkstore01
  cd /cluster/data/mm6
  # check disk space: 120 Gigs free
  cd bed
  mkdir cloneend
  cd cloneend
  mkdir ncbi
  cd ncbi

  ftp ftp.ncbi.nih.gov
  # anonymous login
  cd genomes/CLONEEND/mus_musculus
  binary
  prompt 
  mget *

  gunzip *
  # 650 megs

  # seems like the *.mfa files were split just for convenience
  # concatenate
  foreach f (*.mfa)
    cat $f >> all.mfa
  end

  # Convert the title line of the all.mfa file
  # Location of perl different on kkstore01
  cat << '_EOF_' > convert.pl
#!/usr/bin/perl -w
#
# Rewrite FASTA title lines so each record is named by its bare accession.
# Reads FASTA on stdin and writes FASTA on stdout.  A title line is split
# on "|"; the field following the first "gb" or "dbj" tag is taken to be
# accession.version and the title is replaced by ">accession".  All other
# lines are printed unchanged, so the output has the same number of lines
# as the input.  Dies on a title line that carries neither tag.

use strict;

while (my $line = <>) {
    if (substr($line,0,1) ne ">") {
        print $line;
    } else {
        my @fields = split(/\|/, $line);
        my $printed = 0;
        # stop one short of the last field: the accession is read at $i+1
        for (my $i = 0; $i < $#fields; $i++) {
                # Both tests must look at the current field.  The "dbj" test
                # previously indexed with the regex capture variable $1
                # instead of the loop index, so "dbj" tags were never matched.
                if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") {
                        (my $name, my $vers) = split(/\./,$fields[$i+1]);
                        print ">$name\n";
                        $printed = 1;
                        last;
                }
        }
        if (!$printed) {
                die("Failed for $line\n");
        }
    }
}
'_EOF_'
    # << for emacs
chmod +x convert.pl
./convert.pl < all.mfa > cloneEnds.fa

# check that files still have the same number of lines
# expecting 8290734
wc -l all.mfa
wc -l cloneEnds.fa

  # concatenate the text files, too
  foreach f (*.txt)
    cat $f >> all.txt
  end

  # generate cloneEndPairs.txt and cloneEndSingles.txt
  ./convertTxt.pl all.txt

  # Reading in end info
  # Writing out pair info
  # Writing out singleton info
  # 354485 pairs and 78424 singles

  # a bit of cleanup
  mkdir archive
  mv 10090* archive

  # split
  mkdir splitdir
  faSplit sequence cloneEnds.fa 100 cloneEnds
  mkdir /cluster/bluearc/scratch/mus/mm6
  mkdir /cluster/bluearc/scratch/mus/mm6/cloneEnds
  mv cloneEnds???.fa /cluster/bluearc/scratch/mus/mm6/cloneEnds
  cp -p cloneEnds.fa /cluster/bluearc/scratch/mus/mm6/cloneEnds
  # request updateLocal to make available on /scratch/mus/mm6/cloneEnds on the cluster

  # load sequences
  ssh hgwdev
  cd /gbdb/mm6
  mkdir cloneend
  cd cloneend
  ln -s /cluster/data/mm6/bed/cloneend/ncbi/cloneEnds.fa .
  cd /tmp
  hgLoadSeq mm6 /gbdb/mm6/cloneend/cloneEnds.fa
  #  Advisory lock created
  # Creating .tab file
  # Adding /gbdb/mm6/cloneend/cloneEnds.fa
  # 789467 sequences
  # Updating seq table
  # Advisory lock has been released
  # All done

############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2005-06-02 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/mm6/noMask
    cd /cluster/data/mm6/
    #	Need an unmasked sequence for this work
    #	Write an all-uppercase copy of every chromosome fasta into noMask/,
    #	keeping each file's first (header) line unchanged.
    for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	C=`basename ${CHR}`
	echo -n "working ${C} ... "
	head -n 1 ${CHR} > noMask/${C}
	#	"tail -n +2" replaces the obsolete "tail +2" spelling; the tr
	#	character classes are quoted so the shell cannot glob them
	#	against single-character file names in the current directory
	tail -n +2 ${CHR} | tr '[:lower:]' '[:upper:]' >> noMask/${C}
	echo "done"
    done
    mkdir ooc
    ls noMask/chr*.fa > fa.list
    blat -makeOoc=ooc/11.ooc -repMatch=1024 fa.list fa.list output.psl
    #	Wrote 25952 overused 11-mers to ooc/11.ooc
    ssh kkr1u00
    mkdir /iscratch/i/mm6/ooc
    cp -p /cluster/data/mm6/ooc/11.ooc /iscratch/i/mm6/ooc
    mkdir /iscratch/i/mm6/noMask
    cp -p /cluster/data/mm6/noMask/chr*.fa /iscratch/i/mm6/noMask
    for U in 2 3 4 5 6 7 8
    do
	rsync -a --progress /iscratch/i/mm6/ooc/ kkr${U}u00:/iscratch/i/mm6/ooc
	rsync -a --progress /iscratch/i/mm6/noMask/ \
		kkr${U}u00:/iscratch/i/mm6/noMask
	echo "done kkr${U}u00"
    done
    
    # allow blat to run politely in /tmp while it writes output, then
    # copy results to results file:
    ssh kk
    mkdir /cluster/data/mm6/bed/bacends
    cd /cluster/data/mm6/bed/bacends

    cat << '_EOF_' > runBlat.sh
#!/bin/sh
# Run one blat job in a private scratch directory under /tmp, then move the
# finished psl file to its final location.
# Arguments:
#   $1  path1  - first sequence file handed to blat
#   $2  path2  - second sequence file handed to blat
#   $3  root1  - name used (with root2) for the scratch dir and psl file
#   $4  root2  - name used (with root1) for the scratch dir and psl file
#   $5  result - destination path for the psl file
path1=$1
path2=$2
root1=$3
root2=$4
result=$5
tmpDir="/tmp/${root1}_${root2}"
rm -fr "${tmpDir}"
mkdir "${tmpDir}"
# cd inside a subshell instead of pushd/popd: those are bash builtins and
# this script is declared #!/bin/sh.  The caller's working directory is
# left alone, so a relative ${result} resolves exactly as before.
(
    cd "${tmpDir}" && \
    /cluster/bin/i386/blat "${path1}" "${path2}" \
	-ooc=/iscratch/i/mm6/ooc/11.ooc "${root1}.${root2}.psl"
)
rm -f "${result}"
mv "${tmpDir}/${root1}.${root2}.psl" "${result}"
rm -fr "${tmpDir}"
'_EOF_'
    # << emacs happy
    chmod +x runBlat.sh

    cat << '_EOF_' > template
#LOOP
./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
    # << emacs happy

    ls -1S /scratch/mus/mm6/cloneEnds/cloneEnds???.fa > bacEnds.lst
    mkdir bacEnds.out
    #	create a results directory for each input file so that the result
    #	files do not all end up in one directory
    foreach f (`cat bacEnds.lst`)
	set b = $f:t:r
	echo $b
	mkdir bacEnds.out/$b
    end

    ls -1S /iscratch/i/mm6/noMask/chr*.fa > contig.lst
    gensub2 contig.lst bacEnds.lst template jobList
    para create jobList
    # 7850 jobs written to batch
    para try, check, push, etc ...
# Completed: 3920 of 3920 jobs
# CPU time in finished jobs:    2681337s   44688.95m   744.82h   31.03d  0.085 y
# IO & Wait Time:                110523s    1842.05m    30.70h    1.28d  0.004 y
# Average job time:                 712s      11.87m     0.20h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            5203s      86.72m     1.45h    0.06d
# Submission to last job:          6402s     106.70m     1.78h    0.07d


    ssh kkstore01
    cd /cluster/data/mm6/bed/bacends
    screen

    mkdir temp
    time pslSort dirs raw.psl temp bacEnds.out/* > pslSort.out 2>&1 &
    #	real    27m20.352s
    #	user    20m10.329s
    #	sys     1m55.287s

    time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
	raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
    #	real    8m15.671s
    #	user    7m18.229s
    #	sys     0m20.554s

    cp -p ~booch/clusterJobs/bacends/split.pl .
    cp -p ~booch/clusterJobs/bacends/header .
    time ./split.pl header < bacEnds.psl

    cp -p bacEnds.psl bacEnds.psl.save
    time pslSort dirs bacEnds.psl temp split
    #	~ 3 minutes

    # Copy files to final destination and remove
    mkdir /cluster/data/mm6/bacends
    cp -p bacEnds.psl /cluster/data/mm6/bacends

############################################################################
# BACEND PAIRS TRACK (DONE  2005-06-02 - Hiram)

    ssh kolossus
    cd /cluster/data/mm6/bacends

time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
	../bed/cloneend/ncbi/cloneEndPairs.txt all_bacends bacEnds

    # create header required by "rdb" tools
    echo -e \
"chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
    echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header

    cat header bacEnds.pairs | \
	/cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairs.bed

    cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
	bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed

    /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
	bacEndPairsBad.bed >j1.out
    cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out
    cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl

    rm j1.out j2.out

    # load into database
    ssh hgwdev
    cd /cluster/data/mm6/bacends

    #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
    awk '{print $5}' bacEndPairs.bed | sort -u
    #	result should be the scores, no extraneous strings:
#	1000
#	300
#	375
#	500
#	750
    #	edit the file and fix it if it has a bad name.

    hgLoadBed -notItemRgb mm6 bacEndPairs bacEndPairs.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
    #	Loaded 226279 elements of size 11

    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed -notItemRgb mm6 bacEndPairsBad bacEndPairsBad.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
    #	Loaded 104221 elements of size 11

    # NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl mm6 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 9046691 record(s), 0 row(s)
#	skipped, 37 warning(s) loading psl.tab

#	real    36m1.178s
#	user    5m55.630s
#	sys     0m41.380s

# featureBits mm6 all_bacends
# 336981828 bases of 2597150411 (12.975%) in intersection
# featureBits mm5 all_bacends
# 268502414 bases of 2615483787 (10.266%) in intersection
# featureBits mm4 all_bacends
# 243096171 bases of 2627444668 (9.252%) in intersection

# featureBits mm6 bacEndPairs
# 2570768812 bases of 2597150411 (98.984%) in intersection
# featureBits mm5 bacEndPairs
# 2567958504 bases of 2615483787 (98.183%) in intersection
# featureBits mm4 bacEndPairs
# 2549945356 bases of 2627444668 (97.050%) in intersection

# featureBits mm6 bacEndPairsBad
# 1006314997 bases of 2597150411 (38.747%) in intersection
# featureBits mm5 bacEndPairsBad
# 541027882 bases of 2615483787 (20.686%) in intersection
# featureBits mm4 bacEndPairsBad
# 1074505863 bases of 2627444668 (40.895%) in intersection


# SGP GENES (DONE 5/25/05 angie)
    ssh hgwdev
    mkdir /cluster/data/mm6/bed/sgp
    cd /cluster/data/mm6/bed/sgp
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/M.musculus/mmMar2005/SGP/humangp200405/$chr.gtf
      wget http://genome.imim.es/genepredictions/M.musculus/mmMar2005/SGP/humangp200405/$chr.prot
    end
    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    cp /dev/null sgpPep.fa
    foreach f (chr*.prot)
      nice perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa
    end
    ldHgGene -gtf -genePredExt mm6 sgpGene chr*.gtf
    hgPepPred mm6 generic sgpPep sgpPep.fa
    featureBits mm6 -enrichment refGene:CDS sgpGene
#refGene:CDS 0.983%, sgpGene 1.421%, both 0.841%, cover 85.57%, enrich 60.21x
 
# SGP GENES (UPDATE 1/18/2006)
#    sgpPep table dropped, replaced by hgc generated protein seq in browser

############################################################################
# RE-BUILD KNOWN GENES RELATED TABLES for mm6 (STARTED 5/26/05, DONE 6/1/05. Fan)

# First build protein databases, sp050415 and proteins050415
# See makeProteins050415.doc for details.
# Please note that the protein and displayId tables in sp050415 have data of variant splice proteins.

# Create working subdirectories and temporary databases

  ssh hgwdev
  cd /cluster/store10/kg
  mkdir kgMm6B  
  ln -s /cluster/store10/kg/kgMm6B /cluster/store6/kgDB/bed/kgMm6B
  ln -s /cluster/store10/kg/kgMm6B /cluster/data/mm6/bed/kgMm6B

  hgsql mm6 -e "create database kgMm6B"   
  hgsql mm6 -e "create database kgMm6BTemp"

  mkdir /cluster/bluearc/kgDB/kgMm6B
  mkdir /cluster/bluearc/kgDB/kgMm6B/protBlat
  ln -s /cluster/bluearc/kgDB/kgMm6B/protBlat /cluster/store10/kg/kgMm6B/protBlat
  cd /cluster/store10/kg/kgMm6B/protBlat

# Get all mouse protein sequences (division 10090 is the mouse taxon)

  hgsql -N sp050415 -e \
  'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="10090" and acc=accession' \
  |awk '{print ">" $1;print $2}' >mm6Prot.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh kk
  cd /cluster/data/mm6/bed/kgMm6B/protBlat
  mkdir prot
  faSplit sequence mm6Prot.fa 1000 prot/prot
  ls /cluster/bluearc/kgDB/kgMm6B/protBlat/prot/* > prot.lis

  ssh hgwdev
  cd /cluster/data/mm6/bed/kgMm6B/protBlat
  hgsql mm6 -N -e 'select chrom from chromInfo' > chrom.lis
  exit
  

  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm6/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6B/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis prot.lis gsub jobList

  para create jobList
  para try
  para check
  para push
  para check ...
# Completed: 31386 of 39600 jobs
# Crashed: 8214 jobs
# CPU time in finished jobs:   32377544s  539625.74m  8993.76h  374.74d  1.027 y
# IO & Wait Time:                727341s   12122.34m   202.04h    8.42d  0.023 y
# Average job time:                1055s      17.58m     0.29h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           34182s     569.70m     9.49h    0.40d
# Submission to last job:         57659s     960.98m    16.02h    0.67d

# Many output .psl files are empty, these warnings are OK.  
# Check to see if there is any other error type.

  para problems |grep empty|wc
#   8214   24642  642357

# collect BLAT results

   ssh hgwdev
   cd /cluster/data/mm6/bed/kgMm6B/protBlat

   mkdir result2
   mkdir result3

   cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall

   cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

   cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

   chmod +x do*

   cp do1.1 do1
   doall
   cp do1.2 do1
   doall

   cat result3/*.psl >protBlat.psl
   hgLoadPsl mm6 protBlat.psl
# Processing protBlat.psl
# load of protBlat did not go as planned: 82296 record(s), 0 row(s) skipped, 750 warning(s) loading psl.tab
# Looked into the cause of the warnings before and found that qBaseInsert
# and tBaseInsert have negative values, probably because this is a protein alignment.

# Remember to remove result2 and result3 when KG is built and validated.

   cd /cluster/data/mm6/bed/kgMm6B

# create all_mrna.psl and tight_mrna.psl
   hgsql mm6 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null
# Processed 194640 alignments

# Use overlapSelect to get protein and mRNA alignment overlaps   
   overlapSelect  -statsOutput  -dropped=protOut.psl -overlapThreshold=0.90 \
   -selectFmt=psl -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.stat

   overlapSelect  -mergeOutput  -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
   -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
   cut -f 10,31 protMrna.out|sort -u >spMrna.tab
   cut -f 10    protMrna.out|sort -u >protein.lis

# Load spMrna.tab into spMrna table in temp DB.
   hgsql kgMm6BTemp < ~/src/hg/lib/spMrna.sql
   hgsql kgMm6BTemp -e 'load data local infile "spMrna.tab" into table spMrna'
   hgsql kgMm6BTemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
   cd /cluster/data/mm6/bed/kgMm6B
   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm6 \
   -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgMm6BTemp DB.

   hgFaToTab mrna.fa mrnaSeq.tab

   hgsql kgMm6BTemp -e 'drop table mrnaSeq'
   hgsql kgMm6BTemp <~/src/hg/lib/mrnaSeq.sql
   hgsql kgMm6BTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
   rm mrnaSeq.tab

# Prepare files for cluster run
   ~/src/hg/protein/KG2.sh kgMm6B mm6 050415

# Perform cluster run of protein/mRNA alignment
   ~/src/hg/protein/KG4.sh kgMm6B mm6 050415

# Collect cluster run results
   cd kgBestMrna
   ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments
   pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
   cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
   wc protMrna.lis

# Load BLAT results into temp DB.
   hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaBlat.sql
   hgsql kgMm6BTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
   hgsql kgMm6BTemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgMm6BTemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cut -f 22-30 ../protMrna.out > j1.tmp
   cut -f 32-42 ../protMrna.out > j2.tmp
   cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

# Prepare refGene and all_mrna gp files.

   cd ..
   hgsql mm6 -N -e 'select * from refGene' >ref.gp

   hgsql mm6 -N -e \
   'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

# Align proteins to RefSeq.

   overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp ref.stat
   overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp protRef.gp

   overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
   -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out

   cut -f 10,22 protRef.out | sort -u >spRef.tab
   cut -f 10 protRef.out    | sort -u >protRef.lis

   hgsql kgMm6BTemp -e 'drop table spRef'
   hgsql kgMm6BTemp <~/src/hg/lib/spRef.sql
   hgsql kgMm6BTemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments

   ~/src/hg/protein/KGRef2.sh kgMm6B mm6 050415
   ~/src/hg/protein/KGRef3.sh kgMm6B mm6 050415

   cd kgBestRef
   ls out | sed -e 's/prot/do1 prot/g' >doall

   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments.
   pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
   cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
   wc protRef.lis

   hgsql kgMm6BTemp -e 'drop table protRefBlat'
   hgsql kgMm6BTemp < ~/src/hg/lib/protRefBlat.sql
   hgsql kgMm6BTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
   hgsql kgMm6BTemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries
   cd /cluster/data/mm6/bed/kgMm6B
   cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
   gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir \
   /cluster/data/mm6/nib kgCandidate0.gp kgCandidate0.check

   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate0.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm6BTemp < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck kgMm6BTemp mm6 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgMm6BTemp -e  'drop table kgCandidate'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgMm6BTemp -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field. 
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab

   hgsql kgMm6BTemp -e  'drop table kgCandidateX'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

   kgResultBestMrna2 050415 kgMm6BTemp mm6|sort -u >protMrnaBlatScore.tab
   kgResultBestRef2  050415 kgMm6BTemp mm6|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm6BTemp -e 'drop table protMrnaScore'
   hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm6BTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm6BTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgMm6BTemp kgCandidateX jY.tmp
   cat jY.tmp |sort -u >kgCandidateY.tab
   rm jY.tmp
   hgsql kgMm6BTemp -e  'drop table kgCandidateY'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgMm6BTemp kgCandidateZ.tab
   hgsql kgMm6BTemp -e  'drop table kgCandidateZ'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgMm6BTemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mrna/protein pair for each unique CDS structure.

   kgPick kgMm6BTemp mm6 proteins050415 kg4.tmp dupSpMrna.tmp
   sort -u dupSpMrna.tmp >dupSpMrna.tab
   hgsql mm6 -e  'drop table dupSpMrna'
   hgsql mm6 < ~/src/hg/lib/dupSpMrna.sql
   hgsql mm6 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Sort KG genes to make the kg4.gp table file.
   ~/kent/src/hg/protein/sortKg.pl kg4.tmp >kg4.gp

   hgsql kgMm6BTemp -e  'drop table knownGene'
   hgsql kgMm6BTemp < ~/src/hg/lib/knownGene.sql
   hgsql kgMm6BTemp -e  'load data local infile "kg4.gp" into table knownGene'

   hgsql mm6 -e  'drop table kg4'
   hgsql mm6 < ~/src/hg/lib/kg4.sql
   hgsql mm6 -e  'load data local infile "kg4.gp" into table kg4'

# Perform analysis before loading kg4 table data to mm6.knownGene table.

# Load data into mm6 knownGene table.
   hgsql mm6 -e  'drop table knownGene'
   hgsql mm6 < ~/src/hg/lib/knownGene.sql
   hgsql mm6 -e  'load data local infile "kg4.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.

   kgPepMrna kgMm6BTemp mm6 050415
   hgsql mm6 -e  'drop table knownGeneMrna'
   hgsql mm6 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql mm6 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql mm6 -e  'drop table knownGenePep'
   hgsql mm6 < ~/src/hg/lib/knownGenePep.sql
   hgsql mm6 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'


# Build kgXref table

   kgXref2 kgMm6BTemp 050415 mm6

   hgsql mm6 -e  'drop table kgXref'
   hgsql mm6 < ~/src/hg/lib/kgXref.sql
   hgsql mm6 -e  'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table

   hgsql mm6 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab

   hgsql mm6 -e  'drop table spMrna'
   hgsql mm6 <~/src/hg/lib/spMrna.sql
   hgsql mm6 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build mrnaRefseq table

   # Start a fresh dated download directory and point /cluster/data/entrez
   # at it.
   cd /cluster/store10/entrez
   mkdir 050601
   rm /cluster/data/entrez
   ln -s /cluster/store10/entrez/050601 /cluster/data/entrez
   cd /cluster/data/entrez

   # --timestamping is a single option.  Written as "-- timestamp", wget
   # stops option parsing at "--" and treats "timestamp" as one more URL.
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
   gzip -d *.gz

   cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
   cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
   cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
   
   hgsql entrez -e 'drop table entrezRefseq'
   hgsql entrez -e 'drop table entrezMrna'
   hgsql entrez -e 'drop table entrezRefProt'

   hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
   hgsql entrez < ~/src/hg/lib/entrezMrna.sql
   hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

   hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
   hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
   hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

   hgsql entrez -N -e \
   'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
   >mrnaRefseq.tab

   hgsql mm6 -e 'drop table mrnaRefseq'
   hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql
   hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Build kgProtMap table

    ~/src/hg/protein/kgProtMap2.sh kgMm6B mm6 050415

# Update and clean up kgResultBestMrna2.c and then check it in.

# Build alias tables.		
#	kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
#	proteins050415.hugo.withdraws, mm6.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   cd /cluster/store10/kg/kgMm6B
   mkdir alias
   cd alias
   kgAliasM mm6 proteins050415

#	kgAliasKgXref reads from mm6.knownGene.proteinID,
#	mm6.knownGene.name, mm6.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm6

#	kgAliasRefseq reads from mm6.knownGene.name,
#	mm6.knownGene.proteinID, mm6.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm6

   hgsql sp050415 -N -e 'select name,gene.val from mm6.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql mm6 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql mm6 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm6 
   hgsql mm6 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm6 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm6.knownGene.name,
#	mm6.knownGene.proteinID, mm6.knownGene.alignID,
#	proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
#	to create kgProtAlias.tab

   kgProtAlias mm6 050415

   hgsql mm6 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql mm6 -N -e \
   'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
   hgsql mm6 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
   hgsql mm6 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm6 -e "drop table kgProtAlias;"
    hgsql mm6 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm6 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table

    hgsql mm6 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm6 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >mm6.kgSpAlias.tab
    rm j.tmp

    hgsql mm6 -e 'drop table kgSpAlias';
    hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias'

# MAKE FOLDUTR TABLES (DONE 2005-05-31 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/mm6/bed
    mkdir rnaStruct.2005-05-31
    rm rnaStruct
    ln -s rnaStruct.2005-05-31 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm6 knownGene utr3 utr3/utr.fa
    utrFa mm6 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm6/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 25085 of 25085 jobs
# CPU time in finished jobs:     553473s    9224.55m   153.74h    6.41d  0.018 y
# IO & Wait Time:                 66725s    1112.08m    18.53h    0.77d  0.002 y
# Average job time:                  25s       0.41m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6003s     100.05m     1.67h    0.07d
# Submission to last job:          6524s     108.73m     1.81h    0.08d
# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 23507 of 23507 jobs
# CPU time in finished jobs:      98380s    1639.66m    27.33h    1.14d  0.003 y
# IO & Wait Time:                 60713s    1011.89m    16.86h    0.70d  0.002 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            7754s     129.23m     2.15h    0.09d
# Submission to last job:          7840s     130.67m     2.18h    0.09d

# Load database
    ssh hgwdev
    cd /cluster/data/mm6/bed/rnaStruct/utr5
    hgLoadRnaFold mm6 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm6 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Build KEGG pathway tables.  DONE 5/19/05.  Fan.

   # Work on hgwdev in a fresh kegg/ subdirectory of the KG build directory.
   ssh hgwdev
   cd /cluster/store10/kg/kgMm6B
   # mkdir, not "md": md is not a standard command and only works where a
   # personal alias for it happens to be defined.
   mkdir kegg
   cd kegg

   ~/src/hg/protein/KGpath.sh kgMm6B mm6 050415

   hgsql mm6 -e "drop table keggMapDesc"
   hgsql mm6 -e "drop table keggPathway"
   hgsql mm6 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm6 <~/src/hg/lib/keggPathway.sql
   hgsql mm6 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm6 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables
# Reloaded cgapAlias.tab file after removing replicate rows - 
# see the other cgap table entry in this document (hartera, 2005-10-07).

   cd ..
   ~/src/hg/protein/KGcgap.sh kgMm6B mm6 050415
   hgsql mm6 -e "drop table cgapAlias"
   hgsql mm6 -e "drop table cgapBiocDesc"
   hgsql mm6 -e "drop table cgapBiocPathway"
   hgsql mm6 <~/src/hg/lib/cgapAlias.sql
   hgsql mm6 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql mm6 <~/src/hg/lib/cgapBiocPathway.sql
   hgsql mm6 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
   hgsql mm6 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
   hgsql mm6 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'


######## RE-BUILD GENE SORTER TABLES #######  (DONE - 2005-05-30 - Fan)
# These are instructions for building the
# Gene Sorter.  Don't start these until
# there is a knownGene track and the affy tracks

# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /tmp
hgClusterGenes mm6 knownGene knownIsoforms knownCanonical
# Got 17843 clusters, from 27131 genes in 40 chromosomes
#	featureBits mm6 knownCanonical
# 	764263619 bases of 2597150411 (29.427%) in intersection
#	featureBits mm5 knownCanonical
#	853516995 bases of 2615483787 (32.633%) in intersection
#	featureBits mm4 knownCanonical
#	840021165 bases of 2627444668 (31.971%) in intersection
#	featureBits mm3 knownCanonical
#	825943052 bases of 2505900260 (32.960%) in intersection

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
ssh hgwdev
mkdir -p  /cluster/data/mm6/bed/geneSorter/blastp
cd /cluster/data/mm6/bed/geneSorter/blastp
pepPredToFa mm6 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known

# Copy over database to bluearc scratch
mkdir /cluster/panasas/home/store/mm6/blastp
cp -p /cluster/data/mm6/bed/geneSorter/blastp/known.* /cluster/panasas/home/store/mm6/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm6/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm6/bed/geneSorter/blastp/self
cd /cluster/data/mm6/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      58630s     977.16m    16.29h    0.68d  0.002 y
# IO & Wait Time:                 39839s     663.99m    11.07h    0.46d  0.001 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             116s       1.93m     0.03h    0.00d
# Submission to last job:           188s       3.13m     0.05h    0.00d

# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/mm6/bed/geneSorter/blastp/self/run/out
hgLoadBlastTab mm6 knownBlastTab *.tab
# Scanning through 7729 files
# Loading database with 3391069 rows

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes an hour.)
# DONE (05-04-15 Fan)

hgMapToGene mm6 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm6 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
# Got 20114 unique elements in hgFixed.gnfMouseAtlas2MedianRatio

# Create table that maps between known genes and RefSeq
hgMapToGene mm6 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene mm6 affyU74  knownGene knownToU74
hgMapToGene mm6 affyMOE430 knownGene knownToMOE430
hgMapToGene mm6 affyMOE430 -prefix=A: knownGene knownToMOE430A

# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm6/bed/rinnSex
cd !$
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
../affyMOE430/affyMOE430.psl
hgLoadBed mm6 rinnSex rinnSex.bed

# Format and load the GNF data
mkdir /cluster/data/mm6/bed/affyGnf95
cd /cluster/data/mm6/bed/affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl \
/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
affyGnfU95.tab affyGnfU95Exps.tab -shortOut

#	this .sql load was in preceding instructions, but this .sql file
#	appears to not exist and it doesn't seem to be needed anyway.
#	Everything below this seems to create tables OK.
#  hgsql mm6 < ~/kent/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.  These commands take about 15 minutes each
#	The affyGnfU74?Exps arguments appear to be unused in 
# hgExpDistance
cd /cluster/data/mm6/bed/geneSorter
hgExpDistance mm6 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
# Got 10157 unique elements in affyGnfU74A
hgExpDistance mm6 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
# Got 6076 unique elements in affyGnfU74B
hgExpDistance mm6 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
# Got 1793 unique elements in affyGnfU74C

# C.ELEGANS BLASTP FOR GENE SORTER 
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh eieio
    mkdir /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version.  Then use that in place of 142 below.
    wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
    formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
    ssh kkr1u00
    if (-e /iscratch/i/ce2/blastp) then
      rm -r /iscratch/i/ce2/blastp
    endif
    mkdir -p /iscratch/i/ce2/blastp
    cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/ce2/run/out
    cd /cluster/data/mm6/bed/blastp/ce2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ... 
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      40061s     667.69m    11.13h    0.46d  0.001 y
# IO & Wait Time:                 21049s     350.81m     5.85h    0.24d  0.001 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              33s       0.55m     0.01h    0.00d
# Submission to last job:           134s       2.23m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm6/bed/blastp/ce2/run/out
hgLoadBlastTab mm6 ceBlastTab -maxPer=1 *.tab

# HUMAN BLASTP FOR GENE SORTER (DONE 4/18/05 Fan)
    # Make human ortholog column using blastp on human known genes.
    # First make human protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/hg17/bed/blastp
    cd /cluster/data/hg17/bed/blastp
    pepPredToFa hg17 knownGenePep known.faa
    formatdb -i known.faa -t known -n known

    ssh kkr1u00
    if (-e /iscratch/i/hg17/blastp) then
      rm -r /iscratch/i/hg17/blastp
    endif
    mkdir -p /iscratch/i/hg17/blastp
    cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/hg17/run/out
    cd /cluster/data/mm6/bed/blastp/hg17/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      81526s    1358.76m    22.65h    0.94d  0.003 y
# IO & Wait Time:                 23670s     394.51m     6.58h    0.27d  0.001 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              84s       1.40m     0.02h    0.00d
# Submission to last job:           185s       3.08m     0.05h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm6/bed/blastp/hg17/run/out
hgLoadBlastTab mm6 hgBlastTab -maxPer=1 *.tab

# ZEBRAFISH BLASTP FOR GENE SORTER 
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # The below is done by hg17, that section from makeHg17.doc is copied here.
    ssh kkstore
    mkdir /cluster/data/danRer2/bed/blastp
    cd /cluster/data/danRer2/bed/blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz
    
    zcat Dan*.pep.fa.gz > ensembl.faa
    formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    if (-e /iscratch/i/danRer2/blastp) then
      rm -r /iscratch/i/danRer2/blastp
    endif
    mkdir -p /iscratch/i/danRer2/blastp
    cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
    iSync

# The above is copied from makeHg17.doc.

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/danRer2/run/out
    cd /cluster/data/mm6/bed/blastp/danRer2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    
# Create parasol batch
ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...

# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      72894s    1214.89m    20.25h    0.84d  0.002 y
# IO & Wait Time:                 21284s     354.74m     5.91h    0.25d  0.001 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              73s       1.22m     0.02h    0.00d
# Submission to last job:           176s       2.93m     0.05h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm6/bed/blastp/danRer2/run/out
hgLoadBlastTab mm6 drBlastTab -maxPer=1 *.tab

# YEAST BLASTP FOR GENE SORTER 
    # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on 
    # RefSeq.  First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/sacCer1/bed/blastp
    cd /cluster/data/sacCer1/bed/blastp
    wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
    zcat orf_trans.fasta.gz > sgdPep.faa
    formatdb -i sgdPep.faa -t sgdPep -n sgdPep

    ssh kkr1u00
    # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, 
    # fortunately we won't be looking for homologs there.  :)
    if (-e /iscratch/i/sacCer1/blastp) then
      rm -r /iscratch/i/sacCer1/blastp
    endif
    mkdir -p /iscratch/i/sacCer1/blastp
    cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/sacCer1/run/out
    cd /cluster/data/mm6/bed/blastp/sacCer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      11663s     194.38m     3.24h    0.13d  0.000 y
# IO & Wait Time:                 20479s     341.32m     5.69h    0.24d  0.001 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              11s       0.18m     0.00h    0.00d
# Submission to last job:           143s       2.38m     0.04h    0.00d
# Load into database.  
ssh hgwdev
cd /cluster/data/mm6/bed/blastp/sacCer1/run/out
hgLoadBlastTab mm6 scBlastTab -maxPer=1 *.tab

# DM1 BLASTP FOR GENE SORTER (DONE 5/30/05, Fan)
    # Make Drosophila melanogaster ortholog column using blastp on FlyBase.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    # This is already done, see makeMm3.doc for procedure
    # the directory: /cluster/bluearc/dm1/blastp should have data

    # ssh kkr1u00
    # if (-e /iscratch/i/dm1/blastp) then
    #   rm -r /iscratch/i/dm1/blastp
    # endif
    # mkdir -p /iscratch/i/dm1/blastp
    # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
    # iSync
    # THE ABOVE IS ALREADY DONE BY ANGIE

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/dm1/run/out
    cd /cluster/data/mm6/bed/blastp/dm1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      45146s     752.44m    12.54h    0.52d  0.001 y
# IO & Wait Time:                 21289s     354.81m     5.91h    0.25d  0.001 y
# Average job time:                   9s       0.14m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              43s       0.72m     0.01h    0.00d
# Submission to last job:           139s       2.32m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm6/bed/blastp/dm1/run/out
hgLoadBlastTab mm6 dmBlastTab -maxPer=1 *.tab

# Create table that maps between known genes and LocusLink 
cd /cluster/data/mm6/bed/geneSorter
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm6 > refToLl.txt
hgMapToGene mm6 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#       row count is 23074  

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm6 knownGene name proteinID Pfam knownToPfam
# row count is 22525 

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene mm6 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create table that maps between known genes and genePix database 
    knownToGenePix mm6

# ENABLE GENE SORTER FOR mm6 IN HGCENTRALTEST (already done during first mm6 KG build)
    echo "update dbDb set hgNearOk = 1 where name = 'mm6';" \
      | hgsql -h genome-testdb hgcentraltest

# RAT BLASTP FOR GENE SORTER 
    # Make RAT ortholog column using blastp on RAT known genes.
    # First make RAT protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/rn3/bed/blastp
    cd /cluster/data/rn3/bed/blastp
    pepPredToFa rn3 knownGenePep known.faa
    formatdb -i known.faa -t known -n known

    ssh kkr1u00
    if (-e /iscratch/i/rn3/blastp) then
      rm -r /iscratch/i/rn3/blastp
    endif
    mkdir -p /iscratch/i/rn3/blastp
    cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastp/rn3/run/out
    cd /cluster/data/mm6/bed/blastp/rn3/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/rn3/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm6/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      17126s     285.44m     4.76h    0.20d  0.001 y
# IO & Wait Time:                 20493s     341.54m     5.69h    0.24d  0.001 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              24s       0.40m     0.01h    0.00d
# Submission to last job:           131s       2.18m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm6/bed/blastp/rn3/run/out
hgLoadBlastTab mm6 rnBlastTab -maxPer=1 *.tab

# END OF GENE SORTER STUFF
#############################################################################

### MM6 PROTEOME BROWSER TABLES RE-BUILD ####  (DONE - 2005-06-01 - Fan)
# These are instructions for re-building tables 
# needed for the Proteome Browser to be used with mm6.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 050415.

# Create the working directory
   ssh hgwdev
   mkdir /cluster/data/mm6/bed/pb.2005-06-01
   cd /cluster/data/mm6/bed
   rm pb
   ln -s /cluster/data/mm6/bed/pb.2005-06-01 pb
   cd pb

# Define pep* tables in mm6 DB

   cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

# delete from the following tables (previously built):

  hgsql mm6
  delete from  pepCCntDist ;
  delete from  pepExonCntDist ;
  delete from  pepHydroDist ;
  delete from  pepIPCntDist ;
  delete from  pepMolWtDist ;
  delete from  pepMwAa ;
  delete from  pepPi ;
  delete from  pepPiDist ;
  delete from  pepPred ;
  delete from  pepResDist ;
  delete from pbAnomLimit;
  delete from pbResAvgStd;
  delete from pbStamp; 
  quit; 

# Build the pepMwAa table

  hgsql proteins050415 -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm6 -e 'load data local infile "pepMwAa.tab" into table mm6.pepMwAa ignore 1 lines;'

# Build the pepPi table

  hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp050415 pepPi.tab

  hgsql mm6 -e 'load data local infile "pepPi.tab" into table mm6.pepPi;'


# Calculate and load pep distributions

  pbCalDist sp050415 proteins050415 10090 mm6 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql mm6

    load data local infile "pepExonCntDist.tab" into table mm6.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm6.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm6.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm6.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm6.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm6.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm6.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd sp050415 10090 mm6

# Create pbAnomLimit and pbResAvgStd tables

#  hgsql mm6 < ~/src/hg/lib/pbAnomLimit.sql
#  hgsql mm6 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm6 -e 'load data local infile "pbResAvgStd.tab" into table mm6.pbResAvgStd;'
   hgsql mm6 -e 'load data local infile "pbAnomLimit.tab" into table mm6.pbAnomLimit;'

# UPDATE kgSpAlias TABLE TO BE USED BY PB 

    cd /cluster/data/mm6/bed/pb
    # Collect (kgID, spID, alias) rows from kgAlias and kgProtAlias, each
    # joined to kgXref on kgID.  -N suppresses the column-header line at the
    # source; filtering the header out afterwards with grep -v 'kgID' would
    # also discard any data row that happens to contain that string.
    hgsql mm6 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm6 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    # Merge both result sets, dropping duplicate rows, then remove the
    # scratch file.
    sort -u j.tmp >mm6.kgSpAlias.tab
    rm j.tmp

    hgsql mm6 -e 'drop table kgSpAlias';
    hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias'
    gzip mm6.kgSpAlias.tab

# Create pbStamp table for PB
   
  hgsql mm6 < ~/src/hg/lib/pbStamp.sql
  hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab

  hgsql mm6 -e 'delete from pbStamp'
  hgsql mm6 -e 'load data local infile "pbStamp.tab" into table mm6.pbStamp'

# ENABLE PROTEOME BROWSER FOR mm6 IN HGCENTRALTEST (already done previously)
    echo "update dbDb set hgPbOk = 1 where name = 'mm6';" \
      | hgsql -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke Proteome Browser and adjust various drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for mm6, then
# notify QA for formal review.

# Update default Browser position
# bring up mySQL on genome-testdb and use hgcentraltest DB (done previously):

   update dbDb set defaultPos="chrX:87947304-87959012" where name="mm6";

# Create QA Push Queue entry with the following tables:

 ceBlastTab 
 cgapAlias 
 cgapBiocDesc 
 cgapBiocPathway 
 dmBlastTab 
 drBlastTab 
 dupSpMrna 
 foldUtr3 
 foldUtr5 
 gnfAtlas2Distance 
 hgBlastTab 
 keggMapDesc 
 keggPathway 
 kgAlias 
 kgProtAlias 
 kgProtMap 
 kgXref 
 knownBlastTab 
 knownCanonical 
 knownGene 
 knownGeneMrna 
 knownGenePep 
 knownIsoforms 
 knownToGenePix 
 knownToGnf1m 
 knownToGnfAtlas2 
 knownToLocusLink 
 knownToMOE430 
 knownToMOE430A 
 knownToPfam 
 knownToRefSeq 
 knownToU74 
 knownToXmBest 
 rinnSex 
 rnBlastTab 
 scBlastTab 
 spMrna

# END OF mm6 KG/GS/PB RE-BUILD. 6/1/05 Fan.
#####################################################################

####################################################################################
# RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (DONE 6/8/05 Fan)
   ssh hgwdev 
   cd /cluster/store10/kg/kgMm6B
   mkdir try2
   mv * try2

   hgsql mm6 -e 'create database kgMm6BTempTry2'

   hgsql kgMm6BTempTry2 -e 'drop table kgCandidate0'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate0.sql 
   hgsql kgMm6BTempTry2 -e  'load data local infile "try2/kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm6BTempTry2 -e 'drop table geneCheck'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "try2/kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck kgMm6BTempTry2 mm6 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidate'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgMm6BTempTry2 -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field. 
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab
   rm j1.tmp j2.tmp
   
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidateX'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

#   kgResultBestMrna2 050415 kgMm6BTempTry2 mm6|sort -u >protMrnaBlatScore.tab
#   kgResultBestRef2  050415 kgMm6BTempTry2 mm6|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
#   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm6BTempTry2 -e 'drop table protMrnaScore'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm6BTempTry2 -e 'load data local infile "try2/protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm6BTempTry2 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'


# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgMm6BTempTry2 kgCandidateX jY.tmp
   cat jY.tmp |sort -u >kgCandidateY.tab
#   rm jY.tmp
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidateY'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgMm6BTempTry2 kgCandidateZ.tab
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidateZ'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgMm6BTempTry2 -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mrna/protein pair for each unique CDS structure.

   kgPick kgMm6BTempTry2 mm6 proteins050415 kgTry2.tmp dupSpMrna.tmp

   # Split the kgPick output into rows that mention NM_ and rows that do not.
   grep NM_ kgTry2.tmp > jNM
   grep -v NM_ kgTry2.tmp > jnoNM
   # For the non-NM_ rows, reduce column 1 to the text after its first
   # underscore (a name without an underscore is kept as is).  The prefix is
   # stripped in place, one output line per input line, so jnoNM1 always has
   # as many lines as jnoNM2 and paste cannot pair a name with the wrong row,
   # even when the remaining text itself contains an underscore.
   cut -f 1 jnoNM | sed -e 's/^[^_]*_//' > jnoNM1
   cut -f 2-12 jnoNM > jnoNM2
   paste jnoNM1 jnoNM2 > kgTry2B.tmp
   # Append the NM_ rows unchanged.
   cat jNM >> kgTry2B.tmp

   sort -u dupSpMrna.tmp >dupSpMrna.tab
   hgsql mm6 -e  'drop table dupSpMrna'
   hgsql mm6 < ~/src/hg/lib/dupSpMrna.sql
   hgsql mm6 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Add entries in the put back list
# Obtain the mouse put back list from Mark and save it as kgPutBack.tab

   hgsql kgMm6BTempTry2 -e  'drop table kgPutBack'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgPutBack.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgPutBack.tab" into table kgPutBack'

   kgPutBack kgMm6BTempTry2 mm6 proteins050415 kgPutBack kgPutBack.gp

# Sort KG genes to make the kgTry2.gp table file.

   cat kgTry2B.tmp kgPutBack.gp >kgTry2C.tmp
  ~/kent/src/hg/protein/sortKg.pl kgTry2C.tmp >kgTry2.gp

# Manually edit to correct one line problem of O75438_BC009691

   hgsql kgMm6BTempTry2 -e  'drop table knownGene'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/knownGene.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgTry2.gp" into table knownGene'

# Load data into mm6 knownGene table.
   hgsql mm6 -e  'drop table knownGene'
   hgsql mm6 < ~/src/hg/lib/knownGene.sql
   hgsql mm6 -e  'load data local infile "kgTry2.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.
   hgsql kgMm6BTempTry2 -e  'drop table mrnaSeq'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/mrnaSeq.sql
#  hgsql kgMm6BTempTry2 -e  'load data local infile "try2/mrnaSeq.tab" into table mrnaSeq'
   hgsql kgMm6BTempTry2 -e  'load data local infile "/cluster/store10/kg/kgMm6A/mrnaSeq.tab" into table mrnaSeq'
   kgPepMrna kgMm6BTempTry2 mm6 050415

   hgsql mm6 -e  'drop table knownGeneMrna'
   hgsql mm6 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql mm6 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql mm6 -e  'drop table knownGenePep'
   hgsql mm6 < ~/src/hg/lib/knownGenePep.sql
   hgsql mm6 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'


# Build kgXref table

   kgXref2 kgMm6BTempTry2 050415 mm6

   hgsql mm6 -e  'drop table kgXref'
   hgsql mm6 < ~/src/hg/lib/kgXref.sql
   hgsql mm6 -e  'load data local infile "kgXref.tab" into table kgXref'

# Build kgProtMap table

    ~/src/hg/protein/kgProtMap2.sh kgMm6B mm6 050415

# Update and clean up kgResultBestMrna2.c and then check it in.

# Build spMrna table

   # Dump the unique (name, proteinID) pairs of knownGene, without a header
   # line (-N).  sort's output is redirected straight into the file; a pipe
   # placed in front of the redirection would keep the rows from reaching it.
   hgsql mm6 -N -e 'select name, proteinID from knownGene' | sort -u > kgSpMrna.tab

   hgsql mm6 -e  'drop table spMrna'
   hgsql mm6 <~/src/hg/lib/spMrna.sql
   hgsql mm6 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build mrnaRefseq table

   cd /cluster/store10/entrez
   mkdir 050601
   rm /cluster/data/entrez
   ln -s /cluster/store10/entrez/050601 /cluster/data/entrez
   cd /cluster/data/entrez

   # Fetch the three Entrez Gene dumps used below.  --timestamping makes
   # wget skip a file whose local copy is not older than the remote one.
   # (A bare "--" ends option parsing, so "-- timestamp" would make wget
   # treat "timestamp" as a URL instead of an option.)
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
   # Unpack in place; the cut/grep pipelines below read the plain files.
   gzip -d *.gz

   cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
   cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
   cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
   
   hgsql entrez -e 'drop table entrezRefseq'
   hgsql entrez -e 'drop table entrezMrna'
   hgsql entrez -e 'drop table entrezRefProt'

   hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
   hgsql entrez < ~/src/hg/lib/entrezMrna.sql
   hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

   hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
   hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
   hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

   hgsql entrez -N -e \
   'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
   >mrnaRefseq.tab

   hgsql mm6 -e 'drop table mrnaRefseq'
   hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql
   hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Build alias tables.		
#	kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
#	proteins050415.hugo.withdraws, mm6.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   cd /cluster/store10/kg/kgMm6B
   mkdir alias
   cd alias
   kgAliasM mm6 proteins050415

#	kgAliasKgXref reads from mm6.knownGene.proteinID,
#	mm6.knownGene.name, mm6.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm6

#	kgAliasRefseq reads from mm6.knownGene.name,
#	mm6.knownGene.proteinID, mm6.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm6

   hgsql sp050415 -N -e 'select name,gene.val from mm6.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql mm6 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql mm6 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm6 
   hgsql mm6 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm6 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm6.knownGene.name,
#	mm6.knownGene.proteinID, mm6.knownGene.alignID,
#	proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
#	to create kgProtAlias.tab

   kgProtAlias mm6 050415

   hgsql mm6 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql mm6 -N -e \
   'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
   hgsql mm6 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
   hgsql mm6 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm6 -e "drop table kgProtAlias;"
    hgsql mm6 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm6 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table

    # Collect (kgID, spID, alias) rows from both alias tables into j.tmp,
    # then write the de-duplicated result to mm6.kgSpAlias.tab.
    # -N suppresses the column-name header row, so there is no need to
    # filter it out afterwards with grep -v 'kgID' (which would also drop
    # any data row that happens to contain the text "kgID").
    hgsql mm6 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm6 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    sort -u j.tmp >mm6.kgSpAlias.tab
    rm j.tmp

    hgsql mm6 -e 'drop table kgSpAlias';
    hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias'

# MAKE FOLDUTR TABLES 
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/mm6/bed
    mkdir rnaStruct.2005-06-05
    rm rnaStruct
    ln -s rnaStruct.2005-06-05 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm6 knownGene utr3 utr3/utr.fa
    utrFa mm6 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm6/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 25133 of 25133 jobs
# CPU time in finished jobs:     554915s    9248.58m   154.14h    6.42d  0.018 y
# IO & Wait Time:                 67099s    1118.32m    18.64h    0.78d  0.002 y
# Average job time:                  25s       0.41m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6713s     111.88m     1.86h    0.08d
# Submission to last job:          7435s     123.92m     2.07h    0.09d

    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 23548 of 23548 jobs
# CPU time in finished jobs:     102308s    1705.14m    28.42h    1.18d  0.003 y
# IO & Wait Time:                 64370s    1072.83m    17.88h    0.75d  0.002 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            8211s     136.85m     2.28h    0.10d
# Submission to last job:          8311s     138.52m     2.31h    0.10d

# Load database
    ssh hgwdev
    cd /cluster/data/mm6/bed/rnaStruct/utr5
    hgLoadRnaFold mm6 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm6 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Build KEGG pathway tables.  
   # Work in a kegg/ subdirectory of the KG build directory on hgwdev.
   ssh hgwdev
   cd /cluster/store10/kg/kgMm6B
   # mkdir rather than "md": md is not a standard command and only works
   # where a personal alias for it exists.
   mkdir kegg
   cd kegg

   # Run the KEGG pathway script; the load commands that follow expect it
   # to have produced keggMapDesc.tab and keggPathway.tab in this directory.
   ~/src/hg/protein/KGpath.sh kgMm6B mm6 050415

   hgsql mm6 -e "drop table keggMapDesc"
   hgsql mm6 -e "drop table keggPathway"
   hgsql mm6 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm6 <~/src/hg/lib/keggPathway.sql
   hgsql mm6 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm6 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables
# RELOAD cgapAlias TABLE AS THERE ARE REPLICATE ROWS IN THE TABLE.
# (hartera, 2005-10-06)

   cd ..
   ~/src/hg/protein/KGcgap.sh kgMm6B mm6 050415
   hgsql mm6 -e "drop table cgapAlias"
   hgsql mm6 -e "drop table cgapBiocDesc"
   hgsql mm6 -e "drop table cgapBiocPathway"
   hgsql mm6 <~/src/hg/lib/cgapAlias.sql
   hgsql mm6 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql mm6 <~/src/hg/lib/cgapBiocPathway.sql
   # Remove replicate rows from cgapAlias and do a numeric sort for the IDs 
   # and reload the table (2005-10-06)
   # can not use sort -nu as this removes more rows than necessary
   sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
   hgsql mm6 -e 'load data local infile "cgapAliasSorted.tab" \
                 into table cgapAlias'
   hgsql mm6 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" \
                 into table cgapBiocDesc'
   hgsql mm6 -e 'load data local infile "cgapBIOCARTA.tab" \
                 into table cgapBiocPathway'
   
### MM6 PROTEOME BROWSER TABLES RE-BUILD ####  (DONE - 2005-06-06 - Fan)
# These are instructions for re-building tables 
# needed for the Proteome Browser to be used with mm6.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 050415.

# Create the working directory
   ssh hgwdev
   mkdir /cluster/data/mm6/bed/pb.2005-06-06
   cd /cluster/data/mm6/bed
   rm pb
   ln -s /cluster/data/mm6/bed/pb.2005-06-06 pb
   cd pb

# Define pep* tables in mm6 DB

#   cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

# delete from the following tables (previously built):

  hgsql mm6
  delete from  pepCCntDist ;
  delete from  pepExonCntDist ;
  delete from  pepHydroDist ;
  delete from  pepIPCntDist ;
  delete from  pepMolWtDist ;
  delete from  pepMwAa ;
  delete from  pepPi ;
  delete from  pepPiDist ;
  delete from  pepPred ;
  delete from  pepResDist ;
  delete from pbAnomLimit;
  delete from pbResAvgStd;
  delete from pbStamp; 
  quit; 

# Build the pepMwAa table

  hgsql proteins050415 -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm6 -e 'load data local infile "pepMwAa.tab" into table mm6.pepMwAa ignore 1 lines;'

# Build the pepPi table

  hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp050415 pepPi.tab

  hgsql mm6 -e 'load data local infile "pepPi.tab" into table mm6.pepPi;'


# Calculate and load pep distributions

  pbCalDist sp050415 proteins050415 10090 mm6 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql mm6

    load data local infile "pepExonCntDist.tab" into table mm6.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm6.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm6.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm6.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm6.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm6.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm6.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd sp050415 10090 mm6

# Create pbAnomLimit and pbResAvgStd tables

#  hgsql mm6 < ~/src/hg/lib/pbAnomLimit.sql
#  hgsql mm6 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm6 -e 'load data local infile "pbResAvgStd.tab" into table mm6.pbResAvgStd;'
   hgsql mm6 -e 'load data local infile "pbAnomLimit.tab" into table mm6.pbAnomLimit;'

# UPDATE kgSpAlias TABLE TO BE USED BY PB 

    # Regenerate mm6.kgSpAlias.tab in the Proteome Browser work directory:
    # collect (kgID, spID, alias) rows from both alias tables into j.tmp,
    # then write the de-duplicated result.
    # -N suppresses the column-name header row, so there is no need to
    # filter it out afterwards with grep -v 'kgID' (which would also drop
    # any data row that happens to contain the text "kgID").
    cd /cluster/data/mm6/bed/pb
    hgsql mm6 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm6 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    sort -u j.tmp >mm6.kgSpAlias.tab
    rm j.tmp

    hgsql mm6 -e 'drop table kgSpAlias';
    hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias'
    gzip mm6.kgSpAlias.tab

# Create pbStamp table for PB
   
  hgsql mm6 < ~/src/hg/lib/pbStamp.sql
  hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab

  hgsql mm6 -e 'delete from pbStamp'
  hgsql mm6 -e 'load data local infile "pbStamp.tab" into table mm6.pbStamp'

# ENABLE PROTEOME BROWSER FOR mm6 IN HGCENTRALTEST (already done previously)
    echo "update dbDb set hgPbOk = 1 where name = 'mm6';" \
      | hgsql -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke the Proteome Browser and adjust the drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for mm6, then
# notify QA for formal review.

# Update default Browser position
# bring up mySQL on genome-testdb and use hgcentraltest DB (done previously):

   update dbDb set defaultPos="chrX:87947304-87959012" where name="mm6";

# Create QA Push Queue entry with the following tables:

 ceBlastTab 
 cgapAlias 
 cgapBiocDesc 
 cgapBiocPathway 
 dmBlastTab 
 drBlastTab 
 dupSpMrna 
 foldUtr3 
 foldUtr5 
 gnfAtlas2Distance 
 hgBlastTab 
 keggMapDesc 
 keggPathway 
 kgAlias 
 kgProtAlias 
 kgProtMap 
 kgXref 
 knownBlastTab 
 knownCanonical 
 knownGene 
 knownGeneMrna 
 knownGenePep 
 knownIsoforms 
 knownToGenePix 
 knownToGnf1m 
 knownToGnfAtlas2 
 knownToLocusLink 
 knownToMOE430 
 knownToMOE430A 
 knownToPfam 
 knownToRefSeq 
 knownToU74 
 knownToXmBest 
 rinnSex 
 rnBlastTab 
 scBlastTab 
 spMrna

# END OF mm6 KG/GS/PB RE-BUILD. 6/6/05 Fan.
#####################################################################

## NIA Mouse Gene Index - (DONE - 2005-06-21 Fan)
#       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
    # Download the NIA gene index alignments and sequences into
    # bed/NIAGene on hgwdev, then load both into mm6.
    ssh hgwdev 
    mkdir -p /cluster/data/mm6/bed/NIAGene
    cd /cluster/data/mm6/bed/NIAGene
    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz
    # The download is gzipped; unpack it so that cut can read T-psl.txt.
    gzip -d T-psl.txt.gz

    # Keep only the first 21 tab-separated columns before loading.
    cut -f 1-21 T-psl.txt >NIAGene.tab
    hgLoadPsl mm6 NIAGene.tab

    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz
    gzip -d T-fasta.fa.gz
    
    # hgLoadSeq is given the /gbdb path, so link the fasta file there first.
    mkdir /gbdb/mm6/NIAGene
    ln -s /cluster/data/mm6/bed/NIAGene/T-fasta.fa /gbdb/mm6/NIAGene/T-fasta.fa
    
    hgLoadSeq mm6 /gbdb/mm6/NIAGene/T-fasta.fa

#   Create/edit/check in NIAGene.html and trackDb.ra under
#
#       kent/src/hg/makeDb/trackDb/mouse/mm6

# Update mrnaRefseq table (DONE - Fan 6/22/05)
# The old table contains non-mouse mrna/RefSeqs.
# The new table contains only mouse mrna/RefSeq and RefSeq/RefSeq.

# First build entrez DB tables, see the section on mrnaRefseq earlier 
# for details.

    ssh hgwdev
    cd /cluster/store10/kg/kgMm6B
    hgsql entrez -N -e \
    'select mrna, refseq from entrezRefseq, entrezMrna, mm6.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \
    >mrnaRefseq1.tab

# Include RefSeq as valid mRNA too.
    hgsql mm6 -N -e 'select name, name from refGene' >mrnaRefseq2.tab

    cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab

    hgsql mm6 -e 'drop table mrnaRefseq'
    hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql
    hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# BUILD KNOWN GENE LIST FOR GOOGLE.  DONE 6/27/05 Fan.

    cd /cluster/data/mm6/bed
    rm -rf knownGeneList/mm6

# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm6

    hgKnownGeneList mm6

# copy over to /usr/local/apache/htdocs

    rm -rf /usr/local/apache/htdocs/knownGeneList/mm6
    mkdir -p /usr/local/apache/htdocs/knownGeneList/mm6
    cp -Rfp knownGeneList/mm6/* /usr/local/apache/htdocs/knownGeneList/mm6

# Build kgReactome table for KG to Reactome xref.  Done 6/28/05 Fan.

# First, make sure the reactome DB is built.  See makeHg17.doc for details.

    ssh hgwdev
    mkdir -p /cluster/data/mm6/bed/reactome
    cd /cluster/data/mm6/bed/reactome

    hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, mm6.kgXref where identifier=spID' >kgReactome.tab;

    hgsql mm6 -e 'drop table kgReactome'
    hgsql mm6 < ~/src/hg/lib/kgReactome.sql
    hgsql mm6 -e 'load data local infile "kgReactome.tab" into table kgReactome'

#  miRNA track (DONE - 2005-06-29 - Fan)
    #   data from: Michel.Weber@ibcg.biotoul.fr
    #   notify them when done.
    # Set up the miRNA work directory and load the emailed track data.
    cd /cluster/data/mm6/bed
    mkdir miRNA
    cd miRNA
    # (manual step) save miRNA_track_mm6.txt file from email
    cp miRNA_track_mm6.txt miRNA.bed
    
# edit miRNA.bed to get rid of the top field description lines

    hgLoadBed mm6 miRNA miRNA.bed
    
# check previous release track before update
    nice featureBits mm5 miRNA
    # 17957 bases of 2615483787 (0.001%) in intersection

    # Name the track for mm6 as well, matching the mm5 command above.
    nice featureBits mm6 miRNA
    #19126 bases of 2597150411 (0.001%) in intersection

# ADDED THE EXONPRIMER TO QUICK LINKS SECTION OF KG DETAILS PAGE (05/07/11, Fan) 

# Added the following lines to links.ra under src/hg/hgGene/hgGeneData/Mouse/mm6

name exonPrimer
shortLabel ExonPrimer
tables kgXref
idSql select kgID from kgXref where kgID = '%s'
url http://ihg.gsf.de/cgi-bin/primer/ExonPrimerUCSC.pl?db=mm6&acc=%s
priority 95


# REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE PROTEINS MAPPED TO PFAM (DONE 7/14/05, Fan)
# hgMapViaSwissProt.c was updated to support this.
# Create table that maps between known genes and Pfam domains
~/bin/i386/hgMapViaSwissProt mm6 knownGene name proteinID Pfam knownToPfam
# row count is  24650

# SCDb CLONES (7/12/2005 Andy)
    cd /cluster/data/mm6/bed
    mkdir blat.SCDb-07-05-2005
    cd blat.SCDb-07-05-2005/
    ln -s `pwd` ~/scdb
    pushd /santest/scratch/andy
    wget http://stemcell.princeton.edu/download/scdb.fa.gz
    mkdir scdb
    faSplit sequence scdb.fa.gz 80 scdb/scdb_
    popd
    find /santest/scratch/andy/scdb -type f > scdb.lst
    find /panasas/store/mm6/nib -type f > mm6.lst
    cat << "_EOF_" > blat.sh
#!/bin/bash
cdir=${3%/*}
mkdir -p $cdir
blat -q=dna -t=dna -noHead -ooc=/iscratch/i/mm6/ooc/11.ooc $1 $2 $3
_EOF_
    cat << "_EOF_" > gsub
#LOOP
./blat.sh {check in exists $(path2)} {check in line+ $(path1)} {check out line /cluster/bluearc/andy/scdb.psl/$(root2)/$(root2)_$(root1).psl}
#ENDLOOP
_EOF_
    chmod +x blat.sh
    ssh kk
    cd /cluster/data/mm6/bed/blat.SCDb-07-05-2005
    gensub2 scdb.lst mm6.lst gsub spec
    para create spec
    para try
    para push
    para time
#Completed: 3200 of 3200 jobs
#CPU time in finished jobs:      24158s     402.64m     6.71h    0.28d  0.001 y
#IO & Wait Time:                 14437s     240.61m     4.01h    0.17d  0.000 y
#Average job time:                  12s       0.20m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             244s       4.07m     0.07h    0.00d
#Submission to last job:           727s      12.12m     0.20h    0.01d
    ssh hgwdev
    cd /cluster/data/mm6/bed/blat.SCDb-07-05-2005
    # See if things check out.
    find /cluster/bluearc/andy/scdb.psl -type f -exec cat '{}' ';' > scdb.all.psl
    pslReps -singleHit scdb.all.psl scdb.best.psl info.psr
    # All the original names
    grep '>' scdb.fa | sed 's/^>//' | cut -f1 -d' ' | sort | uniq > names.scdb
    # All the names from ones that hit.
    cut -f10 scdb.all.psl | sort | uniq > all.names.scdb
    # All the ones with a "best" hit.
    cut -f10 scdb.best.psl | sort | uniq > best.names.scdb
    # Yeah a bunch of them (4,443/37,386) are missing.  It seems many of the 
    # clones aren't from mouse anyways.
    mkdir ../scdb
    cp scdb.best.psl ../scdb/scdb.psl
    cp scdb.fa ../scdb/
    cp best.names.scdb ../scdb/
    cd ../scdb/
    faSomeRecords scdb.fa best.names.scdb scdb.best.fa
    rm scdb.fa
    mkdir /gbdb/mm6/scdb
    ln -s /cluster/data/mm6/bed/scdb/scdb.best.fa /gbdb/mm6/scdb/scdb.fa    
    hgLoadSeq mm6 /gbdb/mm6/scdb/scdb.fa
    # clean up the names... basically take the middle part out.   
    sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.best.fa > new.scdb.best.fa
    sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.psl > new.scdb.psl
    mv scdb.psl old.scdb.psl
    mv new.scdb.psl scdb.psl
    mv scdb.best.fa old.scdb.best.fa
    mv new.scdb.best.fa scdb.best.fa
    hgLoadPsl -table=scdb mm6 scdb.psl
    hgLoadSeq mm6 /gbdb/mm6/scdb/scdb.fa
#Warning: load of seq did not go as planned: 37381 record(s), 1 row(s) skipped, 0 warning(s) loading ./seq.tab
    # Oh well. 
    # Update 7/26/2005: I'm going more restrictive on the pslReps.
    ssh hgwdev
    cd /cluster/data/mm6/bed/blat.SCDb-07-05-2005
    pslReps -minCover=0.8 -singleHit scdb.all.psl tmp scdb.psr
    sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' tmp > scdb.psl
    rm tmp
    hgLoadPsl mm6 scdb.psl
    
## REBUILD NIA Mouse Gene Index - (DONE - 2005-07-20 Fan)
#       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
    # Keep the previous build as NIAGene_050621 and rebuild in a fresh
    # NIAGene directory.
    ssh hgwdev 
    cd /cluster/data/mm6/bed
    mv NIAGene NIAGene_050621
    mkdir NIAGene
    # Work inside the new directory: the /gbdb symlink made below points at
    # /cluster/data/mm6/bed/NIAGene/T-fasta.fa, so the downloads must land
    # here rather than in bed/.
    cd NIAGene

    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz
    # The download is gzipped; unpack it so that cut can read T-psl.txt.
    gzip -d T-psl.txt.gz

    # Keep only the first 21 tab-separated columns before loading.
    cut -f 1-21 T-psl.txt >NIAGene.tab
    hgLoadPsl mm6 NIAGene.tab

    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz
    gzip -d T-fasta.fa.gz
 
    # Re-point the /gbdb link at the freshly downloaded sequence file.
    rm /gbdb/mm6/NIAGene/T-fasta.fa
    ln -s /cluster/data/mm6/bed/NIAGene/T-fasta.fa /gbdb/mm6/NIAGene/T-fasta.fa
   
# Load the sequences.  PLEASE NOTE THE "-replace" OPTION SHOULD BE USED!!!   
    hgLoadSeq -replace mm6 /gbdb/mm6/NIAGene/T-fasta.fa

# BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR
# ZEBRAFISH (danRer3) (DONE, 2005-08-10, hartera)
# REMAKE AXTNET AND COPY TO DOWNLOADS. REMAKE MAFNET (DONE, 2005-08-17, hartera)
    ssh kkr1u00
    # Blastz uses lineage-specific repeats. There are none for mouse
    # and fish so use all repeats for each species as lineage-specific.
    mkdir -p /iscratch/i/mm6/linSpecRep.notInZebrafish
    foreach f (/panasas/store/mm6/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/mm6/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
    end

    # get only lineage specific repeats for chr1-25 and chrM
    mkdir -p /iscratch/i/danRer3/linSpecRep.notInMouse
    foreach f (/iscratch/i/danRer3/rmsk/chr[0-9M]*.fa.out)
      cp -p $f /iscratch/i/danRer3/linSpecRep.notInMouse/$f:t:r:r.out.spec
    end
    # make a nib dir that is also just chr1-25 and chrM
    mkdir -p /iscratch/i/danRer3/chromNib
    cp /cluster/data/danRer3/nib/chr[0-9M]*.nib /iscratch/i/danRer3/chromNib
    /cluster/bin/iSync
    
    ssh kkstore
    mkdir /cluster/data/mm6/bed/blastz.danRer3.2005-08-05
    cd /cluster/data/mm6/bed
    ln -s blastz.danRer3.2005-08-05 blastz.danRer3
    cd /cluster/data/mm6/bed/blastz.danRer3
    # use parameters as for mm5 - see makeMm5.doc
    # Write the DEF parameter file for the mm6 vs danRer3 (chr1-25, chrM)
    # blastz run; its path is passed to doBlastzChainNet.pl.
    # SEQ1_SMSK points at the mm6 repeat directory populated on kkr1u00 for
    # this run (/iscratch/i/mm6/linSpecRep.notInZebrafish), not the mm5 one.
    # NOTE(review): the original notes had /iscratch/i/mm5/... here - confirm
    # which directory the completed run actually used.
    cat << '_EOF_' > DEF
# mouse (mm6) vs zebrafish (danRer3)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse (mm6)
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mm6/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Zebrafish (danRer3)
# just chroms 1-25 and chrM
SEQ2_DIR=/iscratch/i/danRer3/chromNib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer3/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastz.danRer3

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF

    # Create the S1.len / S2.len size files that DEF names as SEQ1_LEN and
    # SEQ2_LEN.  S2.len is sorted in reverse numeric order on column 2;
    # -k2 replaces the obsolete "+1" key syntax, which current sort
    # implementations reject.
    cp /cluster/data/mm6/chrom.sizes ./S1.len
    sort -rn -k2 /cluster/data/danRer3/chrom.sizes > S2.len
    # make output directory
    mkdir -p /panasas/store/mm6vsdanRer3Out 
    # do blastz and create chains for danRer3 chr1-25 and chrM 
    # chickenHumanTuned.gap scoring matrix is now used by default 
    # by axtChain.
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -blastzOutRoot /panasas/store/mm6vsdanRer3Out -chainMinScore=5000 \
       -stop chainMerge >& do.log &
    #PID 31074 on kk, 
    # Started Fri Aug  5 21:18:13 PDT 2005
    # Finished Aug  6 06:31
    # for chr1-25 and chrM:
    # blastz run:
# para time
# Completed: 44023 of 44023 jobs
# CPU time in finished jobs:   12375882s  206264.70m  3437.75h  143.24d  0.392 y
# IO & Wait Time:                979190s   16319.83m   272.00h   11.33d  0.031 y
# Average job time:                 303s       5.06m     0.08h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1778s      29.63m     0.49h    0.02d
# Submission to last job:         32605s     543.42m     9.06h    0.38d
    # chain run:
# para time
# Completed: 40 of 40 jobs
# CPU time in finished jobs:       1075s      17.92m     0.30h    0.01d  0.000 y
# IO & Wait Time:                   243s       4.04m     0.07h    0.00d  0.000 y
# Average job time:                  33s       0.55m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              51s       0.85m     0.01h    0.00d
# Submission to last job:           107s       1.78m     0.03h    0.00d

    # then do a run with the zebrafish danRer3 NA and Un Scaffolds
    # that are in a 2bit file to do blastz and make chains.
    ssh kk
    mkdir -p /cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds
    cd /cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds
    # copy DEF file and edit for NA and Un scaffolds
    cat << '_EOF_' > DEF
# mouse (mm6) vs zebrafish (danRer3) NA and Un scaffolds only
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse (mm6)
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Zebrafish (danRer3)
# NA and Un Scaffolds in a 2bit file
SEQ2_DIR=/iscratch/i/danRer3/NAandUnScafs/danRer3NAandUnScaf.2bit
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF
    # get lengths for nibs and scaffolds
    cp /cluster/data/mm6/chrom.sizes ./S1.len
    # for S2.len, need sizes of the sequences in the 2bit file
    # Sort on the second field, numerically, largest first.  "-k2" is the
    # POSIX spelling of the obsolete "+1" key syntax, which current sort
    # implementations no longer accept by default.
    sort -rn -k2 /cluster/data/danRer3/NAandUnScafs.sizes > ./S2.len
    # make output directory
    mkdir -p /panasas/store/mm6vsdanRer3Out/NAandUnScafs
    # do blastz and create chains for danRer3 chrNA and chrUn scaffolds. 
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -blastzOutRoot /panasas/store/mm6vsdanRer3Out/NAandUnScafs \
       -chainMinScore=5000 -stop chainMerge >& do.log &
    #  Start Mon Aug  8 09:04
    #  Finish Aug  8 13:04
    # for NA and Un Scaffolds:
    # blastz run:
# para time
# Completed: 15226 of 15226 jobs
# CPU time in finished jobs:    6074532s  101242.21m  1687.37h   70.31d  0.193 y
# IO & Wait Time:                289788s    4829.79m    80.50h    3.35d  0.009 y
# Average job time:                 418s       6.97m     0.12h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            7795s     129.92m     2.17h    0.09d
# Submission to last job:         13851s     230.85m     3.85h    0.16d
    # chain run:
# para time
# Completed: 40 of 40 jobs
# CPU time in finished jobs:        270s       4.50m     0.08h    0.00d  0.000 y
# IO & Wait Time:                   252s       4.20m     0.07h    0.00d  0.000 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              29s       0.48m     0.01h    0.00d
# Submission to last job:            54s       0.90m     0.01h    0.00d

    # now need to do a liftUp to get the chromosomes co-ordinates
    # then merge together and continue on with net step
    ssh kkstore01
    cd /cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds/axtChain
    mkdir liftedChain
    foreach f (chain/*.chain)
       set c=$f:t:r
       echo $c
       liftUp -chainQ liftedChain/${c}.lifted.chain \
     /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
       warn $f
    end
    # now merge these lifted chain files with the existing chain files for 
    # the chroms and then sort by score using chainSort 
    cd /cluster/data/mm6/bed/blastz.danRer3/axtChain
    # gzipped file is only chains for chroms1-25 and chrM so rename
    mv mm6.danRer3.all.chain.gz mm6.danRer3.chroms.chain.gz
    mv chain chromChain
    mkdir chain chainUnSorted
    # get all chains to be merged in chainUnSorted dir
    cp ./chromChain/*.chain ./chainUnSorted/
    # copy scaffolds chains, these are *.lifted.chain so they do not 
    # write over the chrom chains.
    cp ../NAandUnScaffolds/axtChain/liftedChain/*.chain ./chainUnSorted/
    # then merge and sort all these chains. they must be merged and all 
    # sorted together so that all IDs are unique across all chroms.
    # IDs are reassigned by chainMergeSort so that IDs are unique.
    nice chainMergeSort chainUnSorted/*.chain | nice gzip -c \
           > mm6.danRer3.all.chain.gz
    # use chainSplit to split this into chains again
    zcat mm6.danRer3.all.chain.gz | chainSplit chain stdin 

    # then pick up the doBlastzChainNet.pl script with the net step
    ssh kk
    cd /cluster/data/mm6/bed/blastz.danRer3
    cp DEF DEF.chroms
    # edit DEF so SEQ2_DIR=/iscratch/i/danRer3/nib as need all nib files now
    # make sure that ~/.ssh/config has only user write permission and not
    # group otherwise the ssh will fail.
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -blastzOutRoot /panasas/store/mm6vsdanRer3Out -chainMinScore=5000 \
       -continue net >& doNet.log &
    # Start: Aug 10 13:34
    # Finished: Aug 10 13:54
    # it crashes at the cleanup step as it can not get to /panasas/ from 
    # kkstore01 - should specify a different fileServer for this step. 
    # run this step manually
    cd /cluster/data/mm6/bed/blastz.danRer3
    cleanUp.csh &
    # All done now 
    # check README.txt in downloads directory and also add html and 
    # trackDb.ra entry for chain and net tracks for danRer3
# featureBits -chrom=chr1 mm6 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 0.808%, chainDanRer3Link 5.196%, both 0.522%, cover 64.64%, 
# enrich 12.44x
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 0.818%, chainDanRer2Link 2.058%, both 0.546%, cover 66.75%, 
# enrich 32.43x
# featureBits -chrom=chr1 mm6 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 0.808%, chainDanRer2Link 6.412%, both 0.542%, cover 67.04%, 
# enrich 10.46x
    # Remake axtNet and then remake mafNet from these  (2005-08-17, harterA)
    # netToAxt was processing nets incorrectly so remake these with 
    # new version of netToAxt 
    # and transfer to downloads dir.
    ssh eieio
    cd /cluster/data/mm6/bed/blastz.danRer3
    rm -r axtNet
    # Make axtNet for download: one .axt per mm6 seq.
    # remake noClass.net
    #Make nets("noClass", i.e. without rmsk/class stats which are added later):
    cd axtChain
    chainPreNet mm6.danRer3.all.chain.gz /cluster/data/mm6/bed/blastz.danRer3/S1.len /cluster/data/mm6/bed/blastz.danRer3/S2.len stdout \
| chainNet stdin -minSpace=1 /cluster/data/mm6/bed/blastz.danRer3/S1.len /cluster/data/mm6/bed/blastz.danRer3/S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
    # create net for each chrom again
    netSplit noClass.net net
    # also split up chains again
    mkdir chain
    zcat mm6.danRer3.all.chain.gz | chainSplit chain stdin
    cd ..
    # make axtNet again using new version of netToAxt, the previous version was
    # not processing the nets correctly.
    mkdir axtNet
    foreach f (axtChain/net/*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
      /panasas/store/mm6/nib /iscratch/i/danRer3/nib stdout \
      | axtSort stdin stdout \
      | gzip -c > axtNet/$f:t:r.mm6.danRer3.net.axt.gz
    end
    # cleanup
    cd axtChain
    rm noClass.net
    rm -r net
    rm -r chain
    # remake mafNet from the new axtNet
    cd /cluster/data/mm6/bed/blastz.danRer3
    rm -r mafNet
    # Make mafNet for multiz: one .maf per mm6 seq.
    mkdir mafNet
    foreach f (axtNet/*.mm6.danRer3.net.axt.gz)
      axtToMaf -tPrefix=mm6. -qPrefix=danRer3. $f \
      /cluster/data/mm6/bed/blastz.danRer3/S1.len /cluster/data/mm6/bed/blastz.danRer3/S2.len \
      stdout \
      | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
    end
    # remove old axtNet downloads and add links to new axtNet files
    ssh hgwdev
    rm -r /usr/local/apache/htdocs/goldenPath/mm6/vsDanRer3/axtNet
    cd /usr/local/apache/htdocs/goldenPath/mm6/vsDanRer3
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm6/vsDanRer3/axtNet
    ln -s /cluster/data/mm6/bed/blastz.danRer3/axtNet/*.axt.gz axtNet/
    # make md5sum.txt again
    rm md5sum.txt
    md5sum *.gz */*.gz > md5sum.txt


#### LOAD ENSEMBL GENES (DONE - 2005-08-10 Fan)
# ADDED STABLE URL TO TRACKDB BLOCK (V32, JUL 2005) (2008-01-11, rhead)
#	needed for Gene Sorter procedure below
#	Ensembl released Mouse build 34 the week of August 10th, 2005
   mkdir -p /cluster/store11/mm6/bed/ensGene
   ln -s /cluster/store11/mm6/bed/ensGene /cluster/data/mm6/bed/ensGene
   cd /cluster/data/mm6/bed/ensGene

        Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
        Choose Ensembl 32 and Mus musculus, click next
        Follow this sequence through the pages:
        1) Select "Known genes" in the Gene section. Hit next.
        2) Select "Structures". 
        3) Choose GTF as the output, choose gzip compression, name the
	output file ensGeneMm6.gtf.gz and then hit Export

# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
     zcat ensGeneMm6.gtf.gz | grep -v ^6_DR51 | grep -v NT_ > unrandom.gtf
#	Let's see how much it loses:
#  	None.

# Add "chr" to front of each line in the gene data gtf file to make 
# it compatible with ldHgGene

    sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/" > ensGene.gtf
    ldHgGene mm6 ensGene ensGene.gtf
# Read 38200 transcripts in 674378 lines in 1 files
#   38200 groups 22 seqs 1 sources 4 feature types
# 38200 gene predictions

hgsql mm6 -N -e 'select * from ensGene' | sed -e 's/\./\t/' |\
cut -f 1,3-11 >ensGeneNew.tab

#	save space, gzip them:
    gzip unrandom.gtf
    gzip ensGene.gtf

# Load Ensembl peptides:
        Get the ensembl protein data from BioMart
        Choose Mus musculus as the organism
        Follow this sequence through the pages:
        1) Choose "Known genes". Hit next.
        2) Choose "Sequences" and "Peptide" and "Ensembl Transcript ID",
	     choose text/fasta and gzip compression,
	     name the file ensPep and then hit export.

     zcat ensPep.fasta.gz|faToTab -type=protein stdin j1.tmp
     cat j1.tmp|grep -v "SEQXENCEXNAVAILAXLE" >j2.tmp  
     cat j2.tmp |awk '{print ">" $1;print $2}' > ensPep.fa
     rm j1.tmp j2.tmp

     hgPepPred mm6 generic ensPep ensPep.fa

# Load ensGtp table.
    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  

    # Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
    # Choose Ensembl 32 and Mus musculus, click next
    # Follow this sequence through the pages:
    # 1) Select "Known genes" in the Gene section. Hit next.
    # 2) Select "Structures".
    # 3) select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. 
    # 4) select "Text, tab separated" and name the output file as "ensGtp"
    # 5) download the output file "ensGtp.tsv.gz"  

    gunzip ensGtp.tsv.gz
    hgsql mm6 < ~/kent/src/hg/lib/ensGtp.sql
    hgsql mm6 -N -e 'load data local infile "ensGtp.tsv" into table ensGtp ignore 1 lines;' 

# Create knownToEnsembl column
    hgMapToGene mm6 ensGene knownGene knownToEnsembl

# Compress everything to save space
    gzip *
    
#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2005-08-16 - Fan)

# PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSEMBL BIOMART DATA OF MOUSE BUILD 34.
# THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
# SINCE ENSEMBL CHANGED THE DATA FORMAT AGAIN (AS USUAL :-(  ), THERE IS NO VERSION NUMBER
# IN THEIR IDs, A FAKE "0" IS GENERATED FOR EACH ID IN ensemblXref3 TABLE.
    # Get the ensembl gene/protein cross-reference data BioMart
    # Follow this sequence through the pages:
    # 1) Make sure that the Mus musculus choice is selected. Hit next.
    # 2) Choose the "Feature" box, select Ensembl gene, transcript, and peptide IDs,
    #    SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
    # 3) Choose "Text, tab separated".  choose gzip compression.  hit export.
    # Save as ensXref

    # Load the BioMart export into the staging table, skipping its header
    # line.  The statement is SQL, so it has to be handed to hgsql rather
    # than typed at the shell prompt.
    # NOTE(review): assumes ensemblXref3Temp already exists in mm6 and that
    # the gzip-compressed export has been uncompressed to ensXref.tsv -- confirm.
    hgsql mm6 -e 'load data local infile "ensXref.tsv" into table ensemblXref3Temp ignore 1 lines;'

    hgsql mm6 -N -e 'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
    > ensemblXref3.tab

    hgsql mm6 -e 'drop table ensemblXref3'
    hgsql mm6 <~/src/hg/lib/ensemblXref3.sql
    hgsql mm6 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
    
# load the table into proteome DB also    
    hgsql proteome -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'

#### BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-08-17 - Fan)

# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk

    mkdir -p /cluster/store11/superfamily/050817
    ln -s /cluster/store11/superfamily/050817 /cluster/data/superfamily/050817
    cd /cluster/data/superfamily/050817

# add the following line to ~/.netrc

    machine supfam.mrc-lmb.cam.ac.uk login license password XXXXX

# ftp over to supfam.mrc-lmb.cam.ac.uk and get the following two files:

    supfam_14-Aug-2005.sql.gz
    ass_14-Aug-2005.tab.gz
    
    gzip -d *.gz
# Load the Superfamily database.
# Both downloads were uncompressed by the "gzip -d *.gz" above, so the plain
# .sql file is fed to hgsql directly (the .sql.gz no longer exists to zcat).
    hgsql mm6 -e "create database superfam050817"
    hgsql superfam050817 < supfam_14-Aug-2005.sql

# This may take about an hour.

# Make sure to add an index on id of the des table of superfam050817.
    hgsql superfam050817 -e "create index id on des(id);"
# ass_14-Aug-2005.tab is already uncompressed at this point; create the
# sfAssign table and load the assignments into it.
    hgsql superfam050817 < ~/src/hg/lib/sfAssign.sql
    hgsql superfam050817 -e \
    'load data local infile "ass_14-Aug-2005.tab" into table superfam050817.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB

   hgsql mm6 < ~/src/hg/lib/sfAssign.sql

   cd /cluster/data/superfamily/050817  
   hgsql mm6 -e 'load data local infile "ass_14-Aug-2005.tab" into table mm6.sfAssign;'

# If mm6.sfDes already exists, drop it.

   hgsql superfam050817 -e "select * from des" >sfDes.tab
   hgsql mm6 < ~/src/hg/lib/sfDes.sql
   hgsql mm6 -e 'load data local infile "sfDes.tab" into table mm6.sfDes ignore 1 lines;'

# If mm6.superfamily already exists, drop it.
   cd /cluster/data/mm6/bed
   # Create the dated directory here in bed/ so that the relative symlink
   # "sf" made on the next line actually resolves (it is created in bed/ and
   # points at a sibling named sf.2005-0817).
   mkdir /cluster/data/mm6/bed/sf.2005-0817
   ln -s sf.2005-0817 sf
   # Build the Superfamily track data; stdout is kept in sf.log.
   hgSuperfam mm6 superfam050817 > sf.log

# It is normal that many proteins do not have corresponding Superfamily entries.

# If mm6.sfDescription exists, drop it.

   hgsql mm6 < ~/src/hg/lib/sfDescription.sql
   hgsql mm6 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm6.sfDescription;'

# Finally, load the superfamily table.

   hgLoadBed mm6 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
   
   cat /cluster/data/superfamily/050817/ass_14-Aug-2005.tab | hgKnownToSuper mm6 mm stdin
# 21185 records output 


# RE-EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/11/05 angie)
    # originally done 3/14/05 -- redone 8/11/05 just in case latest RM version
    # has any updates, before aligning to canFam2.
    ssh kolossus
    cd /panasas/store/mm6/rmsk
    # Run Arian's DateRepsinRMoutput.pl to add extra columns telling 
    # whether repeats in -query are also expected in -comp species.  
    foreach outfl ( *.out )
        echo "$outfl"
        /cluster/bluearc/RepeatMasker/DateRepeats \
          ${outfl} -query mouse -comp dog
    end
    # Now extract dog (extra column 1):
    cd ..
    mkdir linSpecRep.notInDog
    foreach f (rmsk/*.out_canis-familiaris)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractRepeats 1 $f > \
		linSpecRep.notInDog/$base.out.spec
    end
    # Clean up.
    rm rmsk/*.out_canis*


# BLASTZ/CHAIN/NET CANFAM2 (DONE 8/12/05 angie)
    ssh kkstore01
    mkdir /cluster/data/mm6/bed/blastz.canFam2.2005-08-11
    cd /cluster/data/mm6/bed/blastz.canFam2.2005-08-11
    cat << '_EOF_' > DEF
# mouse vs. dog
# TARGET: Mouse
SEQ1_DIR=/panasas/store/mm6/nib
SEQ1_RMSK=/panasas/store/mm6/rmsk
SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInDog
SEQ1_LEN=/cluster/data/mm6/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_RMSK=/panasas/store/canFam2/rmsk
SEQ2_SMSK=/panasas/store/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastz.canFam2.2005-08-11
'_EOF_'
    # << for emacs
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/blastzMm6CanFam2Out >& do.log &
    ln -s blastz.canFam2.2005-08-11 /cluster/data/mm6/bed/blastz.canFam2

# RE-RUN NETTOAXT, AXTTOMAF FOR CANFAM2 (DONE 10/28/05 angie)
    # Kate fixed netToAxt to avoid duplicated blocks, which is important 
    # for input to multiz.  Regenerate maf using commands from sub-script 
    # netChains.csh generated by doBlastzChainNet.pl above.  
    ssh kolossus
    cd /cluster/data/mm6/bed/blastz.canFam2.2005-08-11/axtChain
    netSplit mm6.canFam2.net.gz net
    chainSplit chain mm6.canFam2.all.chain.gz
    cd ..
    mv axtNet axtNet.orig
    mkdir axtNet
    foreach f (axtChain/net/*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /panasas/store/mm6/nib /iscratch/i/canFam2/nib stdout \
      | axtSort stdin stdout \
      | gzip -c > axtNet/$f:t:r.mm6.canFam2.net.axt.gz
    end
    rm -r mafNet
    mkdir mafNet
    foreach f (axtNet/*.mm6.canFam2.net.axt.gz)
      axtToMaf -tPrefix=mm6. -qPrefix=canFam2. $f \
            /cluster/data/mm6/chrom.sizes /cluster/data/canFam2/chrom.sizes \
            stdout \
      | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
    end
    rm -r axtChain/{chain,net}/ axtNet.orig


#  UPDATE miRNA track (DONE - 2005-08-24 - Fan)
#   data from: Michel.Weber@ibcg.biotoul.fr
#   notify them when done.

   cd /cluster/data/mm6/bed
   cd miRNA

   mkdir old
   cp -p * old
   rm *
   
#  save miRNA_track_mm6_aug2005.txt file from email
   
   cp miRNA_track_mm6_aug2005.txt miRNA.tab
   vi miRNA.tab

# edit miRNA.tab to get rid of the top description lines
# and a few blank lines

   hgLoadBed mm6 miRNA miRNA.tab
    
# check previous release track before update

   nice featureBits mm5 miRNA
#  17957 bases of 2615483787 (0.001%) in intersection

   nice featureBits mm6 miRNA
#  20898 bases of 2597150411 (0.001%) in intersection

# ADD LINK TO GENENETWORK (Done. 9/6/05 Fan).

# Received the file, mouse.RefSeqId, list of RefSeq IDs from GeneNetwork.
# remove extra CR (or LF?) at end of the line.

    rmLf mouse.RefSeqId >mm6.geneNetworkId.tab

    hgsql mm6 -e 'drop table geneNetworkId'
    hgsql mm6 < ~/src/hg/lib/geneNetworkId.sql
    hgsql mm6 -e 'load data local infile "mm6.geneNetworkId.tab" into table geneNetworkId'


# JACKSON LABS / MGI REPRESENTATIVE TRANSCRIPT (DONE 9/29/05 angie)
# Genes reloaded 2/4/06 after Kayla found that exon starts were off-by-one.
    ssh kkstore01
    mkdir /cluster/data/mm6/bed/jaxRepTranscript
    cd /cluster/data/mm6/bed/jaxRepTranscript
    wget ftp://gondor.informatics.jax.org/pub/gbrowse/MGIrep-trans_bld34b.gff
    cat > parseJaxGFF.pl <<'_EOF_'
#!/usr/bin/perl -w
# Parse this particular flavor of GTF into our preferred flavor (stdout) 
# plus an association file (alias.tab) and a fixit SQL file (fixit.sql).  
# Note: for the rep transcript files only, must add 1 to each start coord.
#
# Input:  GFF lines (tab-separated, 9 columns) from files named on the
#         command line or stdin; only rep_transcript and rep_exon lines
#         are accepted, anything else is fatal.
# Output: stdout    - one GTF "exon" line per rep_exon line
#         alias.tab - "<name>\t<MGI id>" for the first occurrence of a name
#         fixit.sql - UPDATE statements that strip the uniquifying suffix
#                     from repeated transcript names after loading

use strict;

# Keep track of transcript names; our GFF-parsing code requires unique 
# transcript names but non-unique ones are used here.  Add uniquifying 
# suffix.  Rely on the fact that a REP_transcript line always immediately 
# precedes the REP_exon lines.  
my %txNameIndx;     # name -> number of times seen so far
my $tweakedName;    # suffixed name of the current transcript, if it repeats

open(OUT, ">alias.tab") || die "Can't open alias.tab for writing: $!\n";
open(SQL, ">fixit.sql") || die "Can't open fixit.sql for writing: $!\n";
while (<>) {
  chomp;
  my ($chr, undef, $type, $start, $end, undef, $strand, undef, $info) =
    split("\t");
  if ($type eq "rep_transcript") {
    my ($name, $mgiID);
    if ($info =~ /^REP_transcript ([^;]+); Note "[\w_.]+"; Note "(MGI:\d+)"; Note "[\w. ()-]+"$/) {
      ($name, $mgiID) = ($1, $2);
    } else {
      die "parse, line $.:\n$info\n";
    }
    if (defined $txNameIndx{$name}) {
      # Repeat of an earlier name: exons get "<name>_<n>" and the SQL file
      # records how to rename them back once the table is loaded.
      $tweakedName = $name . '_' . $txNameIndx{$name};
      print SQL "update jaxRepTranscript set name = '$name' " .
                "where name = '$tweakedName';\n";
    } else {
      undef $tweakedName;
      print OUT "$name\t$mgiID\n";
    }
    $txNameIndx{$name}++;
  } elsif ($type eq "rep_exon") {
    $type = "exon";
    my ($name, $mgiID);
    if ($info =~ /^REP_transcript ([^;]+); Note "[\w_.]+"; Note "(MGI:\d+)"; Note "[\w. ()-]+"$/) {
      ($name, $mgiID) = ($1, $2);
    } else {
      die "parse, line $.:\n$info\n";
    }
    if (defined $tweakedName) {
      # \Q...\E: treat the name as literal text, so regex metacharacters
      # in a transcript name cannot cause a spurious mismatch or a
      # regex compile error.
      if ($tweakedName !~ /^\Q${name}\E_\d+$/) {
        die "tweakedName $tweakedName does not start with name $name and " .
            " have a numeric suffix like expected";
      }
      $name = $tweakedName;
    }
    # Start coords in the rep transcript file are off by one (see header).
    $start++;
    print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t" .
          "gene_id \"$mgiID\"; transcript_id \"$name\";\n";
  } else {
    die "unrecognized type $type, line $.";
  }
}
close(OUT);
close(SQL);
'_EOF_'
    # << for emacs
    chmod a+x parseJaxGFF.pl
    parseJaxGFF.pl MGIrep-trans_bld34b.gff > jaxRepTranscript.gtf
    # Load up the genes and aliases, and use script-generated fixit.sql
    # to remove our uniquifying suffixes from the duplicated transcripts.
    ssh hgwdev
    cd /cluster/data/mm6/bed/jaxRepTranscript
    ldHgGene mm6 jaxRepTranscript jaxRepTranscript.gtf
    hgsql mm6 < fixit.sql
    sed -e 's/genericAlias/jaxRepTranscriptAlias/' \
      $HOME/kent/src/hg/lib/genericAlias.sql \
    | hgsql mm6
    hgsql mm6 -e \
      'load data local infile "alias.tab" into table jaxRepTranscriptAlias'


# JACKSON LABS / MGI ALLELE TRANSCRIPTS (DONE 11/15/05 angie)
    ssh kkstore01
    mkdir /cluster/data/mm6/bed/jaxAllele
    cd /cluster/data/mm6/bed/jaxAllele
    ftp gondor.informatics.jax.org
    # anonymous log in
      cd pub/gbrowse
      prompt
      mget allele_*
      bye
    # OK, need to ask Bob about how to parse those Note columns... 
    # looks like we could split some of the long names into mult. fields.
    cat > parseJaxGFF.pl <<'_EOF_'
#!/usr/bin/perl -w
# Parse this particular flavor of GTF into our preferred flavor (stdout) 
# plus *append* to an association file (alias.tab) and write a fixit SQL file 
# ($ARGV[0].sql -- first arg must be table name).  
#
# Input:  first arg is the table name; remaining args (or stdin) are GFF
#         lines (tab-separated, 9 columns) whose type ends in _transcript
#         or _exon; anything else is fatal.
# Output: stdout          - one GTF "exon" line per *_exon line
#         alias.tab       - appended "<name>\t<MGI id>\t<source>" for the
#                           first occurrence of a name
#         <tableName>.sql - UPDATE statements that strip the uniquifying
#                           suffix from repeated names after loading

use strict;

my $tableName = shift @ARGV;
die "need an argument (table name)" if (! defined $tableName);

# Keep track of transcript names; our GFF-parsing code requires unique 
# transcript names but non-unique ones are used here.  Add uniquifying 
# suffix.  Rely on the fact that a _transcript line always immediately 
# precedes the _exon lines.  
my %txNameIndx;     # name -> number of times seen so far
my $tweakedName;    # suffixed name of the current transcript, if it repeats

open(OUT, ">>alias.tab") || die "Can't open alias.tab for appending: $!\n";
open(SQL, ">$tableName.sql") || die "Can't open $tableName.sql for writing: $!\n";
while (<>) {
  chomp;
  my ($chr, undef, $type, $start, $end, undef, $strand, undef, $info) =
    split("\t");
  if ($type =~ /^\w+_transcript$/) {
    my ($name, $mgiID, $source);
    if ($info =~ /^\S+_transcript ([^<]+<?[^>]*>?)_\w+; Note "(MGI:\d+)"; Note "([^"]+)";$/) {
      ($name, $mgiID, $source) = ($1, $2, $3);
    } else {
      die "parse, line $.:\n$info\n";
    }
    if (defined $txNameIndx{$name}) {
      # Repeat of an earlier name: exons get "<name>_<n>" and the SQL file
      # records how to rename them back once the table is loaded.
      $tweakedName = $name . '_' . $txNameIndx{$name};
      print SQL "update $tableName set name = '$name' " .
                "where name = '$tweakedName';\n";
    } else {
      undef $tweakedName;
      print OUT "$name\t$mgiID\t$source\n";
    }
    $txNameIndx{$name}++;
  } elsif ($type =~ /^\w+_exon$/) {
    $type = "exon";
    my ($name, $mgiID);
    if ($info =~ /^\S+_transcript ([^<]+<?[^>]*>?)_\w+; Note "(MGI:\d+)"; Note "[^"]+";$/) {
      ($name, $mgiID) = ($1, $2);
    } else {
      die "parse, line $.:\n$info\n";
    }
    if (defined $tweakedName) {
      # \Q...\E: match the name as literal text.  This covers parentheses
      # as well as any other regex metacharacter (+, *, ?, [, ...) that a
      # name may contain.
      if ($tweakedName !~ /^\Q${name}\E_\d+$/) {
        die "tweakedName $tweakedName does not start with name $name and " .
            " have a numeric suffix like expected";
      }
      $name = $tweakedName;
    }
    print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t" .
          "gene_id \"$mgiID\"; transcript_id \"$name\";\n";
  } else {
    die "unrecognized type $type, line $.";
  }
}
close(OUT);
close(SQL);
'_EOF_'
    # << for emacs
    chmod a+x parseJaxGFF.pl
    cp /dev/null alias.tab
    foreach f (*.gff)
      set g = `echo $f | perl -wpe 's/allele_//; s/\.gff//; s/^(\w)/jaxAllele\u$1/;'`
      ./parseJaxGFF.pl $g $f > $g.gtf
    end
    # Load info table
    hgsql mm6 < $HOME/kent/src/hg/lib/jaxAlleleInfo.sql
    hgsql mm6 -e 'load data local infile "alias.tab" into table jaxAlleleInfo'
    # Make a single bed file with Allele type as extra column, for 
    # single track / Gene Sorter.
    ssh hgwdev
    cd /cluster/data/mm6/bed/jaxAllele
    cp /dev/null jaxAllele.bed
    foreach f (jax*.gtf)
      set type = `echo $f | sed -e 's/jaxAllele//; s/\.gtf//;'`
      ldHgGene mm6 $f:t:r $f -out=stdout \
      | /cluster/bin/scripts/genePredToBed \
      | sed -e 's/$/'"\t$type"'/' \
      >> jaxAllele.bed
    end
    sed -e 's/bed12Source/jaxAllele/g' \
      $HOME/kent/src/hg/lib/bed12Source.sql > jaxAllele.sql
    hgLoadBed -sqlTable=jaxAllele.sql mm6 jaxAllele jaxAllele.bed


# JACKSON LABS / MGI PHENOTYPE (DONE 11/15/05 angie)
    ssh kkstore01
    mkdir /cluster/data/mm6/bed/jaxPhenotype
    cd /cluster/data/mm6/bed/jaxPhenotype
    ftp gondor.informatics.jax.org
    # anonymous log in
      cd pub/gbrowse
      prompt
      mget *_phenotype.gff
      mget lethality* life* tumor*
      bye
    cat > parseJaxGFF.pl <<'_EOF_'
#!/usr/bin/perl -w
# Parse this particular flavor of GTF into our preferred flavor (stdout) 
# plus *append* to an association file (alias.tab) and write a fixit SQL file 
# ($ARGV[0].sql -- first arg must be table name).  
#
# Input:  first arg is the table name; remaining args (or stdin) are GFF
#         lines (tab-separated, 9 columns) whose type ends in _transcript
#         or _exon; anything else is fatal.
# Output: stdout          - one GTF "exon" line per *_exon line
#         alias.tab       - appended "<name>\t<MGI id>" for the first
#                           occurrence of a name
#         <tableName>.sql - UPDATE statements that strip the uniquifying
#                           suffix from repeated names after loading

use strict;

my $tableName = shift @ARGV;
die "need an argument (table name)" if (! defined $tableName);

# Keep track of transcript names; our GFF-parsing code requires unique 
# transcript names but non-unique ones are used here.  Add uniquifying 
# suffix.  Rely on the fact that a _transcript line always immediately 
# precedes the _exon lines.  
my %txNameIndx;     # name -> number of times seen so far
my $tweakedName;    # suffixed name of the current transcript, if it repeats

open(OUT, ">>alias.tab") || die "Can't open alias.tab for appending: $!\n";
open(SQL, ">$tableName.sql") || die "Can't open $tableName.sql for writing: $!\n";
while (<>) {
  chomp;
  my ($chr, undef, $type, $start, $end, undef, $strand, undef, $info) =
    split("\t");
  if ($type =~ /^\w+_transcript$/) {
    my ($name, $mgiID);
    if ($info =~ /^MP_\d+_transcript ([^;]+)_MP_\d+; Note "(MGI:\d+)"; Note "MP_\d+";$/) {
      ($name, $mgiID) = ($1, $2);
    } else {
      die "parse, line $.:\n$info\n";
    }
    if (defined $txNameIndx{$name}) {
      # Repeat of an earlier name: exons get "<name>_<n>" and the SQL file
      # records how to rename them back once the table is loaded.
      $tweakedName = $name . '_' . $txNameIndx{$name};
      print SQL "update $tableName set name = '$name' " .
                "where name = '$tweakedName';\n";
    } else {
      undef $tweakedName;
      print OUT "$name\t$mgiID\n";
    }
    $txNameIndx{$name}++;
  } elsif ($type =~ /^\w+_exon$/) {
    $type = "exon";
    my ($name, $mgiID);
    if ($info =~ /^MP_\d+_transcript ([^;]+)_MP_\d+; Note "(MGI:\d+)"; Note "MP_\d+";$/) {
      ($name, $mgiID) = ($1, $2);
    } else {
      die "parse, line $.:\n$info\n";
    }
    if (defined $tweakedName) {
      # \Q...\E: match the name as literal text, so regex metacharacters
      # in a name cannot cause a spurious mismatch or a regex compile error.
      if ($tweakedName !~ /^\Q${name}\E_\d+$/) {
        die "tweakedName $tweakedName does not start with name $name and " .
            " have a numeric suffix like expected";
      }
      $name = $tweakedName;
    }
    print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t" .
          "gene_id \"$mgiID\"; transcript_id \"$name\";\n";
  } else {
    die "unrecognized type $type, line $.";
  }
}
close(OUT);
close(SQL);
'_EOF_'
    # << for emacs
    chmod a+x parseJaxGFF.pl
    cp /dev/null alias.tab
    foreach f (*.gff)
      set g = `echo $f | perl -wpe 's/(_phenotype)?\.gff//; s/[_-](\w)/\u$1/g; s/^(\w)/jaxPhenotype\u$1/;'`
      ./parseJaxGFF.pl $g $f > $g.gtf
    end
    sort -u alias.tab > alias.unique.tab
    # Load up the aliases:
    ssh hgwdev
    cd /cluster/data/mm6/bed/jaxPhenotype
    sed -e 's/genericAlias/jaxPhenotypeAlias/' \
      $HOME/kent/src/hg/lib/genericAlias.sql \
    | hgsql mm6
    hgsql mm6 -e \
      'load data local infile "alias.unique.tab" into table jaxPhenotypeAlias'
    # Make a single bed file with phenotype as extra column, for single track 
    # / Gene Sorter.  Use Jim's abbreviations.
    ssh hgwdev
    cd /cluster/data/mm6/bed/jaxPhenotype
    cp /dev/null jaxPhenotype.bed
    foreach f (jax*.gtf)
      set type = `echo $f | sed -e 's@jaxPhenotype@@; s@\.gtf@@; \
                    s@AdiposeTissue@Adipose@; \
                    s@BehaviorNeurological@Behavior@; \
                    s@CardiovascularSystem@Cardiovascular@; \
                    s@DigestiveAlimentary@Digestive@; \
                    s@EndocrineExocrineGland@Gland@; \
                    s@GrowthSize@Growth Size@; \
                    s@HearingEar@Hearing/Ear@; \
                    s@HematopoieticSystem@Hematopoietic@; \
                    s@HomeostasisMetabolism@Homeostasis@; \
                    s@ImmuneSystem@Immune@; \
                    s@LethalityEmbryonicPerinatal@Embryonic Lethal@; \
                    s@LethalityPostnatal@Postnatal Lethal@; \
                    s@LifeSpanPostWeaningAging@Life Span@; \
                    s@LimbsDigitsTail@Limbs and Tail@; \
                    s@LiverBiliarySystem@Liver and Bile@; \
                    s@NervousSystem@Nervous System@; \
                    s@RenalUrinarySystem@Renal/Urinary@; \
                    s@ReproductiveSystem@Reproductive@; \
                    s@RespiratorySystem@Respiratory@; \
                    s@SkinCoatNails@Skin/Coat/Nails@; \
                    s@TasteOlfaction@Taste/Smell@; \
                    s@TouchVibrissae@Touch@; \
                    s@Tumorigenesis@Tumorigenesis@; \
                    s@VisionEye@Vision/Eye@;'`
      ldHgGene mm6 $f:t:r $f -out=stdout \
      | /cluster/bin/scripts/genePredToBed \
      | sed -e 's@$@'"\t$type"'@' \
      >> jaxPhenotype.bed
    end
    sed -e 's/bed12Source/jaxPhenotype/g' \
      $HOME/kent/src/hg/lib/bed12Source.sql > jaxPhenotype.sql
    hgLoadBed -tab -sqlTable=jaxPhenotype.sql mm6 jaxPhenotype \
      jaxPhenotype.bed


##########################################################################
# MGI ALLELE-PHENOTYPE MAPPING (DONE 7/18/07 angie)
    # Load in the mapping of alleles to phenotypes from an MGI report
    # file.  The file is independent of assembly version, but it uses 
    # some allele names that apparently have changed (or been added)
    # since mm6 tables were loaded.  So the coverage is not as complete
    # as for mm8.  Still, this info is very nice to have!
    cd /cluster/data/mm6/bed/jaxAllele/
    ln -s alias.tab jaxAlleleInfo.tab
    /cluster/data/mm8/bed/jax/2007_07/parsePhenotypicAllele.pl \
      /cluster/data/mm8/bed/jax/2007_07/MGI_PhenotypicAllele.rpt \
      > jaxAllelePheno.tab
    hgLoadSqlTab mm6 jaxAllelePheno \
      ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab
    runJoiner.csh mm6 jaxAllelePheno ~/kent/src/hg/makeDb/schema


##########################################################################
# NSCAN track - (2005-09-29 markd)  loaded proteins 2005-10-13
    cd /cluster/data/mm6/bed/nscan/
    # obtained NSCAN-EST predictions from michael brent's group at WUSTL
    wget http://genome.cse.wustl.edu/predictions/mouse/mm6_09_14_05/mm6Predictions.tar.gz 
    tar -zxf mm6Predictions.tar.gz 

    # change protein fasta file to have transcript id in header
    foreach f (chr_ptx/*.ptx)
        awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
    end

    ldHgGene -gtf -genePredExt mm6 nscanGene  chr_gtf/chr*.gtf
    hgPepPred mm6 generic nscanPep chr_ptx/chr*.fix
    rm -rf chr_* *.tab 

    # update trackDb; need a mm6-specific page to describe informants
    #   mouse/mm6/nscanGene.html
    #   mouse/mm6/trackDb.ra

# Create table that maps between known genes and visiGene database (DONE 2005-10-10 galt)
    knownToVisiGene mm6
    #Made hashes of image: geneImageHash 2117, locusLinkImageHash 780, refSeqImageHash 780, 
    #genbankImageHash 1301
    #knownToLocusLink 23124, knownToRefSeq 23124, knownToGene 250882
    
## REBUILD NIA Mouse Gene Index - (DONE - 2005-10-17 Fan)
#       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
    ssh hgwdev 
    cd /cluster/data/mm6/bed
    # keep the previous build under a dated name and start a fresh directory
    mv NIAGene NIAGene_050720
    mkdir NIAGene

    # work in the directory just created; the /gbdb symlink made below
    # points at /cluster/data/mm6/bed/NIAGene/T-fasta.fa
    cd NIAGene
    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz
    gzip -d T-psl.txt.gz

    # keep only the first 21 columns of the download before loading as PSL
    cut -f 1-21 T-psl.txt >NIAGene.tab
    hgLoadPsl mm6 NIAGene.tab

    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz
    gzip -d T-fasta.fa.gz
 
    # repoint the /gbdb link at the new sequence file, then reload it
    rm /gbdb/mm6/NIAGene/T-fasta.fa
    ln -s /cluster/data/mm6/bed/NIAGene/T-fasta.fa /gbdb/mm6/NIAGene/T-fasta.fa

    hgLoadSeq -replace mm6 /gbdb/mm6/NIAGene/T-fasta.fa

#  UPDATE miRNA track (DONE - 2005-10-20 - Fan)
#   data from: Michel.Weber@ibcg.biotoul.fr
#   notify them when done.

   cd /cluster/data/mm6/bed
   mv miRNA miRNA_050824
   mkdir miRNA
   cd miRNA

#  save miRNA_track_mm6_oct2005.txt file from email
#  MANUALLY EDIT ONE LINE PER WEBER's EMAIL OF 10/18/05.   
   
   cp miRNA_track_mm6_oct2005.txt miRNA.tab
   vi miRNA.tab

# edit miRNA.tab to get rid of the top description lines
# and a few blank lines
# and replace blank with tab

   hgLoadBed mm6 miRNA miRNA.tab
    
# check previous release track before update

   nice featureBits mm5 miRNA
#  17957 bases of 2615483787 (0.001%) in intersection

   nice featureBits mm6 miRNA
#  21167 bases of 2597150411 (0.001%) in intersection

#############################################################################
# Add TIGR MGI TC Alignments (In progress Oct 24, 2005 JK)
# These are clusters of ESTs and other sequences on the mouse genome.
# The tigrMgiTc.fa made here is also one of the inputs given to
# allenCollectSeq in the Allen Brain Atlas sections of this file.

# Create directory and download data into it.
   cd /cluster/data/mm6/bed
   mkdir tigrMgiTc
   cd tigrMgiTc
   wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/MGI.release_15.zip
   unzip MGI.release_15.zip

# Extract only the clusters of ESTs from their big sequence file
# that also includes singleton ESTs.  This is about 40% the size
# of the full file, and all we need.
# The name pattern is quoted so the shell hands the ? wildcards to
# faFilter instead of trying to glob them.
   faFilter '-name=TC???????' MGI.022505 tigrMgiTc.fa

# Split sequence into pieces for cluster run
   mkdir split
   faSplit sequence tigrMgiTc.fa 500 split/tc

# Set up cluster run
   ssh kk
   cd /cluster/data/mm6/bed/tigrMgiTc
   mkdir run
   cd run
   mkdir psl
# ls -S orders genome.lst by file size, largest first
   ls -1S /iscratch/i/mm6/chrom/*.fa > genome.lst
   ls ../split/*.fa > mrna.lst
# gsub is the job template given to gensub2 below together with the two
# lists; the quoted here-document word keeps $(path1) etc. literal.
   cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
   gensub2 genome.lst mrna.lst gsub spec
   para create spec
# Then do the usual para try/push/time/check until the run is finished
# Then do the usual para try/push/time/check until the run is finished

# Sort the per-job psl output and keep only the near-best-in-genome
# alignments; this step runs on the file server.
   ssh kkstore
   cd /cluster/data/mm6/bed/tigrMgiTc/run
   pslSort dirs raw.psl tmp psl
   pslReps raw.psl ../tigrMgiTc.psl \
       -nohead -minCover=0.25 -minAli=0.96 -nearTop=0.001 \
       /dev/null

# Remove the big intermediates: the raw psl, the per-job psl directory,
# the split query sequences and the MGI.*.022505 files next to the zip.
   rm -r psl ../split
   rm raw.psl ../MGI.022505 ../MGI.GO.022505 ../MGI.TC_EST.022505


#############################################################################
# Add NCBI XM_ alignments - note this is just to create files for
# the Allen Brain Atlas mapping.  It does not produce a track.

# Create directory and download XM_ sequence from NCBI
   ssh kk
   cd /cluster/data/mm6/bed
   mkdir ncbiXm
   cd ncbiXm
   wget ftp://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.rna.fna.gz

# Unzip, simplify fa headers, and filter out non-XM_ sequence, and split
# The download is filtered twice: XM_ names go to ncbiXm.fa and NM_ names
# to ncbiNm.fa.  Only ncbiXm.fa is split and aligned below; ncbiNm.fa is
# kept because allenCollectSeq (Allen Brain Atlas sections) reads it.
# The \* stops the shell from globbing the name pattern.
   mkdir split
   zcat mouse.rna.fna.gz | faNcbiToUcsc -wordBefore=ref stdin stdout | faFilter -name=XM_\* stdin ncbiXm.fa
   zcat mouse.rna.fna.gz | faNcbiToUcsc -wordBefore=ref stdin stdout | faFilter -name=NM_\* stdin ncbiNm.fa
   faSplit sequence ncbiXm.fa 150 split/xm

# Set up cluster job
   mkdir run
   cd run
   mkdir psl
# ls -S orders genome.lst by file size, largest first
   ls -1S /iscratch/i/mm6/chrom/*.fa > genome.lst
   ls ../split/*.fa > mrna.lst
# gsub is the job template given to gensub2 below; the quoted
# here-document word keeps $(path1) etc. literal.
   cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
   gensub2 genome.lst mrna.lst gsub spec
   para create spec
# Then do the usual para try/push/time/check until the run is finished

# Then do sorting and near-best-in-genome step on file server
   ssh kkstore
   cd /cluster/data/mm6/bed/ncbiXm/run
   pslSort dirs raw.psl tmp psl
   pslReps raw.psl ../ncbiXm.psl -nohead -minCover=0.50 -minAli=0.99 -nearTop=0.001 /dev/null


#############################################################################
# Create Allen Brain Atlas mapping. (Done 28 Oct 2005 JK)
# This needs to be done after having created the sequences in
# ncbiXm and tigrMgiTc as above.

# Set up directory
    ssh kk
    cd /cluster/data/mm6/bed
    mkdir allenBrain
    cd allenBrain

# Copy in allen20051021.tab file that was converted from
# spreadsheet mailed by Susan Sunkin <SusanS@alleninstitute.org>
# Also copy in probeSeq.20051027.fasta, also from Susan.

# Create a list of probe sequences filling ones missing from
# probeSeq.20051027.fasta
# with some NCBI and TIGR files, and some downloaded one at a time.
# Of the trailing arguments, allProbes.fa is what gets aligned below and
# allenBrainUrl.tab is what gets loaded into the allenBrainUrl table;
# allProbes.tab and missing.tab are presumably outputs as well.
     allenCollectSeq allen20051021.tab probeSeq.20051027.fasta ../ncbiXm/ncbiNm.fa ../ncbiXm/ncbiXm.fa ../tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab
    
# Set up a blat run to align the probes.
    cd /cluster/data/mm6/bed/allenBrain
    mkdir split
    faSplit sequence allProbes.fa 200 split/rp
    mkdir run
    cd run
    ls -1 ../split/*.fa > mrna.lst
    ls -1 /iscratch/i/mm6/chrom/*.fa > genome.lst
    mkdir psl
# gsub is the job template given to gensub2 below; the quoted
# here-document word keeps $(path1) etc. literal.
    cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
# Then do the usual para try/push/time/check until the run is finished
#Completed: 7800 of 7800 jobs
#CPU time in finished jobs:     105907s    1765.12m    29.42h    1.23d  0.003 y
#IO & Wait Time:                447478s    7457.96m   124.30h    5.18d  0.014 y
#Average job time:                  71s       1.18m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             227s       3.78m     0.06h    0.00d
#Submission to last job:          1292s      21.53m     0.36h    0.01d

# Then do sorting and near-best-in-genome step on file server
    ssh kkstore
    cd /cluster/data/mm6/bed/allenBrain/run
    pslSort dirs raw.psl tmp psl
    pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
# order by psl column 14, then numerically by column 16, into the file
# that is loaded as the allenBrainAli table
    sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
   rm raw.psl
   rm -r psl
   rm -r ../split

# Load up database
   ssh hgwdev
   cd /cluster/data/mm6/bed/allenBrain

# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
# (the .sql file creates the table, the second command fills it from the
#  allenBrainUrl.tab written by allenCollectSeq)
   hgsql mm6 < ~/kent/src/hg/lib/allenBrainUrl.sql
   hgsql mm6 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'

# Make probe alignment table, and load sequence.
# NOTE: the ln -s needs /gbdb/mm6/allenBrain to exist already; no mkdir
# for it appears in this section.
   hgLoadPsl mm6 allenBrainAli.psl
   ln -s /cluster/data/mm6/bed/allenBrain/allProbes.fa /gbdb/mm6/allenBrain/allProbes.fa
   hgLoadSeq mm6 /gbdb/mm6/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain
   hgMapToGene mm6 allenBrainAli -type=psl knownGene knownToAllenBrain 

###########################################################################
# RIKEN CAGE STUFF (DONE 11-16-2005 Andy)
#	Dump the three RIKEN CAGE tables from mm5, lift them to mm6 with
#	the mm5ToMm6 chain, and load the lifted files into mm6.
    cd /cluster/data/mm6/bed
    mkdir rikenCageCtss
    cd rikenCageCtss/
    #	tail -n +2 starts at the second line of the hgsql output
    #	(presumably dropping the column-name header).  The older bare
    #	"tail +2" spelling is not accepted by current GNU tail.
    #	cut -f2- removes the first column of rikenCageTc (presumably bin).
    hgsql mm5 -e 'select * from rikenCageTc' | cut -f2- | tail -n +2 > rikenCageTc.mm5.bed
    hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssMinus' \
       | tail -n +2 > minus.mm5.bed
    hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssPlus' \
       | tail -n +2 > plus.mm5.bed
    liftOver rikenCageTc.mm5.bed /gbdb/mm5/liftOver/mm5ToMm6.over.chain rikenCageTc.mm6.bed \
       rikenCageTc.mm6.missed
    liftOver plus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm6.over.chain plus.mm6.bed \
       plus.mm6.missed
    liftOver minus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm6.over.chain minus.mm6.bed \
       minus.mm6.missed
    #	see how much failed to lift
    wc -l *.missed
    hgLoadBed mm6 rikenCageTc rikenCageTc.mm6.bed
    hgLoadBed -strict -bedGraph=4 mm6 rikenCageCtssMinus minus.mm6.bed 
    hgLoadBed -strict -bedGraph=4 mm6 rikenCageCtssPlus plus.mm6.bed 

###########################################################################
# BLASTZ HUMAN Hg17 second time correctly
#	The initial run was done at a time when there was an error in
#	the processing scripts and the lineage specific repeats were not
#	handled correctly.  This re-work produces better chains and nets
#	as the lineage specific repeats are handled properly, also set
#	the chain minScore and linearGap at better settings.
#	 (DONE - 2005-11-30 - 2005-12-08 - Hiram)

    ssh pk
    mkdir /cluster/data/mm6/bed/blastzHg17.2005-11-30
    cd /cluster/data/mm6/bed/blastzHg17.2005-11-30

    #	DEF is the parameter file given (by full path) to
    #	doBlastzChainNet.pl below; BASE in it names this run directory.
    cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
    
# TARGET: Mouse Mm6
SEQ1_DIR=/san/sanvol1/scratch/mm6/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInHuman
SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Human Hg17 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse
SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzHg17.2005-11-30
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    #	first pass, asked to stop at the net step; all output of the
    #	script is captured in to-net.out
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
        -stop=net `pwd`/DEF > to-net.out 2>&1
    #	recover from broken blastz run, then continue
    #	(same options, restarted with -continue=cat)
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
        -continue=cat -stop=net `pwd`/DEF > cat-to-net.out 2>&1
    #	Manually loading to put these in a different table name than the
    #	existing track
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzHg17.2005-11-30/axtChain
    #	Copy the loadUp.csh used in Mm7 and alter the script:
    cp /cluster/data/mm7/bed/blastzHg17.2005-11-14/axtChain/loadUp.csh .
    #	set mm6 data base and table names to chainHg17u1 and netHg17u1
    #	and proper path names to here
    #	then run the script
    time ./loadUp.csh > loadUp.out

    ssh kolossus
    cd /cluster/data/mm6/bed/blastzHg17.2005-11-30
    #	HGDB_CONF is set for this one command only; the measurement runs
    #	in the background and its output (stdout and stderr) ends up in
    #	the fb.* file
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits mm6 chainHg17u1Link > fb.mm6.chainHg17u1Link 2>&1 &
    #	989964288 bases of 2597150411 (38.117%) in intersection
    #	Previously with the broken lineage specific repeats, this was:
    #	966916309 bases of 2597150411 (37.230%) in intersection
    
    #	Move the existing swap directory out of the way
    #	(the -swap run below is followed by a load from
    #	/cluster/data/hg17/bed/blastz.mm6.swap)
    ssh pk
    cd /cluster/data/hg17/bed
    mv blastz.mm6.swap blastz.mm6.swap.2005-03-29

    cd /cluster/data/mm6/bed/blastzHg17.2005-11-30
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
        -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1

    #	Manually load to change tables loaded into
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm6.swap/axtChain
    #	Copy loadUp script used in mm7, change db to mm6 and table names
    #	to be chainMm6u1 and netMm6u1
    cp /cluster/data/hg17/bed/blastz.mm7.swap/axtChain/loadUp.csh .
    time ./loadUp.csh > loadUp.out
    #	79 minute load time
    #	The result file is named for the database being measured (hg17),
    #	matching the fb.<db>.<table> naming of the other featureBits
    #	runs in this file.
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits hg17 chainMm6u1Link > fb.hg17.chainMm6u1Link 2>&1 &
    #	992497149 bases of 2866216770 (34.627%) in intersection
    #	With broken lineage specific repeats, and different chain
    #	minScore and linearGap settings, this measurement was:
    #	969459954 bases of 2866216770 (33.824%) in intersection

    #	prepare new downloads and clean up
    #	The old vsHg17 / vsMm6 download directories are renamed first.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm6
    mv vsHg17 vsHg17.old 
    # There is no file in
    # /usr/local/apache/htdocs/goldenPath/mm6/liftOver
    #	this information was lost in cleaning over the summer
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mv vsMm6 vsMm6.old
    # There is no file in
    # /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    #	this information was lost in cleaning over the summer

    ssh pk
    cd /cluster/data/mm6/bed/blastzHg17.2005-11-30
    #	"-continue download" is written with a space here; other sections
    #	of this file write -continue=download (presumably equivalent)
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	    -continue download `pwd`/DEF > download-clean.out 2>&1
    #	2 minutes
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	    -swap -continue download `pwd`/DEF > swap.download-clean.out 2>&1
    #	2m 20s
    #	Check the README information in 
    #	/usr/local/apache/htdocs/goldenPath/mm6/vsHg17
    #	/usr/local/apache/htdocs/goldenPath/hg17/vsMm6
    #	Use that information to place the matrix definition in the
    #	trackDb chain html files

# QA UPDATE: (2-14-2006 ASZ)
# the tables from this run were originally named with a "u1"
# e.g. chr1_chainMm6u1 (in the hg17 database)
# I have deleted the old chain and net tables (from the original blastz run)
# > DROP TABLE netMm6;
# and renamed the new chain and net tables from this new blastz run
# > RENAME TABLE netMm6u1 TO netMm6;



###########################################################################
# BLASTZ CHICKEN GalGal2 second time correctly
#	The initial run was done at a time when there was an error in
#	the processing scripts and the lineage specific repeats were not
#	handled correctly.  This re-work produces better chains and nets
#	as the lineage specific repeats are handled properly, also set
#	the chain minScore and linearGap at better settings.
#	 (DONE - 2005-11-30 - 2005-12-09 - Hiram)

    ssh pk
    mkdir /cluster/data/mm6/bed/blastzGalGal2.2005-11-30
    cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30

    #	DEF is the parameter file given (by full path) to
    #	doBlastzChainNet.pl below.  BASE must name this run directory,
    #	as it does in the Hg17, Rn3 and CanFam2 DEF files.
    cat << '_EOF_' > DEF
# mouse vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
    
# TARGET: Mouse Mm6
SEQ1_DIR=/san/sanvol1/scratch/mm6/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInChicken
SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000


# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at
# once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzGalGal2.2005-11-30
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    #	first pass, asked to stop at the net step
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
        -stop=net `pwd`/DEF > to-net.out 2>&1
    #	recover from network slowness in making the net file appear
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
        -continue=net -stop=net `pwd`/DEF > net.out 2>&1

    #	Manually loading to put these in a different table name than the
    #	existing track
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30/axtChain
    #	Copy the loadUp.csh used in Mm7 and alter the script:
    #	(the script sits directly in the mm7 run's axtChain directory,
    #	as in the Hg17, Rn3 and CanFam2 sections)
    cp \
/cluster/data/mm7/bed/blastzGalGal2.2005-11-14/axtChain/loadUp.csh .
    #	set mm6 data base and table names to chainGalGal2u1 and netGalGal2u1
    #	and proper path names to here
    #	then run the script
    time ./loadUp.csh > loadUp.out

    #	measure coverage of the newly loaded chain table
    ssh kolossus
    cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30
    #	HGDB_CONF applies to this one command; runs in the background
    #	with all output captured in the fb.* file
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits mm6 chainGalGal2u1Link > fb.mm6.chainGalGal2u1Link 2>&1 &
    #	77836209 bases of 2597150411 (2.997%) in intersection
    #	Previously with the broken lineage specific repeats and
    #	different linearGap matrix, this was:
    #   82018349 bases of 2597150411 (3.158%) in intersection

    #	Move the existing swap directory out of the way
    ssh pk
    cd /cluster/data/galGal2/bed
    mv blastz.mm6.swap blastz.mm6.swap.2005-04-04

    cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30

    #	swapped (chicken-referenced) run, asked to stop at the net step
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=loose \
	    -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1

    #	Manually load to change tables loaded into
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.mm6.swap/axtChain
    #	Copy loadUp script used in mm7, change db to mm6 and table names
    #	to be chainMm6u1 and netMm6u1
    cp /cluster/data/galGal2/bed/blastz.mm7.swap/axtChain/loadUp.csh .
    time ./loadUp.csh > loadUp.out 2>&1

    ssh kolossus
    cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30
    #	The result file is named for the database being measured
    #	(galGal2), as fb.rn3.* and fb.canFam2.* are in the Rn3 and
    #	CanFam2 sections.
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits galGal2  chainMm6u1Link > fb.galGal2.chainMm6u1Link 2>&1 &
    #	70147509 bases of 1054197620 (6.654%) in intersection
    #	Previously, with broken lineage specific repeats and different
    #	minScore and linearGap, this was:
    #   72687426 bases of 1054197620 (6.895%) in intersection

    ssh hgwdev
    #	move the existing data out of the way
    cd /usr/local/apache/htdocs/goldenPath/galGal2
    mv vsMm6 vsMm6.old
    cd /usr/local/apache/htdocs/goldenPath/mm6
    mv vsGalGal2 vsGalGal2.old

    ssh pk
    cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30

    #	restart at the download step, once for the mm6-referenced run
    #	and once with -swap
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=loose \
	   -continue=download `pwd`/DEF > download.out 2>&1
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=loose \
	   -swap -continue=download `pwd`/DEF > swap.download.out 2>&1


# QA UPDATE: (3-1-2006 ASZ)
# the tables from this run were originally named with a "u1"
# e.g. chr1_chainMm6u1 (in the galGal2 database)
# I have deleted the old chain and net tables (from the original blastz run)
# > DROP TABLE netMm6;
# and renamed the new chain and net tables from this new blastz run
# > RENAME TABLE netMm6u1 TO netMm6;


###########################################################################
# BLASTZ Rat Rn3 second time correctly
#	The initial run was done at a time when there was an error in
#	the processing scripts and the lineage specific repeats were not
#	handled correctly.  This re-work produces better chains and nets
#	as the lineage specific repeats are handled properly, also set
#	the chain minScore and linearGap at better settings.
#	 (DONE - 2005-11-30 - 2005-12-09 - Hiram)

    ssh pk
    mkdir /cluster/data/mm6/bed/blastzRn3.2005-11-30
    cd /cluster/data/mm6/bed/blastzRn3.2005-11-30

    #	DEF is the parameter file given (by full path) to
    #	doBlastzChainNet.pl below; BASE in it names this run directory.
    cat << '_EOF_' > DEF
 # mouse vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
    
# TARGET: Mouse Mm6
SEQ1_DIR=/san/sanvol1/scratch/mm6/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInRat
SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole
# pieces
SEQ2_DIR=/san/sanvol1/scratch/rn3/softNib
SEQ2_SMSK=/san/sanvol1/scratch/rn3/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzRn3.2005-11-30
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    #	first pass, asked to stop at the net step; all output of the
    #	script is captured in to-net.out
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
        -stop=net `pwd`/DEF > to-net.out 2>&1

    #	Manually loading to put these in a different table name than the
    #	existing track
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzRn3.2005-11-30/axtChain
    #	Copy the loadUp.csh used in Mm7 and alter the script:
    cp /cluster/data/mm7/bed/blastzRn3.2005-11-14/axtChain/loadUp.csh .
    #	set mm6 data base and table names to chainRn3u1 and netRn3u1
    #	and proper path names to here
    #	then run the script
    time ./loadUp.csh > loadUp.out

    ssh kolossus
    cd /cluster/data/mm6/bed/blastzRn3.2005-11-30
    #	HGDB_CONF applies to this one command; runs in the background
    #	with all output captured in the fb.* file
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits mm6 chainRn3u1Link > fb.mm6.chainRn3u1Link 2>&1 &
    #	1768516862 bases of 2597150411 (68.095%) in intersection
    #	Previously, with the broken lineage specific repeats and
    #	different minScore and linearGap, this was:
    #	1802980225 bases of 2597150411 (69.421%) in intersection

    #	Move the existing swap directory out of the way
    ssh pk
    cd /cluster/data/rn3/bed
    mv blastz.mm6.swap blastz.mm6.swap.2005-03-29

    cd /cluster/data/mm6/bed/blastzRn3.2005-11-30

    #	swapped (rat-referenced) run, asked to stop at the net step
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	    -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1

    #	Manually load to change tables loaded into
    ssh hgwdev
    cd /cluster/data/rn3/bed/blastz.mm6.swap/axtChain
    #	Copy loadUp script used in mm7, change db to mm6 and table names
    #	to be chainMm6u1 and netMm6u1
    cp /cluster/data/rn3/bed/blastz.mm7.swap/axtChain/loadUp.csh .
    time ./loadUp.csh > loadUp.out 2>&1

    ssh kolossus
    cd /cluster/data/mm6/bed/blastzRn3.2005-11-30
    #	result file is named for the database being measured (rn3)
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits rn3 chainMm6u1Link > fb.rn3.chainMm6u1Link 2>&1 &
    #	1780302108 bases of 2571104688 (69.243%) in intersection
    #	This was previously, with broken lineage specific repeats and
    #	different chain minScore and linearGap:
    #	1812992492 bases of 2571104688 (70.514%) in intersection

    #	move existing downloads out of the way:
    #	(the Hg17, GalGal2 and CanFam2 sections log in to hgwdev before
    #	touching the goldenPath htdocs tree; the preceding step here
    #	was on kolossus)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm6
    mv vsRn3 vsRn3.old
    cd /usr/local/apache/htdocs/goldenPath/rn3
    mv vsMm6 vsMm6.old

    #	restart at the download step, once for the mm6-referenced run
    #	and once with -swap
    ssh pk
    cd /cluster/data/mm6/bed/blastzRn3.2005-11-30
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	   -continue=download `pwd`/DEF > download.out 2>&1

    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	   -swap -continue=download `pwd`/DEF > swap.download.out 2>&1


# QA UPDATE: (3-1-2006 ASZ)
# the tables from this run were originally named with a "u1"
# e.g. chr1_chainMm6u1 (in the rn3 database)
# I have deleted the old chain and net tables (from the original blastz run)
# > DROP TABLE netMm6;
# and renamed the new chain and net tables from this new blastz run
# > RENAME TABLE netMm6u1 TO netMm6;


###########################################################################
# BLASTZ DOG canFam2 second time correctly
#	The initial run was done at a time when there was an error in
#	the processing scripts and the lineage specific repeats were not
#	handled correctly.  This re-work produces better chains and nets
#	as the lineage specific repeats are handled properly, also set
#	the chain minScore and linearGap at better settings.
#	 (DONE - 2005-12-02 - 2005-12-09 - Hiram)

    ssh pk
    mkdir /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
    cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02

    #	DEF is the parameter file given (by full path) to
    #	doBlastzChainNet.pl below; BASE in it names this run directory.
    cat << '_EOF_' > DEF
# mouse vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
    
# TARGET: Mouse Mm6
SEQ1_DIR=/san/sanvol1/scratch/mm6/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInDog
SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole
# pieces
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzCanFam2.2005-12-02
TMPDIR=/scratch/tmp

'_EOF_'
    # << emacs

    #	first pass, asked to stop at the net step; all output of the
    #	script is captured in to-net.out
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	    -stop=net `pwd`/DEF > to-net.out 2>&1

    #	Manually loading to put these in a different table name than the
    #	existing track
    ssh hgwdev
    cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02/axtChain
    #	Copy the loadUp.csh used in Mm7 and alter the script:
    cp /cluster/data/mm7/bed/blastzCanFam2.2005-11-14/axtChain/loadUp.csh .
    #	set mm6 data base and table names to chainCanFam2u1 and
    #	netCanFam2u1
    #	and proper path names to here
    #	then run the script
    time ./loadUp.csh > loadUp.out

    #	Measure coverage of the newly loaded chain table.  The run
    #	directory for this section is dated 2005-12-02 (see the mkdir
    #	at the top of the section), so cd there.
    ssh kolossus
    cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits mm6 chainCanFam2u1Link > fb.mm6.chainCanFam2u1Link 2>&1 &
    #	829007305 bases of 2597150411 (31.920%) in intersection
    #	Previously on canFam1, broken lineage specific repeats,
    #	different minScore and linearGap, this was:
    #   798637320 bases of 2597150411 (30.751%) in intersection

    #	Move the existing swap directory out of the way
    ssh pk
    cd /cluster/data/canFam2/bed
    mv blastz.mm6.swap blastz.mm6.swap.2005-11-01

    cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
    #	swapped (dog-referenced) run, asked to stop at the net step
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	    -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1

    #	Manually load to change tables loaded into
    ssh hgwdev
    cd /cluster/data/canFam2/bed/blastz.mm6.swap/axtChain
    #	Copy loadUp script used in mm7, change db to mm6 and table names
    #	to be chainMm6u1 and netMm6u1
    cp /cluster/data/canFam2/bed/blastz.mm7.swap/axtChain/loadUp.csh .
    time ./loadUp.csh > loadUp.out 2>&1
    #	52 minute load time

    #	NOTE: there is no cd after this ssh, so the fb.* file is
    #	presumably written to the login directory on kolossus
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only \
	featureBits canFam2 chainMm6u1Link > fb.canFam2.chainMm6u1Link 2>&1 &
    #	813032415 bases of 2384996543 (34.089%) in intersection
    #	Angie's run of this had:
    #	780509502 bases of 2384996543 (32.726%) in intersection

    #	Move existing chain and net download data out of the way
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/canFam2
    mv vsMm6 vsMm6.old
    #	doesn't seem to be any canFam2 data yet in mm6

    ssh pk
    cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
    #	"-continue download" is written with a space here; other sections
    #	of this file write -continue=download (presumably equivalent)
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	    -continue download `pwd`/DEF > download.out 2>&1

    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium \
	    -swap -continue download `pwd`/DEF > swap.download.out 2>&1

# UPDATED mm6.knownToVisiGene (2006-03-14 galt)
# (knownToVisiGene is run on hgwdev with the database name as its only
#  argument; the same command was repeated for the second update below)
ssh hgwdev
knownToVisiGene mm6

# UPDATED mm6.knownToVisiGene (2006-04-05 galt)
ssh hgwdev
knownToVisiGene mm6

#######################################################################
## LIFTOVER To Mm8 (DONE - 2006-05-15 - 2006-06-05 - Hiram)
    ssh kkr1u00
    #	do not need to run this command since /cluster/data/mm8/split10k
    #	already exists from previous liftOver jobs (mm7 to mm8)
    # $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
    #	mm8 /cluster/data/mm8/nib
    # as it says, DO THIS NEXT:
    ssh kk
    #	if bin/scripts is not in your PATH, add it for this command:
    #	(the PATH= prefix applies only to the command on the
    #	continuation lines that follow it)
    PATH=$PATH:/cluster/bin/scripts \
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
	mm6 /cluster/data/mm6/nib mm8 /iscratch/i/mm8/split10k \
	/cluster/data/mm8/11.ooc
    # as it says, DO THIS NEXT:
    cd /cluster/data/mm6/bed/blat.mm8.2006-05-15/run
    #	shorthand for the usual sequence of para commands, not a
    #	literal command line
    para try, check, push, check, ...
# Completed: 1360 of 1360 jobs
# CPU time in finished jobs:    3975252s   66254.20m  1104.24h   46.01d  0.126 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                2555s      42.58m     0.71h    0.03d
# Longest finished job:           25347s     422.45m     7.04h    0.29d
# Submission to last job:       1477498s   24624.97m   410.42h   17.10d

    # as it says, DO THIS NEXT:
    #	this does the liftUp and makes the psl files
    #	kkr1u00 is down at this time, fixup this script to work on kkr3u00
    ssh kkr3u00
    cd /cluster/data/mm6/bed
    #	undated symlink to the dated run directory
    ln -s blat.mm8.2006-05-15 blat.mm8
    time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm6 mm8
    #	real    16m5.091s
    # as it says, DO THIS NEXT:
    #	this prepares the batch to run for the chaining
    ssh kki
    time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
	mm6 /cluster/data/mm6/nib mm8 /cluster/data/mm8/nib
    # as it says, DO THIS NEXT:
    #	running the chain batch
    cd /cluster/data/mm6/bed/blat.mm8.2006-05-15/chainRun
    #	shorthand for the usual sequence of para commands, not a
    #	literal command line
    para try, check, push, check, ...
    #	Pasted para output follows.  The first "Completed" line was not
    #	commented out in the original notes; the two counts disagree
    #	and both are kept as found.
# Completed: 40 of 40 jobs
# Completed: 34 of 34 jobs
# CPU time in finished jobs:       6655s     110.92m     1.85h    0.08d  0.000 y
# IO & Wait Time:                  1238s      20.63m     0.34h    0.01d  0.000 y
# Average job time:                 232s       3.87m     0.06h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             759s      12.65m     0.21h    0.01d
# Submission to last job:           759s      12.65m     0.21h    0.01d

    #	net step on the file server, then load on hgwdev
    ssh kkstore01
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm6 mm8
    #	Created /cluster/data/mm6/bed/liftOver/mm6ToMm8.over.chain.gz
    # as it says, DO THIS NEXT:
    ssh hgwdev
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm6 mm8
    #	It says this:
    # 	Now, add link for
    #	/usr/local/apache/htdocs/goldenPath/mm6/liftOver/mm6ToMm8.over.chain
    #	to hgLiftOver
    #	But I believe that link was already done:
    #	(the listing below shows the /gbdb symlink pointing at the
    #	chain file created by the net step)
    cd /gbdb/mm6/liftOver
    ls -og mm6ToMm8*
    #	lrwxrwxrwx  1 53 Jun  5 16:35 mm6ToMm8.over.chain.gz ->
    #		/cluster/data/mm6/bed/liftOver/mm6ToMm8.over.chain.gz


#########################################################################
### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-07 - angie)
### fasta added 2006-06-21
### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab

### NOTE -- as of 2007-03-01 the igtc track was automatically
### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and 
### updateIgtc.pl in kent/src/hg/utils/automation/ .  
### 2007-09-01 was the last update for mm6 because IGTC moved on
### to {mm7, mm8, mm9}.

    ssh hgwdev
    mkdir /cluster/data/mm6/bed/igtc
    cd /cluster/data/mm6/bed/igtc
    wget http://www.genetrap.org/blattrack/genetrap_mm6.psl
    # drop lines beginning with "track" before loading the psl from
    # stdin into the igtc table
    grep -v ^track genetrap_mm6.psl \
    | hgLoadPsl mm6 -table=igtc stdin
    # Probe fasta is shared by all assemblies:
    wget http://www.genetrap.org/blattrack/genetrap.fasta
    mkdir /gbdb/mm6/igtc
    # the target ends in /, so the link is created inside that
    # directory under the same file name
    ln -s /cluster/data/mm6/bed/igtc/genetrap.fasta /gbdb/mm6/igtc/
    hgLoadSeq -replace mm6 /gbdb/mm6/igtc/genetrap.fasta


#########################################################################
# SPLIT MM6 SEQUENCES FOR LIFTOVER FROM OTHER ASSEMBLIES 
# (DONE, 2006-06-11, hartera)
    ssh kkr1u00
    # Work in /cluster/data/mm6/bed/liftOver.  Start from the assembly
    # directory (not its bed subdirectory) so the relative bed/liftOver
    # path does not nest as bed/bed/liftOver.  mkdir -p because the
    # Mm8 liftOver section above already reports a file in
    # /cluster/data/mm6/bed/liftOver.
    cd /cluster/data/mm6
    mkdir -p bed/liftOver
    cd bed/liftOver
    # split the mouse mm6 sequences first
    # (">&!" is csh syntax: stdout and stderr to split.log, overwriting)
    makeLoChain-split mm6 /cluster/data/mm6/nib >&! split.log &
    # also add these to the san for pk cluster runs
    ssh pk
    mkdir -p /san/sanvol1/scratch/mm6
    rsync -a --progress \
          kkr1u00:/iscratch/i/mm6/split10k /san/sanvol1/scratch/mm6/

#############################################################################
# Create Allen Brain Atlas mapping. (Done 2007-02-08 Galt)
# We are creating several things: a psl probe-track for the RR on mouse,
# a link out from kg to the probe to the ABA website, 
# and a set of gene/probe info which visiGene will use.
# (This needs to be done after having created the sequences in
# ncbiXm and tigrMgiTc as above.)

# metadata.log and SRGEsequence.log were provided by
#  Susan Sunkin <SusanS@alleninstitute.org>
# this is an update to the visiGene with 6000 new images.
    cd /san/sanvol1/visiGene/offline/allenBrain/imageDisk/May_06

# convert new metadata.log to be like previous allen.tab
# tail -n +3 starts output at line 3 of the log (the older bare "tail +3"
# spelling is not accepted by current GNU tail); sed strips everything
# from the first "_" through "jp2"; gawk then writes five tab-separated
# columns, building the brain-map.org URL from field 3 or, when field 3
# is "0", from field 1.
    tail -n +3 metadata.log | sed 's/_.*jp2//' \
 | gawk -F ',' '{ print$1"\tUnknown Gene\t"$3"\t"$4"\thttp://www.brain-map.org/search.do?queryText="($3=="0" ? "genesym" : "egeneid")"="($3=="0" ? $1 : $3) }' \
 > allen20061204update.tab

# append the update to the 2005 table to get the combined input for
# allenCleanup
cat /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab \
    allen20061204update.tab > allen20061204combined.tab 

# new program allenCleanup 
# (located in ~kent/src/hg/makeDb/outside/allenBrain/allenCleanup)
#  make the output from allen20061204combined.tab go into 
#  /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab
# can also use allenCleanup options to clean up duplicate and unused images, and to check the
# full-image,thumb,and tile .jpgs are present.
# Arguments as given below: the imageDisk directory, the allenBrain image
# directory under gbdb/full, the combined input .tab, and the cleaned
# output .tab; messages go to ./log.

allenCleanup \
/san/sanvol1/visiGene/offline/allenBrain/imageDisk \
/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \
/san/sanvol1/visiGene/offline/allenBrain/imageDisk/May_06/allen20061204combined.tab \
/san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab \
> log


# convert new SRGEsequence.log to be new probeSeq.fasta
# tail -n +2 starts at line 2 of the log (the older bare "tail +2"
# spelling is not accepted by current GNU tail).  gawk keeps only rows
# whose field 6 is "sagittal" and whose field 8 is non-empty, and writes a
# fasta record: an "aibs|...|probe|..." header built from fields 1-4 and 7,
# then field 8 as the sequence line.
    tail -n +2 SRGEsequence.log \
 | gawk -F ',' '{ if (($6=="sagittal")&&($8!="")) print ">aibs|"$1"|sym|"$2"|entrez|"$3"|refseq|"$4"|probe|"$7"\n"$8 }' \
 > probeSeq.20061204.fasta

# update the files in probesAndData for use by mm6,mm7,mm8
    cd /san/sanvol1/visiGene/offline/allenBrain/probesAndData/
    cp /san/sanvol1/visiGene/offline/allenBrain/imageDisk/May_06/probeSeq.20061204.fasta .

# copy in the data files (directory already exists from previous build)
# Everything from the previous build is moved into old/ first.  The * glob
# also matches old itself, so expect mv to complain about that one name.
    cd /cluster/data/mm6/bed/allenBrain
    mkdir old
    mv * old/
    cp /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab .
    cp /san/sanvol1/visiGene/offline/allenBrain/probesAndData/probeSeq.20061204.fasta .


# updated kent/src/hg/makeDb/outside/allenBrain/allenCollectSeq
#  to relax handling of refseq ids between the .tab and the .fasta
#  i.e. sometimes chopping off trailing [.][0-9], sometimes adding [.][0-4].
#  Last time it had found 23 missing.  Now there are none missing.

# Create a list of probe sequences filling ones missing from
# probeSeq.20061204.fasta
# with some NCBI and TIGR files, and some downloaded one at a time.
# Of the trailing arguments, allProbes.fa is what gets aligned below;
# allProbes.tab, missing.tab and allenBrainUrl.tab are presumably
# outputs as well.
    allenCollectSeq allen20061204.tab \
	probeSeq.20061204.fasta ../ncbiXm/ncbiNm.fa ../ncbiXm/ncbiXm.fa ../tigrMgiTc/tigrMgiTc.fa \
    	~/kent/src/hg/makeDb/outside/allenBrain/allenCollectSeq/extra.fa \
	allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab

#18463 sequences in ../ncbiXm/ncbiNm.fa
#8386 sequences in ../ncbiXm/ncbiXm.fa
#161499 sequences in ../tigrMgiTc/tigrMgiTc.fa
#16 sequences in
#/cluster/home/galt/kent/src/hg/makeDb/outside/allenBrain/allenCollectSeq/extra.fa
#21075 sequences in probeSeq.20061204.fasta
#17895 (99.9%) hitProbe
#9 (0.1%) hitNm
#1 (0.0%) hitXm
#3 (0.0%) hitTc
#5 (0.0%) hitExtra
#0 (0.0%) hitNone


# go run vgLoadAllen (see visiGene.txt) to make sure everything is ok
# before proceeding.
# ok, looks fine now after using allenCleanup



# Set up a blat run to align the probes.
    # ("ssk" in the original record was a typo for ssh.)
    ssh kk
    cd /cluster/data/mm6/bed/allenBrain
    # Break allProbes.fa into pieces under split/ (file prefix "rp") so that
    # every piece can be paired with every chromosome file as its own job.
    mkdir split
    faSplit sequence allProbes.fa 200 split/rp
    mkdir run
    cd run
    ls -1 ../split/*.fa > mrna.lst
    ls -1 /iscratch/i/mm6/chrom/*.fa > genome.lst
    mkdir psl
    # Job template for gensub2.  Presumably $(path1)/$(root1) are filled in
    # from genome.lst and $(path2)/$(root2) from mrna.lst, giving one blat
    # job and one psl/<genome>_<probes>.psl output per pairing.
    # NOTE(review): the terminator line keeps its quotes, which is the
    # csh/tcsh form; it must stay at column 0 and must not be "corrected".
    cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    # NOTE(review): only 'para create' was recorded; the job summary that
    # follows shows the batch was subsequently run to completion.
#Completed: 7760 of 7760 jobs
#CPU time in finished jobs:      99653s    1660.89m    27.68h    1.15d  0.003 y
#IO & Wait Time:                411590s    6859.83m   114.33h    4.76d  0.013 y
#Average job time:                  66s       1.10m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1279s      21.32m     0.36h    0.01d
#Submission to last job:          2352s      39.20m     0.65h    0.03d


# Then do sorting and near-best-in-genome step on file server
    ssh kkstore
    cd /cluster/data/mm6/bed/allenBrain/run
    # Merge the per-job files in psl/ into raw.psl, using tmp as scratch space.
    pslSort dirs raw.psl tmp psl
    # Keep only near-best alignments (thresholds are the -minCover, -minAli
    # and -nearTop values given); the final /dev/null argument discards the
    # tool's second output file.
    pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
    # Order by column 14, then numerically by column 16 (presumably target
    # name and target start in the PSL layout -- confirm against the spec).
    sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
# (paths are relative to the run directory entered above)
    rm raw.psl
    rm -r psl
    rm -r ../split

# Load up database
    ssh hgwdev
    cd /cluster/data/mm6/bed/allenBrain

# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.

    # Drop and recreate from the .sql definition so the table is reloaded
    # from an empty state, then bulk-load the freshly generated
    # allenBrainUrl.tab.
    hgsql mm6 -e 'drop table allenBrainUrl'
    hgsql mm6 < ~/kent/src/hg/lib/allenBrainUrl.sql
    hgsql mm6 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl'

# Make probe alignment table, and load sequence.
    hgLoadPsl mm6 allenBrainAli.psl
    # Re-point the /gbdb symlink at the new allProbes.fa before registering
    # the sequences through that /gbdb path.
    rm /gbdb/mm6/allenBrain/allProbes.fa
    ln -s /cluster/data/mm6/bed/allenBrain/allProbes.fa /gbdb/mm6/allenBrain/allProbes.fa
    hgLoadSeq -replace mm6 /gbdb/mm6/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain
# (reads the allenBrainAli table loaded above; writes knownToAllenBrain)
    hgMapToGene mm6 allenBrainAli -type=psl knownGene knownToAllenBrain 

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   # NOTE(review): ./bin is a relative path and the working directory was
   # not recorded here -- presumably the genbank build directory; confirm
   # before re-running.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm6


# UPDATED mm6.knownToVisiGene (DONE galt 2007-02-15)
# Run on hgwdev; the only argument is the database whose knownToVisiGene
# table is being refreshed.
ssh hgwdev
knownToVisiGene mm6

