# Chimp pre-assembly

# Downloaded files from Whitehead:

#ftp-genome.wi.mit.edu
# Dir: Assembly.Nov1.2003
#unzip
#The scaffold count is 37927.

#% faSize contigs.bases
#2733948177 bases (0 N's 2733948177 real) in 361864 sequences in 1 files
#Total size: mean 7555.2 sd 9652.7 min 19 (contig_358805) max 160254 (contig_301167) median 3470
#N count: mean 0.0 sd 0.0

# Split into 500KB chunks for RepeatMasking
    ssh kksilo
    cd /cluster/data/pt0
    mkdir -p split500K
    faSplit sequence contigs.bases 10 split500K/
    cd split500K
    foreach f (*.fa)
        set d = $f:t:r
        mkdir -p $d
        faSplit about $f 500000 $d/
        rm $f
    end

# Create RM script and cluster job
# NOTE for browser, remask with -s option, and chimp-specific repeats
    cd /cluster/data/pt0
    mkdir -p jkStuff
    mkdir -p RMRun
    rm -f RMRun/RMJobs

    cat << '_EOF_' > jkStuff/RMChimp
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/pt0/$2
/bin/cp $2 /tmp/pt0/$2
cd /tmp/pt0/$2
/scratch/hg/RepeatMasker/RepeatMasker $2
popd
/bin/cp /tmp/pt0/$2/$2.out ./
/bin/rm -fr /tmp/pt0/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/pt0/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/pt0
'_EOF_'

    chmod +x jkStuff/RMChimp

    mkdir -p RMRun
    rm -f RMRun/RMJobs
    foreach d ( split500K/?? )
        foreach f ( $d/*.fa )
            set f = $f:t
            echo /cluster/data/pt0/jkStuff/RMChimp \
                /cluster/data/pt0/$d $f \
            '{'check out line+ /cluster/data/pt0/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
    end
    #5367 RMJobs

# Run cluster job

    sh kk
    cd /cluster/data/pt0/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    #Completed: 5367 of 5367 jobs
#CPU time in finished jobs:   14066571s  234442.85m  3907.38h  162.81d  0.446 y
#IO & Wait Time:                 72512s    1208.53m    20.14h    0.84d  0.002 y
#Average job time:                2634s      43.91m     0.73h    0.03d
#Longest job:                     5696s      94.93m     1.58h    0.07d
#Submission to last job:         41294s     688.23m    11.47h    0.48d


# TRF: Tandem Repeat Finder
    # create job list of 5MB chunks
    ssh kksilo
    cd /cluster/data/pt0
    mkdir -p split5M
    faSplit about contigs.bases 5000000 split5M/

    cd /cluster/data/pt0
    mkdir -p bed/simpleRepeat
    cd bed/simpleRepeat
    mkdir trf
    rm -f jobs.csh
    echo '#\!/bin/csh -fe' > jobs.csh

    foreach f (/cluster/data/pt0/split5M/*.fa)
      set fout = $f:t:r.bed
      echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    chmod +x jobs.csh
    wc -l jobs.csh
    #   546 jobs.csh

    ./jobs.csh >&! jobs.log &
    # in bash:  ./jobs.csh > jobs.log 2>&1 &
    tail -f jobs.log

# FILTER SIMPLE REPEATS INTO MASK
    # make a filtered version # of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/pt0/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
        echo "filtering $f"
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end

# MASK CONTIGS WITH REPEATMASKER
    ssh kksilo
    cd /cluster/data/pt0
    cd split500K
    foreach d (??)
        foreach f ($d/*.fa)
            echo "Masking $f"
            maskOutFa $f $f.out $f -soft
        end
    end
#WARNING: negative rEnd: -92 contig_143568:1128-1159 MER46B
#WARNING: negative rEnd: -155 contig_143869:5374-5508 AluJb
# etc.
# Comment in rn3 doc (Hiram) indicates these are OK...

    # Merge 500K masked chunks into single file, then split into 5Mb chunks 
    # to prepare for TRF masking
    foreach d (??)
        echo "Contig dir $d"
        foreach f ($d/?.fa $d/??.fa $d/???.fa)
            set g = $f:h
            cat $f >> $g.fa
        end
    end
    # check the split500K/??.fa  masked files, then
    cd /cluster/data/pt0
    cat split500K/??.fa > contigs.bases.rmsk
    # check the rmsk file, then
    rm split500K/??.fa
    mkdir -p split5M.rmsk
    faSplit about contigs.bases.rmsk 5000000 split5M.rmsk/
    foreach f (split5M.rmsk/*.fa)
        echo "TRF Masking $f"
        set b = $f:t:r
        maskOutFa $f bed/simpleRepeat/trfMask/$b.bed $f -softAdd
    end
    cat split5M.rmsk/*.fa > contigs.bases.msk


# RUN BLASTZ VS. HUMAN
# NOTE: This would normally be doc'ed in the hg16 make doc,
# however this is just preliminary until the real assembly arrives

# distribute contigs to bluearc and /iscratch/i for cluster run
    ssh kksilo
    mkdir -p /cluster/bluearc/pt0/split100
    faSplit sequence /cluster/data/pt0/contigs.bases.msk 100 /cluster/bluearc/pt0/split100/
    ssh kkr1u00
    mkdir -p /iscratch/i/pt0/trfFa
    df /iscratch/i
    faSplit sequence /cluster/data/pt0/contigs.bases.msk 100 /iscratch/i/pt0/trfFa/
    /cluster/bin/scripts/iSync

# make DEF file for blastz
    ssh kksilo
    cd /cluster/data/pt0
    mkdir -p bed/blastz.hg16
    cd bed/blastz.hg16

# NOTE: need schwartzbin below for utils still not in penn bin

cat << '_EOF_' > DEF
# human vs. chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
    
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# Specific settings for chimp
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/penn/human_chimp.q

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Chimp
SEQ2_DIR=/iscratch/i/pt0/trfFa
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/store6/pt0/bed/blastz.hg16

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'

    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.hg16-pt0.`date -I`

    ssh kk
    cd /cluster/data/pt0/bed/blastz.hg16
    # source the DEF file to establish environment for following commands
    bash
    source ./DEF
    cp /cluster/data/mm4/jkStuff/BlastZ_run0.sh ../../jkStuff
    ../../jkStuff/BlastZ_run0.sh
    cd run.0
    para try
    para check
    para push
# Completed: 33561 of 33561 jobs
# CPU time in finished jobs:    2748604s   45810.06m   763.50h   31.81d  0.087 y
# IO & Wait Time:                468757s    7812.62m   130.21h    5.43d  0.015 y
# Average job time:                  96s       1.60m     0.03h    0.00d
# Longest job:                     3289s      54.82m     0.91h    0.04d
# Submission to last job:          6757s     112.62m     1.88h    0.08d

    #   Second cluster run to convert the .out's to .lav's
    cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh /cluster/data/pt0/jkStuff
    ssh kk
    cd /cluster/data/pt0/bed/blastz.hg16
    bash
    source DEF
    ../../jkStuff/BlastZ_run1.sh
    cd run.1
    para try
    para check
    para push

    #   Prepare third cluster run script to convert lav's to axt's
    cd /cluster/data/pt0/bed/blastz.hg16
cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh
#!/bin/sh
#       prepare third cluster run for blastz processing
# NOTE: should run this on iservers (4G), 
# with chr19 and chr1 on kolossus (8G)
M=`uname -n`
if [ "$M" != "kk" ]; then
    echo "ERROR: you are on machine: '$M'"
    echo -e "\tthis script expects machine kk"
    exit 255
fi
source DEF
mkdir axtChrom
mkdir run.2
cd run.2
# usage:  blastz-contiglav2axt lav-dir axt-file seq1-dir seq2-file
echo '#LOOP' > gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR} /cluster/bluearc/pt0/contigs.bases.msk >> gsub
echo '#ENDLOOP' >> gsub
ls -1S ${BASE}/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
echo "running 'para create'"
para create jobList
echo "Ready for cluster run.  para try, check, push, etc ..."
'_EOF_'
    chmod +x ../../jkStuff/BlastZ_run2.sh
    #   Third cluster run to convert lav's to axt's
    source DEF
    ../../jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
    # NOTE: ran this on kolossus and mini-cluster
    # 30 min. to 2 hrs. per chrom

# Lift Contig (query side) AXT files to Scaffold coordinates

    ssh kksilo
    cd /cluster/data/pt0
    agpToLift < assembly.agp > jkStuff/assembly.lft
    cd bed/blastz.hg16
    cp -r axtChrom axtChrom.contig
    cd axtChrom
cat << 'EOF' > doLift.csh
#!/bin/csh
    foreach f (*.axt)
        echo "Lifting $f"
        liftUp -axtQ $f.lft /cluster/data/pt0/jkStuff/assembly.lft warn $f
        mv $f.lft $f
    end
'EOF'
    # << this line makes emacs coloring happy
    chmod +x doLift.csh
    csh doLift.csh >&! doLift.log &
    tail -100f doLift.log

    # create scaffold fasta file from contig file and agp
    agpAllToFaFile assembly.agp contigs.bases.msk scaffolds.msk.fa -sizes=scaffold.sizes
        # takes 20 minutes
    

# CHAIN HG16 BLASTZ

# The axtChain is usually run on the small cluster,
# as follows, however for this run, with large fasta file
# instead of nibs, required kolossus.
    #ssh kkr1u00
    #cd /cluster/data/pt0/bed/blastz.hg16
    #mkdir -p axtChain/run1
    #cd axtChain/run1
    #mkdir out chain

    #ls -1S /cluster/data/pt0/bed/blastz.hg16/axtChrom/*.axt > input.lst
    #cat << '_EOF_' > gsub
#LOOP
#doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
#'_EOF_'
    # << this line makes emacs coloring happy

    #cat << '_EOF_' > doChain
#!/bin/csh
    #axtFilter -notQ_random $1 | axtChain stdin \
        #/iscratch/i/gs.17/build34/bothMaskedNibs \
        #-faQ /cluster/bluearc/pt0/scaffolds.msk.fa $2 > $3
#'_EOF_'
    # << this line makes emacs coloring happy
    #chmod a+x doChain
    
    # 42 jobs 
    #gensub2 input.lst single gsub jobList
    #grep random jobList > jobList2
    #mv jobList2 jobList
    #para create jobList
    #para try, para check, para push...
    
    ssh kolossus
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain/run1
    cat << '_EOF_' > doChain.kol
#!/bin/csh
    ~/bin/x86_64/axtFilter -notQ_random $1 | ~/bin/x86_64/axtChain stdin \
        /cluster/data/hg16/nib \
        -faQ /cluster/bluearc/pt0/scaffolds.msk.fa $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain.kol

cat << 'EOF' > runChain.csh
    set path = (~/bin/x86_64 $path); rehash
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
        echo "chaining chr$c"
        which axtChain
        doChain.kol /cluster/data/pt0/bed/blastz.hg16/axtChrom/chr${c}.axt chain/chr${c}.chain out/chr${c}.out
    end
'EOF'
    csh runChain.csh >&! runChain.log &
    tail -100f runChain.log

cat << 'EOF' > runChainRandom.csh
    set path = (~/bin/x86_64 $path); rehash
    foreach c (1 2 3 4 5 6 7 8 9 Un X 10 13 15 17 18 19)
        echo "chaining chr${c}_random"
        doChain.kol /cluster/data/pt0/bed/blastz.hg16/axtChrom/chr${c}_random.axt chain/chr${c}_random.chain out/chr${c}_random.out
    end
'EOF'
    csh runChainRandom.csh >&! runChainRandom.log &
    tail -100f runChainRandom.log

    # 7 minutes per chrom -- ~4.5 hrs

    # now on the server, sort chains
    ssh kksilo
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
        #94.050u 10.830s 1:54.09 91.9% 
    time chainSplit chain all.chain
        #85.530u 10.690s 1:46.58

    ssh hgwdev
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainPt0 $i
        eco done $c
    end
    
    # optionally: rm run1/chain/*.chain


# NET HUMAN BLASTZ

    ssh kksilo
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
                        /cluster/data/pt0/scaffold.sizes ../preNet/$i
    end

    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
                            /cluster/data/pt0/scaffold.sizes ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 611950592, utime 1586 s/100, stime 298

    # skipping netClass for this preliminary work -- needs database
    #ssh hgwdev
    #cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    #time netClass hNoClass.net pt0 hg16 human.net \
	#-tNewR=/cluster/bluearc/scratch/mus/pt0/linSpecRep.notInHuman \
	#-qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse
    mv hNoClass.net chimp.net

    # Make a 'syntenic' subset of these with
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    netFilter -syn chimp.net > chimpSyn.net
        # takes a few minutes

    # make net
    ssh kksilo
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    mkdir chimpNet
    time netSplit chimp.net chimpNet

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    netFilter -minGap=10 chimp.net |  hgLoadNet hg16 netPt0 stdin
    netFilter -minGap=10 chimpSyn.net | hgLoadNet hg16 syntenyNetPt0 stdin

    # package files for LaDeana
    cd chimpNet
    mkdir -p /usr/local/apache/htdocs/kate/chimp
    tar cvfz humanChimpNet.gz *.chain
    mv humanChimpNet.gz /usr/local/apache/htdocs/kate/chimp
    cd ..

    # make chain subset from net (for Tarjei Mikkelson at MIT)
    cd chimpNet
    mkdir ../chainSubset
    foreach f (*.net)
        set c = $f:r
        netChainSubset $f ../chain/$c.chain ../chainSubset/$c.chain
    end
    cd ../chainSubset
    tar cvfz humanChimpChain.gz *.chain
    mv humanChimpChain.gz /usr/local/apache/htdocs/kate/chimp
    cd ..

# AXT BEST FOR WEBB

    cd /cluster/data/pt0/bed/blastz.hg16/axtChrom
    mkdir -p ../axtBest
    foreach f (*.axt)
      set c = $f:r
      echo axtBesting $c
      axtBest $c.axt $c ../axtBest/$c.axt -minScore=300
    end
    cd ../axtBest
    tar cvfz humanChimpAxtBest.gz *.ax
    mv humanChimpAxtBest.gz /usr/local/apache/htdocs/kate/chimp

# REVERSE NET -- additional file for LaDeana

    ssh kolossus 
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    /cluster/home/kate/bin/x86_64/chainNet all.chain -minSpace=10 /cluster/data/hg16/chrom.sizes /cluster/data/pt0/scaffold.sizes chimpGap10.net humanGap10.net
    # takes a few minutes
    grep -v gap humanGap10.net > humanFill10.net
    # takes a few minutes
    cp humanFill10.net /usr/local/apache/htdocs/kate

# REVERSE CHAIN -- additional file for Tarjei

    ssh kksilo
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    mkdir humanNet
    time netSplit humanGap10.net humanNet
    cd humanNet
    mkdir humanChainSubset
    /cluster/home/kate/bin/x86_64/netChainSubset humanGap10.net all.chain humanChainSubset/all.chain
    
    foreach f (*.net)
        set c = $f:r
        #/cluster/home/kate/bin/x86_64/netChainSubset $f ../chain/$c.chain ../humanChainSubset/$c.chain
        netChainSubset $f ../chain/$c.chain ../humanChainSubset/$c.chain
    end
    cd ../humanChainSubset
    tar cvfz chimpHumanChain.gz *.chain
    mv chimpHumanChain.gz /usr/local/apache/htdocs/kate/chimp

# UPDATE WITH NOV13 ASSEMBLY

    cd /cluster/data/pt0
    wget -r -nH ftp://user:passwd@ftp-genome.wi.mit.edu/Assembly.Nov13.2003
    cd Assembly.Nov13.2003
    gunzip contigs.bases.gz
    
    # create scaffold fasta file from contig file and agp
    agpAllToFaFile assembly.agp contigs.bases scaffolds.fa -sizes=scaffold.sizes
        # takes 20 minutes

    # extract unplaced scaffolds from latest assembly
    # list of unplaced scaffolds from prior assembly
    # add 1 replaced, 3 new scaffolds
    cd /cluster/data/pt0/bed

    # NOTE: blastz.hg16.nomask not used in final chain/net 
    
    mkdir -p blastz.hg16.2003-11-20
    ln -s blastz.hg16.2003-11-20 blastz.hg16.nomask
    cd blastz.hg16.nomask
    echo > README << 'EOF'
    This is an alignment of unmasked chimp supercontigs to unmasked human hg16
    chromosomes, limited to the supercontigs that were not placed
    by the alignment using masked sequence.
'EOF'
    cp ../blastz.hg16/axtChain/notOnNet .
    cat >> notOnNet << 'EOF'
scaffold_28690
scaffold_37928
scaffold_37929
scaffold_37930
'EOF'

    # extract unplaced scaffolds
    faSomeRecords /cluster/data/pt0/Assembly.Nov13.2003/scaffolds.fa notOnNet \
        unplaced.fa

    cd /cluster/data/hg16
cat > jkStuff/makeUnmaskedNib.sh << 'EOF'
#!/bin/csh -ef
mkdir -p unmaskedNib
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un M)
   cd $i
   foreach j (*.fa)
       echo chr$i
       /cluster/bin/i386/faToNib $j ../unmaskedNib/$j:r.nib
       echo done $j
   end
   cd ..
   end
'EOF'
    jkStuff/makeUnmaskedNib.sh >&! unmaskedNib.log &
    tail -100f unmaskedNib.log


# RUN BLASTZ OF UNPLACED SCAFFOLDS VS. UNMASKED HUMAN
# NOTE: this run was aborted due to excessive running time
# A blat was run instead (see below).

# distribute scaffolds and unmaksed nibs to bluearc and /iscratch/i 
# for cluster run
    ssh kksilo
    mkdir -p /cluster/bluearc/pt0/unplacedScaffolds
    faSplit sequence /cluster/data/pt0/bed/blastz.hg16.nomask/unplaced.fa \
                30 /cluster/bluearc/pt0/unplacedScaffolds/
cat > /cluster/bluearc/pt0/unplacedScaffolds/README << 'EOF'
This directory contains scaffold fasta files for the Nov. 13 2003
assembly, that were not placed in the initial repeatmasked alignment.
Used for unmasked alignment with human for placement on human chroms.
'EOF'
    ssh kkr1u00
    mkdir -p /iscratch/i/gs.17/build34/unmaskedNib
    cp /cluster/data/hg16/unmaskedNib/* /iscratch/i/gs.17/build34/unmaskedNib
    df /iscratch/i
    /cluster/bin/scripts/iSync
    cp -r /cluster/data/hg16/unmaskedNib /cluster/bluearc/hg16

# make DEF file for blastz
    ssh kksilo
    cd /cluster/data/pt0
    cd bed/blastz.hg16.nomask

# NOTE: need schwartzbin below for utils still not in penn bin

cat << '_EOF_' > DEF
# human vs. chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
    
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# Specific settings for chimp
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/penn/human_chimp.q

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/unmaskedNib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Chimp
SEQ2_DIR=/cluster/bluearc/pt0/unplacedScaffolds
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/store6/pt0/bed/blastz.hg16.nomask

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'

    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.hg16-pt0.nomask.`date -I`

    # needs lots of memory
    ssh kk
    cd /cluster/data/pt0/bed/blastz.hg16.nomask
    # source the DEF file to establish environment for following commands
    bash
    source ./DEF
    ../../jkStuff/BlastZ_run0.sh
        # 10170 jobs written to batch
    cd run.0
    para try
    para check
    para push

    # NOTE -- aborted this run, it would have taken too long


# LOAD CHIMP SEQUENCE INTO HUMAN BROWSER
    mkdir -p /gbdb/hg16/pt0
    ln -s /cluster/data/pt0/scaffolds.msk.fa /gbdb/hg16/pt0
    cd /cluster/data/pt0
    hgLoadSeq -prefix=pt0 hg16 /gbdb/hg16/pt0/scaffolds.msk.fa


# BLAT UNPLACED UNMASKED CHIMP VS. UNMASKED HUMAN 

    ssh kksilo
    mkdir -p /cluster/bluearc/pt0/unplacedScaffolds300
    faSplit sequence /cluster/data/pt0/bed/blastz.hg16.nomask/unplaced.fa \
                300 /cluster/bluearc/pt0/unplacedScaffolds300/
    mkdir /cluster/data/pt0/bed/blatHg16.scaffolds
    cd /cluster/data/pt0/bed/blatHg16.scaffolds
    mkdir psl
    foreach f (/cluster/data/hg16/?{,?}/N?_??????/N?_??????.fa)
      set c=$f:t:r
      echo $c
      mkdir -p psl/$c
    end

    # create cluster job
    mkdir run
    cd run
    ls -1S /cluster/bluearc/pt0/unplacedScaffolds300/*.fa > pt0.lst
    ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/pt0/bed/blatHg16.scaffolds/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
    # << this line makes emacs coloring happy
    gensub2 human.lst pt0.lst gsub spec

    ssh kk
    cd /cluster/data/pt0
    cd bed/blatHg16.scaffolds
    cd run
    para create spec
       # 145336 jobs
    para try
    para check
    para push
    para check


    # NOTE: blatHg16 not used in final chain/net;
    # it consists of blastz.hg16 + blatHg16.scaffolds chains
    ssh kksilo
    mkdir /cluster/data/pt0/bed/blatHg16
    cd /cluster/data/pt0/bed/blatHg16
    mkdir psl
    foreach f (/cluster/data/hg16/?{,?}/N?_??????/N?_??????.fa)
      set c=$f:t:r
      echo $c
      mkdir -p psl/$c
    end

    # extract contigs for unplaced scaffolds
    cd /cluster/data/pt0
    jkStuff/getContigList.pl Assembly.Nov13.2003/assembly.agp \
                /cluster/data/pt0/bed/blastz.hg16.nomask/notOnNet > \
                /cluster/data/pt0/bed/blatHg16/unplacedContigs.lst
    cd /cluster/data/pt0/bed/blatHg16
    faSomeRecords /cluster/data/pt0/Assembly.Nov13.2003/contigs.bases \
                unplacedContigs.lst unplacedContigs.fa
    mkdir -p /cluster/bluearc/pt0/unplacedContigs200
    faSplit sequence unplacedContigs.fa 200 \
                /cluster/bluearc/pt0/unplacedContigs200/

    # create cluster job
    mkdir run
    cd run
    ls -1S /cluster/bluearc/pt0/unplacedContigs200/*.fa > pt0.lst
    ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/pt0/bed/blatHg16/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
    # << this line makes emacs coloring happy
    gensub2 human.lst pt0.lst gsub spec

    ssh kk
    cd /cluster/data/pt0
    cd bed/blatHg16
    cd run
    para create spec
       # 97218 jobs 
    para try
    para check
    para push
    para check

    # lift psl files to chromosome coordinates
    ssh kksilo
    cd /cluster/data/pt0/bed/blatHg16.scaffolds
    pslCat -dir psl/N?_?????? | \
      liftUp -type=.psl stdout \
        /cluster/data/hg16/jkStuff/liftAll.lft warn stdin | \
      pslSortAcc nohead chrom temp stdin
      # sort by target


# CHAIN UNPLACED SCAFFOLD PSL'S

    ssh kkr1u00
    mkdir -p /cluster/data/pt0/bed/blatHg16.scaffolds/pslChain/run1
    cd /cluster/data/pt0/bed/blatHg16.scaffolds/pslChain/run1
    mkdir out chain

  ls -1S /cluster/data/pt0/bed/blatHg16.scaffolds/chrom/*.psl > input.lst
  cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
    # NOTE: axtChain in /cluster/bin has a memory problem
    ~kate/bin/i386/axtChain -psl $1 \
        /iscratch/i/gs.17/build34/unmaskedNib \
        -faQ /cluster/data/pt0/bed/blastz.hg16.nomask/unplaced.fa $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
        # 40 jobs
    para try
    para push

    # merge with blastz-generated chains

    ssh kksilo
    cd /cluster/data/pt0/bed
    # filter out scaffold 28690, which is replaced in the newer assembly
    chainFilter -notQ=scaffold_28690 blastz.hg16/axtChain/all.chain \
        > all.blastz.chain
    chainMergeSort all.blastz.chain \
        blatHg16.scaffolds/pslChain/run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    ssh hgwdev
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainPt0 $i
        eco done $c
    end
    
    # optionally: rm run1/chain/*.chain

    # NOTE: didn't preNet

    # Net from merged chains
    ssh kolossus
    cd /cluster/data/pt0/bed
    /cluster/home/kate/bin/x86_64/chainNet all.chain -minSpace=10 /cluster/data/hg16/chrom.sizes /cluster/data/pt0/Assembly.Nov13.2003/scaffold.sizes chimpGap10.net humanGap10.net
    # takes a few minutes
    # writing chimpGap10.net
#subchainInfo T 81232621,81233154 81218758,81244581 ali 19126, subSize 529, score 1473313.000000, subScore 40749.899456
#subchainInfo T 81238921,81239448 81218758,81244581 ali 19126, subSize 510, score 1473313.000000, subScore 39286.292481
#writing humanGap10.net
#memory usage 4688400384, utime 13822 s/100, stime 945

    grep -v gap humanGap10.net > humanFill10.net
    cp humanFill10.net /usr/local/apache/htdocs/kate/chimp
    # takes a few minutes

    # filter out tiny alignments and post to web site
    ssh kksilo
    cd /cluster/data/pt0/bed
    netFilter humanGap10.net -minAli=150 > humanGap10.filtered.net
    grep -v gap humanGap10.filtered.net > humanFill10.filtered.net
    cp humanFill10.filtered.net /usr/local/apache/htdocs/kate/chimp


    # CHAIN FILES FROM NET
    ssh kksilo
    cd /cluster/data/pt0/bed
    mkdir blastz-blatHg16.2003-11-24
    ln -s blastz-blatHg16.2003-11-24 blastz-blatHg16
    mv all*.chain chain *.net blastz-blatHg16.2003-11-24

    cd blastz-blatHg16
    netFilter chimpGap10.net -minAli=150 > chimpGap10.filtered.net
    netSplit chimpGap10.filtered.net chimpNet
    mkdir chimpChainSubset
    cd chimpNet
    foreach f (*.net)
        set c = $f:r
        echo "chain subset $c"
        netChainSubset $f ../chain/$c.chain ../chimpChainSubset/$c.chain
    end
    cd ../chimpChainSubset
    tar cvfz humanChimpChain.gz *.chain
    mv /usr/local/apache/htdocs/kate/chimp/humanChimpChain.gz \
        /usr/local/apache/htdocs/kate/chimp/old
    mv humanChimpChain.gz /usr/local/apache/htdocs/kate/chimp
    cd ../

    netChainSubset chimpGap10.filtered.net all.chain \
                    chimpGap10.filtered.chain

    # REVERSED CHAIN FILES FROM NET
    chainSwap all.chain all.swap.chain
    netChainSubset humanGap10.filtered.net all.swap.chain \
                        humanGap10.filtered.chain
    gzip -c humanGap10.filtered.chain >  \
                /usr/local/apache/htdocs/kate/chimp/chimpHumanChainAll.gz


    # statistics
    netStats chimp.net.stats chimpGap10.filtered.net
    netStats human.net.stats humanGap10.filtered.net
    netStats chimp.net.unfiltered.stats chimpGap10.net
    netStats human.net.unfiltered.stats humanGap10.net

# REFINE BLAT ALIGNMENTS -- RUN BLASTZ ON UNPLACED SCAFFOLDS VS.
#        ONLY HUMAN CONTIGS IDENTIFIED BY BLAT ALIGNMENTS
# This was proposed, but not completed
    ssh kksilo
    cd /cluster/data/pt0/bed/blatHg16.scaffolds
    pslCat -dir psl/N?_?????? | \
      pslSortAcc nohead contig temp stdin
        # Processed 4644940 lines into 19 temp files
      # sorted by target contig


# RECIPROCAL BEST -- FINAL ALIGNMENTS (2003-12-03 kate)
#   Use unfiltered chimp-reference chains, net them, and pull human side

    ssh kksilo
    cd /cluster/data/pt0
    cd bed/blastz-blatHg16
    netChainSubset humanGap10.net all.swap.chain humanGap10.jk2.chain
    chainSort humanGap10.jk2.chain humanGap10Sorted.jk2.chain
    # note: jim ran these, and switched conventions
    chainNet humanGap10Sorted.jk2.chain /cluster/data/pt0/scaffold.sizes \
        /cluster/data/hg16/chrom.sizes chimp.jk2.net human.jk2.net \
                 -minScore=0 -minSpace=1

    # detour to redo reciprocal best chimp chain
    netChainSubset chimpGap10.net all.chain chimpGap10.jk2.chain
    chainSort chimpGap10.jk2.chain chimpGap10Sorted.jk2.chain
    chainNet chimpGap10Sorted.jk2.chain /cluster/data/hg16/chrom.sizes \
        /cluster/data/pt0/scaffold.sizes human.jk3.net chimp.jk3.net \
                 -minScore=0 -minSpace=1
    /cluster/bin/i386/netSyntenic chimp.jk3.net chimp.best.net
    ~/bin/i386/netFilter -chimpSyn chimp.best.net > chimp.best.filtered.net
    netChainSubset chimp.best.filtered.net all.swap.chain \
        chimp.best.filtered.chain
    ln -s chimp.best.filtered.chain chimp.chain
    cp chimp.chain /usr/local/apache/htdocs/kate/chimp
    cd /usr/local/apache/htdocs/kate/chimp
    gzip chimp.chain

    ##netFilter -minGap=10 chimp.net |  hgLoadNet hg16 netPt0 stdin
    #/cluster/bin/i386/netSyntenic human.jk.net human.best.net
    #~/bin/i386/hgLoadNet -warn hg16 netRxBestPt0 human.best.net

    # back to best human chain
    mv human.best.net human.best.net.old
    /cluster/bin/i386/netSyntenic human.jk2.net human.best.net
    ~/bin/i386/netFilter -chimpSyn human.best.net > human.best.filtered.net
    #mv chimp.best.net chimp.best.net.old
    #/cluster/bin/i386/netSyntenic chimp.jk2.net chimp.best.net
    #~/bin/i386/netFilter -chimpSyn chimp.best.net > chimp.best.filtered.net
    #~/bin/i386/hgLoadNet -warn hg16 netRxBestPt0 human.best.net
    ln -s human.best.filtered.net human.net
    ssh hgwdev
    cd /cluster/data/pt0/bed/blastz-blatHg16
    /bin/i386/hgLoadNet -qPrefix=pt0- -warn hg16 netRxBestPt0 human.net

    #netStats chimp.jk.stats chimp.jk.net
    #netStats human.jk.stats human.jk.net
    #netChainSubset chimp.jk.net all.swap.chain chimp.jk.chain
    #netChainSubset human.jk.net all.chain human.jk.chain
        # read 9254050 of 9254050 chains in all.chain
    # NOTE: post both chains for download by chimp-analysis group
    #netChainSubset chimp.jk2.net all.swap.chain chimp.jk2.chain
    #netChainSubset human.jk2.net all.chain human.jk2.chain
    netChainSubset human.best.filtered.net all.chain human.best.filtered.chain
    ln -s human.best.filtered.chain human.chain
    #netChainSubset chimp.best.filtered.net all.swap.chain \
        #chimp.best.filtered.chain
    #ln -s chimp.best.filtered.chain chimp.chain
    cp human.chain /usr/local/apache/htdocs/kate/chimp
    cd /usr/local/apache/htdocs/kate/chimp
    gzip human.chain 
    #hgLoadChain -qPrefix=pt0- hg16 chainAllPt0 all.chain

    ssh hgwdev
    cd /cluster/data/pt0
    cd bed/blastz-blatHg16
    chainSort -target all.chain | chainSplit chain stdin
    cd chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain -qPrefix=pt0 hg16 ${c}_chainPt0 $i
        echo done $c
    end
    # reloaded these tables on genome-test on 4/29/04
    # as they somehow were wrong (kate)

    # NOTE: need to remake scaffolds.msk.fa for Nov13 assembly
    # and reload seq table
    cd /cluster/data/pt0
    hgLoadSeq -prefix=pt0 hg16 /gbdb/hg16/pt0/scaffolds.msk.fa
        #37928 sequences


# DOWNLOADS 

    # copy chain files to download area
    cd /usr/local/apache/htdocs/goldenPath/hg16
    mkdir -p vsPt0
    cp /cluster/data/pt0/bed/blastz-blatHg16/{human,chimp}.chain .
    gzip human.chain chimp.chain
    md5sum *.gz > md5sum.txt
    # add README.txt file to dir, if needed

# CREATE RECIPROCAL BEST NET AXT'S

    # split scaffolds into files and create nibs for netToAxt
    ssh kksilo
    cd /cluster/data/pt0
    mkdir scaffolds
    faSplit byname scaffolds.msk.fa scaffolds/

    # add 3 scaffolds, change 1 from Nov13 assembly
    cd scaffolds
    mv scaffold_28690.fa scaffold_28690.fa.nov1
    cd ../Assembly.Nov13.2003
    echo "scaffold_28690\nscaffold_37928\nscaffold_37929\nscaffold_37930" \
        > newscaffolds.lst
    faSomeRecords scaffolds.fa newscaffolds.lst newscaffolds.fa
    mkdir newscaffolds
    faSplit byname newscaffolds.fa newscaffolds/
    cd newscaffolds
    foreach i (scaffold*.fa)
        echo $i
        cp $i ../../scaffolds/$i.nov13
        ln -s /cluster/data/pt0/scaffolds/$i.nov13 ../../scaffolds/$i
    end
    cd /cluster/data/pt0
    mkdir nib
    cd scaffolds
    foreach f (scaffold*.fa)
        set i = $f:r
        echo $i
        faToNib -softMask $i.fa ../nib/$i.nib
    end

    cd /cluster/data/pt0/bed/blastz-blatHg16
    mkdir humanBestNet

    netSplit human.net humanBestNet
    mkdir humanBestAxt
    cd humanBestNet
cat > makeAxt.csh << 'EOF'
    foreach f (*.net)
        set i = $f:r
        echo $i.net
        netToAxt $i.net ../all.chain \
                /cluster/data/hg16/nib /cluster/data/pt0/nib \
                ../humanBestAxt/$i.axt
    end
'EOF'
    csh -f makeAxt.csh >&! makeAxt.log &
    tail -100f makeAxt.log

# CHIMP SEQUENCE FOR DOWNLOAD (2003-12-10 kate)
#   Re-extract scaffolds using Nov13 AGP
    ssh kksilo
    cd /cluster/data/pt0
    mkdir Assembly.Nov1.2003
    mv supercontigs* assembly.agp assembly.format scaffold.sizes.Nov1 scaffolds.msk.fa Assembly.Nov1.2003
    ln -s Assembly.Nov13.2003/assembly.agp .
    agpAllToFaFile assembly.agp contigs.bases.msk scaffolds.msk.fa
    cp scaffolds.msk.fa /usr/local/apache/htdocs/goldenPath/hg16/vsPt0
    cd /usr/local/apache/htdocs/goldenPath/hg16/vsPt0
    gzip scaffolds.msk.fa
    md5sum scaffolds.msk.fa >> md5sum.txt
    # Update README

# REDO NETTING WITH FIX  (2003-12-11)
    ssh kolossus
    cd /cluster/data/pt0/bed/blastz-blatHg16
    /cluster/home/kate/bin/x86_64/chainNet all.chain -minSpace=10 \
        /cluster/data/hg16/chrom.sizes /cluster/data/pt0/scaffold.sizes \
        chimpGap10.2.net humanGap10.2.net
    ssh kksilo
    cd /cluster/data/pt0/bed/blastz-blatHg16
    ~/bin/i386/netChainSubset humanGap10.2.net all.swap.chain stdout | \
        chainSort stdin humanGap10.2.chain
    # NOTE: this chain file has duplicated chain ID's -- chains
    # must have been split up from net

    # note: switched conventions here (like jk, above)
    ssh kolossus
    cd /cluster/data/pt0/bed/blastz-blatHg16
    /cluster/home/kate/bin/x86_64/chainNet humanGap10.2.chain \
        /cluster/data/pt0/scaffold.sizes /cluster/data/hg16/chrom.sizes \
        chimp.2.net human.2.net -minScore=0 -minSpace=1
    ssh kksilo
    cd /cluster/data/pt0/bed/blastz-blatHg16
    netSyntenic human.2.net human.2.syn.net
    ~/bin/i386/netFilter -chimpSyn human.2.syn.net > human.best.2.net

    ssh hgwdev
    cd /cluster/data/pt0/bed/blastz-blatHg16
    ~/bin/i386/hgLoadNet -qPrefix=pt0 -warn hg16 netRxBestPt0 human.best.2.net

    # get axt's from net
    ssh kksilo
    cd /cluster/data/pt0/bed/blastz-blatHg16
    netSplit human.best.2.net humanBestNet.2
    mkdir humanBestAxt.2
    cd humanBestNet.2
cat > makeAxt.csh << 'EOF'
    foreach f (*.net)
        set i = $f:r
        echo $i.net
        netToAxt -nocache $i.net ../all.chain \
                /cluster/data/hg16/nib /cluster/data/pt0/nib \
                ../humanBestAxt.2/$i.axt
    end
'EOF'
    csh -f makeAxt.csh >&! makeAxt.log &
    tail -100f makeAxt.log
    cd ..
    ln -s humanBestAxt.2 axtBest
    
    # convert to PSL and load into browser
    ssh kksilo
    cd /cluster/data/pt0/bed/blastz-blatHg16
    mkdir pslBest
cat > makePsl.csh << 'EOF'
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo "processing $c.axt"
        axtToPsl axtBest/${c}.axt \
        /cluster/data/hg16/chrom.sizes \
                /cluster/data/pt0/scaffold.sizes \
                        pslBest/${c}_blastzBestPt0.psl
    end
'EOF'
    csh makePsl.csh >&! makePsl.log &
    tail -100f makePsl.log

    # Load tables
    ssh hgwdev
    #cd /cluster/data/pt0/bed/blastz-blatHg16/pslBest
    #~/bin/i386/hgLoadPsl -qPrefix=p0- hg16 chr*_blastzBestPt0.psl

    cd /gbdb/hg16
    set table = axtBestPt0
    mkdir -p $table
    cd $table
    foreach p (/cluster/data/pt0/bed/blastz-blatHg16/axtBest/*.axt)
        echo $p
        set f = $p:t
        ln -s $p $f
    end
    cd /cluster/data/pt0/bed/blastz-blatHg16/axtBest
    hgLoadAxt hg16 axtBestPt0

    # extract chains and copy to download area
    ssh kksilo
    cd /cluster/data/pt0/bed/blastz-blatHg16
    ~/bin/i386/netChainSubset human.best.2.net all.chain human.best.2.chain
    #cp chimp.chain /usr/local/apache/htdocs/kate/chimp
    #cd /usr/local/apache/htdocs/kate/chimp
    #gzip chimp.chain

    # chimp reference chains
    /cluster/bin/i386/netSyntenic chimp.2.net chimp.2.syn.net
    ~/bin/i386/netFilter -chimpSyn chimp.2.syn.net > chimp.best.2.net
    ~/bin/i386/netChainSubset chimp.best.2.net all.swap.chain chimp.best.2.chain
    #cp chimp.chain /usr/local/apache/htdocs/kate/chimp
    #cd /usr/local/apache/htdocs/kate/chimp
    #gzip chimp.chain

    #gzip --fast *.chain


# CREATING SCAFFOLD LEVEL COMPRESSED QUALITY SCORES AND HI QUALITY DIFFERENCE FILES

    # Create compressed scaffold quality file
    ssh kksilo
    cd /cluster/data/pt0
    zcat contigs.quals.gz | qaToQac stdin stdout | \
    	chimpSuperQuals assembly.agp stdin scaffolds.qac

    # Create bed file of high quality differences
    cd bed/blastz-blatHg16.2003-11-24
    chimpHiQualDiffs humanBestAxt.krr /cluster/data/pt0/scaffolds.qac chimpDiffs.bed

    # Load into database
    ssh hgwdev
    cd /cluster/data/pt0/bed/blastz-blatHg16.2003-11-24
    sed '{s/simpleNucDiff/chimpSimpleDiff/}' $HOME/kent/src/hg/lib/simpleNucDiff.sql | hgsql hg16
    hgLoadBed -oldTable hg16 chimpSimpleDiff chimpDiffs.bed


