# for emacs: -*- mode: sh; -*-

                                                                                
# Danio Rerio (zebrafish) from Sanger, version Zv5 (released 5/20/05)
#  Project website:
#    http://www.sanger.ac.uk/Projects/D_rerio/
#  Assembly notes:
#    http://www.sanger.ac.uk/Projects/D_rerio/Zv5_assembly_information.shtml

# DOWNLOAD SEQUENCE (DONE, 2005-06-06, hartera)
# MOVE DANRER3 DIRECTORY AND CONTENTS TO STORE11 AS STORE9 IS FULL
# (DONE, 2005-07-22, hartera)
     ssh kkstore01
     mkdir /cluster/store9/danRer3
     ln -s /cluster/store9/danRer3 /cluster/data
     cd /cluster/data/danRer3
     # Fetch the Zv5 release files from the Ensembl FTP site, one wget per
     # file. --timestamp is passed so that wget uses time-stamping when the
     # command is repeated.
     wget --timestamp \
       ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/README
     wget --timestamp \
       ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.stats
     wget --timestamp \
       ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.chunks.agp
     wget --timestamp \
       ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.scaffolds.agp
     wget --timestamp \
       ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.fa
     # 2005-07-22 MOVE danRer3 
     # store9 is 100% full, move danRer3 to store11 which is 10% full
     ssh kkstore02
     cd /cluster/store9
     nohup nice mv danRer3 /cluster/store11 &
     # Wait for the background mv to finish before re-pointing the link.
     # /cluster/data/danRer3 still exists as the symlink to store9 that was
     # made when the sequence was downloaded; ln -s will not replace an
     # existing name, so remove the old link first.
     rm /cluster/data/danRer3
     # make link to /cluster/data/danRer3
     ln -s /cluster/store11/danRer3 /cluster/data
     
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2005-06-13, hartera)
     ssh kkstore01
     mkdir -p /cluster/data/danRer3/M
     cd /cluster/data/danRer3/M
     # go to http://www.ncbi.nih.gov/ and search Nucleotide for
     # "Danio mitochondrion genome".  That shows the gi number:
     # 8576324 for the accession, AC024175
 # Use that number in the entrez linking interface to get fasta:
     wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
     # Edit chrM.fa: make sure the header line says it is the
     # Danio Rerio mitochondrion complete genome, and then replace the
     # header line with just ">chrM".
     perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
     rm *.bak
     # Make a "pseudo-contig" for processing chrM too:
     mkdir ./chrM_1
     sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
     mkdir ./lift
     echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
     echo "chrM_1" > ./lift/ordered.lst
     echo "0     M/chrM_1        16596   chrM    16596" > ./lift/ordered.lft
     # make sure this is tab delimited
# create a .agp file for chrM as hgGoldGapGl and other
# programs require a .agp file so create chrM.agp
    cat << '_EOF_' > ./chrM.agp
chrM       1       16596   1       F       AC024175.3      1       16596   +
'_EOF_'
     # Create a chrM.chunks.agp
     mkdir -p /cluster/data/danRer3/M/agps
     cd /cluster/data/danRer3/M/agps
     # Build the 12-column chunks agp line for chrM from the 9-column
     # chrM.agp: the first nine fields (including the strand in $9) are
     # copied, then chrM itself is used as the scaffold name with the same
     # start and end co-ordinates as the component.
     awk 'BEGIN {OFS="\t"} \
        {print $1, $2, $3, $4, $5, $6, $7, $8, $9, $1, $7, $8}' ../chrM.agp \
         > chrM.chunks.agp
     # make sure that all these above files are tab delimited

# Create list of chromosomes (DONE, 2005-06-08, hartera)
     ssh kkstore01
     cd /cluster/data/danRer3
     awk '{if ($1 !~ /Zv5/) print $1;}' Zv5.scaffolds.agp \
         | sort -n | uniq > chrom.lst
     cp chrom.lst chrom1to25.lst
     # add chrM
     echo "M" >> chrom.lst
     # add chrUn
     echo "Un" >> chrom.lst
     # add NA
     echo "NA" >> chrom.lst

# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2005-06-09, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    # This used to hold scripts -- better to keep them inline here 
    # Now it should just hold lift file(s) and
    # temporary scripts made by copy-paste from this file.
    mkdir /cluster/data/danRer3/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer3/bed

# GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER 
# (DONE, 2005-05-10, hartera)
# Go to http://www.girinst.org/server/RepBase/RepBase10.04.fasta
# and download zebunc.ref containing unclassified zebrafish repeats.
# Need username and password. Copy to /cluster/bluearc/RepeatMasker/Libraries/
     ssh hgwdev
     cd /cluster/bluearc/RepeatMasker/Libraries/
     perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
     # add to RepeatMasker library
     cat zebunc.ref >> RepeatMasker.lib

    # This is all in: /cluster/bluearc/RepeatMasker050305/Libraries

# CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2005-06-10, hartera)

     # The script, createAgpWithGaps.pl (see next section for creating
     # agps and FASTAs for chrNA and chrUn), was used to create a scaffolds 
     # agp file for chrUn to test the program. The agp output was compared to 
     # that from scaffoldFaToAgp and difference was found in the agp file
     # output for scaffoldFaToAgp which used 990568 as the end co-ordinate for
     # Zv5_scaffold1475 instead of 976101 as in the output from the script. So 
     # the co-ordinate numbering is different from there on. The program, 
     # scaffoldFaToAgp is creating the agp file from the FASTA file
     # so perhaps the sequence is a different size than stated in the agp file.
     # Get sequence and find the size:
     ssh kkstore01
     # work below the assembly directory so that ../Zv5.fa is the
     # downloaded scaffolds FASTA file
     cd /cluster/data/danRer3
     mkdir test
     cd test
     faOneRecord ../Zv5.fa Zv5_scaffold1475 > Zv5_scaffold1475.fa
     faSize Zv5_scaffold1475.fa
     # 990568 bases
     rm Zv5_scaffold1475.fa 
     # reported this inconsistency to Mario Caccamo at Sanger
     # mc2@sanger.ac.uk (2005-06-09) and new scaffolds and chunks agp files
     # were sent on 2005-06-10. There was a chunk (contig) missing from the 
     # chunks agp file and the scaffold therefore had the wrong end 
     # co-ordinate in the agp files.
     # check all sizes of scaffold sequences against those in the agp files
     ssh kkr1u00
     cd /cluster/data/danRer3 
     mkdir -p /iscratch/i/danRer3/scaffolds
     cp Zv5.fa /iscratch/i/danRer3/scaffolds/
     iSync
     
     ssh kk
     mkdir -p /cluster/data/danRer3/scaffolds/run
     # output directory that getSizes.csh writes its .size files to
     mkdir -p /cluster/bluearc/danRer3/scaffolds
     cd /cluster/data/danRer3/scaffolds/run
     # list every scaffold name in the FASTA file; Zv5.fa is two levels up
     # from the run directory, in /cluster/data/danRer3
     grep '>' ../../Zv5.fa | sed -e 's/>//' > Zv5.scaffolds.lst
cat << '_EOF_' > getSizes.csh
#!/bin/csh -fe
# Usage: getSizes.csh <scaffoldName>
# Extracts one scaffold record from the Zv5 FASTA file, then writes the
# scaffold name followed by the faSize report to $dir/<scaffoldName>.size.
# The script body is not indented so that "#!" is at the very start of the
# generated file, and the first write truncates the .size file so that a
# repeated job does not append a second copy.
set dir=/cluster/bluearc/danRer3/scaffolds
faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
echo $1 > $dir/$1.size
faSize $dir/$1.fa >> $dir/$1.size
rm $dir/$1.fa
'_EOF_'
     # << this line makes emacs coloring happy
     chmod +x getSizes.csh
cat << '_EOF_' > gsub
#LOOP
getSizes.csh $(path1)
#ENDLOOP
'_EOF_'
     # << this line makes emacs coloring happy 
     gensub2 Zv5.scaffolds.lst single gsub jobList
     para create jobList 
     para try,check,push,check etc...
    
     ssh kkstore01
     cd /cluster/bluearc/danRer3/scaffolds
     foreach f (*.size)
        cat $f >> Zv5.scaffolds.sizes
     end	  
     cd /cluster/data/danRer3/scaffolds
     mv /cluster/bluearc/danRer3/scaffolds/Zv5.scaffolds.sizes .
     # Check that these sizes correspond to the sizes in the scaffolds agp file
     # use script compareSizes.pl
     cat << '_EOF_' > compareSizes.pl
#!/usr/bin/perl -w
# compareSizes.pl <sizes file> <agp file>
#
# Compares the sequence size of each scaffold with the component end
# co-ordinate (8th field) given for that scaffold in the agp file, and
# writes one report line per agp line to log.txt in the current directory.
# The sizes file is expected to hold, for each scaffold, a line starting
# with the scaffold name (Zv5_scaffoldN or Zv5_NAN) followed by a line
# starting with "<size> bases".
use strict;

my ($file, $agp);

$file = $ARGV[0];
$agp = $ARGV[1];
if (!defined($file) || !defined($agp))
   {
   die "Usage: compareSizes.pl <sizes file> <agp file>\n";
   }

open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";

# scaffold name -> sequence size in bases
my ($name, %scafsHash);
while (my $l = <FILE>)
{
if ($l =~ /^(Zv5_(scaffold|NA)[0-9]+)/)
   {
   $name = $1;
   }
elsif (defined($name) && $l =~ /^([0-9]+)\sbases/)
   {
   # a size line only counts once a scaffold name has been seen
   $scafsHash{$name} = $1;
   }
}
close FILE;

while (my $line = <AGP>)
{
chomp($line);
my @fi = split(/\t/, $line);
# skip blank or truncated lines that have no 8th field
next if (@fi < 8);

my $scaf = $fi[5];   # 6th field: scaffold name (gap lines hold something else)
my $end = $fi[7];    # 8th field: component end co-ordinate

if (exists($scafsHash{$scaf}))
   {
   # sizes are numbers, so compare numerically rather than as strings
   if ($scafsHash{$scaf} == $end)
      {
      print OUT "$scaf - ok\n";
      }
   else
      {
      print OUT "$scaf - different size to sequence\n";
      }
   }
else
   {
   print OUT "$scaf - does not exist in list of sizes\n";
   }
}
close AGP;
close OUT;
'_EOF_'
   # << happy emacs
   chmod +x compareSizes.pl
   perl compareSizes.pl Zv5.scaffolds.sizes ../Zv5.scaffolds.agp
   # the only lines where no ID was found in the list of scaffolds with sizes
   # were those lines for gaps.
   # compareSizes.pl writes its report to log.txt, so search that file
   grep "different" log.txt
   # Zv5_scaffold1475 - different size to sequence
   # so only this scaffold is a different size in the agp to the sequence
   # need to check that sizes are consistent between agp files 
   # check also new agp file for scaffolds - newAgps/Zv5.scaffolds.agp
   perl compareSizes.pl Zv5.scaffolds.sizes ../newAgps/Zv5.scaffolds.agp
   # these are all consistent with the sequence sizes
   cd /cluster/data/danRer3/newAgps/
   # print out scaffold names where the co-ordinates are not consistent
   # with sizes given
   awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.scaffolds.agp \
       > Zv5.scaffolds.coordCheck 
   # this file is empty so they are ok. do the same for the chunks.agp file
   # print component names from the chunks agp whose span ($3-$2+1) does
   # not equal the component end ($8); the backslash must be the last
   # character on the line for the redirect to belong to this command
   awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.chunks.agp \
       > Zv5.chunks.coordCheck
   # also empty so ok. check that the difference between $7 and $8 is the
   # same as the difference between $11 and $12 fields
   awk '{if ($6 != 5000 && (($8 - $7) != ($12 - $11))) print $6;}' \
       Zv5.chunks.agp > Zv5.chunks.coordCheck2
   # these are all ok
   rm Zv5.*.coord*
cat << '_EOF_' > checkSizesInAgps.pl
#!/usr/bin/perl -w
# checkSizesInAgps.pl <scaffolds agp> <chunks agp>
#
# Checks that the two agp files agree on where each scaffold ends.
# For every scaffold, the end co-ordinate (3rd field) from the scaffolds agp
# is compared with the end co-ordinate (3rd field) of the last chunk line
# that names that scaffold in its 10th field. One line per scaffold is
# printed to standard output.
use strict;

my ($ch, $sc, %scafsHash);
$sc = $ARGV[0]; # scaffolds agp
$ch = $ARGV[1]; # chunks or contigs agp

open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";

# scaffold name (6th field) -> end co-ordinate (3rd field)
while (my $l = <SCAFS>)
{
my @f = split(/\t/, $l);
if ($f[5] =~ /^Zv5/)
   {
   $scafsHash{$f[5]} = $f[2];
   }
}
close SCAFS;

my $scaf = "";
my $prev = "";
my $prevEnd = 0;

while (my $line = <CHUNKS>)
{
my @fi = split(/\t/, $line);

# lines with 5000 in the 6th field are skipped (gap lines)
if ($fi[5] ne "5000")
   {
   $scaf = $fi[9];
   # the scaffold name changed, so the previous line was the last chunk
   # of the previous scaffold
   if (($scaf ne $prev) && ($prev ne ""))
      {
      checkCoords($prev, $prevEnd);
      }
   $prev = $scaf;
   $prevEnd = $fi[2];
   }
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;

# checkCoords(name, end): report whether the end co-ordinate found in the
# chunks agp for scaffold "name" matches the one stored from the scaffolds
# agp. Scaffolds that are not in the scaffolds agp produce no output.
sub checkCoords {
my ($name, $end) = @_;
return unless (exists($scafsHash{$name}));
if ($scafsHash{$name} != $end)
   {
   print "Scaffold $name is not consistent between agps\n";
   }
else
   {
   print "Scaffold $name - ok\n";
   }
}
'_EOF_'
   # << happy emacs
   chmod +x checkSizesInAgps.pl
   checkSizesInAgps.pl Zv5.scaffolds.agp Zv5.chunks.agp \
         > Zv5.scafsvschunks
   grep "not consistent" Zv5.scafsvschunks
   # no lines where an inconsistency was reported
   wc -l Zv5.scafsvschunks
   # 16214 Zv5.scafsvschunks
   grep "Zv5" Zv5.scaffolds.agp | wc -l
   # 16214
   # so all the scaffolds were checked and were ok.
   cd /cluster/data/danRer3
   # move the agp files from newAgps into the assembly directory, keep the
   # two checking scripts in jkStuff, then remove newAgps
   mv ./newAgps/Zv5.scaffolds.agp .
   mv ./newAgps/Zv5.chunks.agp .
   mv ./scaffolds/compareSizes.pl ./jkStuff/
   mv ./newAgps/checkSizesInAgps.pl ./jkStuff/
   rm -r newAgps

# SPLIT AGP FILES BY CHROMOSOME (DONE, 2005-06-13, hartera)
# FASTA WAS CREATED USING SCAFFOLDS AGP
     ssh kkstore01
     cd /cluster/data/danRer3
     # There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
     # then one for chunks (contigs on danRer1) showing how they map on to
     # scaffolds.

     # get list of scaffolds from FASTA file and check these are in agp
     grep '>' Zv5.fa | sed -e 's/>//' | sort | uniq > Zv5FaScafs.lst
     # get list of scaffolds from agp - do not print from gap lines
     awk '{if ($7 !~ /contig/) print $6;}' Zv5.scaffolds.agp \
        | sort | uniq > Zv5AgpScafs.lst
     diff Zv5FaScafs.lst Zv5AgpScafs.lst
     # no difference so all scaffolds are in the FASTA file
     # add "chr" prefix for the agp files
     perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
     # for chromosomes:
     foreach c (`cat chrom1to25.lst`)
       echo "Processing $c ..."
       mkdir $c
       perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
         ./Zv5.chunks.agp \
         > $c/chr$c.chunks.agp
       perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
         ./Zv5.scaffolds.agp \
         > $c/chr$c.scaffolds.agp
     end

# CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2005-06-13, hartera)
     ssh kkstore01
     # chrNA consists of WGS contigs that could not be related to any 
     # FPC contig and the scaffolds and contigs are named Zv5_NAN in the 
     # first field of the agp files
     cd /cluster/data/danRer3
     mkdir ./NA
     awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.chunks.agp \
         > ./NA/NA.chunks.agp
     awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.scaffolds.agp \
         > ./NA/NA.scaffolds.agp
     # change the first field to "chrNA" then can use agpToFa to process
     perl -pi.bak -e 's/Zv5_NA[0-9]+/chrNA/' ./NA/*.agp
     # check files and remove backup files
     rm ./NA/*.bak
     # then process chrUn.
     # Re-make chrUn with new agp files - this is made from scaffolds and  
     # contigs where the name is Zv5_scaffoldN in the first field of the 
     # agp files. These scaffolds and contigs are unmapped to chromosomes
     # in the agp file. chrUn is made up of WGS scaffolds that mapped to 
     # FPC contigs, but the chromosome is unknown.
     rm -r Un
     mkdir ./Un
     awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.chunks.agp \
         > ./Un/Un.chunks.agp
     awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.scaffolds.agp \
         > ./Un/Un.scaffolds.agp
     # change the first field to "chrUn" then can use agpToFa to process
     perl -pi.bak -e 's/Zv5_scaffold[0-9]+/chrUn/' ./Un/*.agp
     # check files and remove backup files
     rm ./Un/*.bak

     # get FASTA file of sequences for NA and Un and create agp with 
     # Ns between scaffolds
     # from scaffolds agp, get name of scaffolds to retrieve from the FASTA 
     # file to make the NA and Un chromosomes.
     foreach c (NA Un)
       awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
       $HOME/bin/i386/faSomeRecords /cluster/data/danRer3/Zv5.fa \
          $c/chr$c.scaffolds.lst $c/chr$c.fa
     end
     # check that all scaffolds in list are in FASTA file for NA and Un - ok
     # edit scaffoldFaToAgp.c so that it creates agp with 500Ns between 
     # scaffolds as contig gaps for chrNA and compile. chrNA is already large
     # so the number of Ns are reduced to reduce the size.
     foreach c (NA Un)
        $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
        mv $c/chr$c.fa $c/chr$c.scaffolds.fa
     end
     # change chrUn to chrNA for NA and D to W for NA and Un
     sed -e 's/chrUn/chrNA/' ./NA/chrNA.agp | sed -e 's/D/W/' \
         > ./NA/chrNA.scaffolds.agp
     sed -e 's/D/W/' ./Un/chrUn.agp > ./Un/chrUn.scaffolds.agp
     # edit ./NA/chrNA.scaffolds.agp and ./Un/chrUn.scaffolds.agp and 
     # remove last line as this just adds an extra 500 Ns at the 
     # end of the sequence.
     rm ./NA/chrNA.agp ./Un/chrUn.agp

cat << '_EOF_' > /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
#!/usr/bin/perl
use strict;

# createAgpWithGaps.pl <chrom name> <number of Ns> <contigs|scaffolds>
#
# This script takes a chunks agp and inserts Ns between scaffolds for 
# the chunks (contigs) agp file. Could also insert Ns between scaffolds
# for scaffolds agp.
# The agp is read from standard input and the new agp is written to
# standard output. Every line is renumbered and re-positioned so that the
# co-ordinates run continuously along the chromosome, and a gap line of the
# requested size is written whenever the scaffold name changes. The scaffold
# name is taken from the 10th field for "contigs" and from the 6th field
# otherwise.

my ($name, $prev, $st, $end, $prevEnd, $id);
my $chrom = $ARGV[0]; # chromosome name
my $numN = $ARGV[1];  # number of Ns to be inserted 
my $type = $ARGV[2]; # contigs or scaffolds

$prev = "";
$prevEnd = 0;
$id = 0;

while (my $l = <STDIN>)
{
# strip the newline so that it can be written explicitly after the last
# output field
chomp($l);
my @f = split(/\t/, $l);

if ($type eq "contigs")
   {
   $name = $f[9];
   }
else 
   {
   $name = $f[5];
   }

# the length of the line's span, less one, so that $st + $size is its end
my $size = $f[2] - $f[1];

$id++;

# scaffold changed: write a gap line first, which takes the current id
if (($prev ne "") && ($prev ne $name))
   {
   $st = $prevEnd + 1;
   $end = ($st + $numN) - 1;
   print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n";
   $prevEnd = $end;
   $id++;
   }

$st = $prevEnd + 1;
$end = $st + $size;
print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]";
if ($type eq "contigs")
   {
   print "\t$f[9]\t$f[10]\t$f[11]";
   }
print "\n";

$prevEnd = $end;
$prev = $name;
}
'_EOF_'
     chmod +x /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
     cd /cluster/data/danRer3
     foreach c (NA Un)
        cd $c
        perl ../jkStuff/createAgpWithGaps.pl chr${c} 500 contigs \
             < ${c}.chunks.agp > chr${c}.chunks.agp
        cd ..
     end
     # check co-ordinates
     # clean up
     # remove the intermediate agp, FASTA and scaffold list files; the list
     # file was created as chr${c}.scaffolds.lst
     foreach c (NA Un)
        rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \
           $c/chr${c}.scaffolds.lst
     end
   
# BUILD CHROM-LEVEL SEQUENCE (DONE, 2005-06-13, hartera)
     ssh kkstore01
     cd /cluster/data/danRer3
     # Sequence is already in upper case so no need to change
     foreach c (`cat chrom.lst`)
       echo "Processing ${c}"
       $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
         $c/chr$c.fa ./Zv5.fa
       echo "${c} - DONE"
     end
     # move scaffolds agp to be chrom agp and clean up
     foreach c (`cat chrom.lst`)
        cd $c
        rm *.bak
        cp chr${c}.scaffolds.agp chr${c}.agp
        mkdir -p agps
        mv chr${c}.*.agp ./agps/
        cd ..
     end

# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2005-06-13, hartera)
     # Check that the size of each chromosome .fa file is equal to the
     # last coord of the .agp:
     ssh hgwdev
     cd /cluster/data/danRer3
     foreach c (`cat chrom.lst`)
       foreach f ( $c/chr$c.agp )
         set agpLen = `tail -1 $f | awk '{print $3;}'`
         set h = $f:r
         set g = $h:r
         echo "Getting size of $g.fa"
         set faLen = `faSize $g.fa | awk '{print $1;}'`
         if ($agpLen == $faLen) then
           echo "   OK: $f length = $g length = $faLen"
         else
           echo "ERROR:  $f length = $agpLen, but $g length = $faLen"
         endif
       end
     end
     # all are OK so FASTA files are the expected size

# CREATING DATABASE (DONE, 2005-06-13, hartera)
    # Create the database.
    # next machine
    ssh hgwdev
    echo 'create database danRer3' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
    # NOTE: this drop is recorded as a live command; skip it when replaying
    # this section unless the database created above must be removed.
    echo 'drop database danRer3' | hgsql danRer3
    # Delete and re-create database as above (hartera, 2004-11-30)
    # Use df to make sure there is at least 10 gig free on /var/lib/mysql
    df -h /var/lib/mysql
# Before loading data:
# Filesystem            Size  Used Avail Use% Mounted on
# /dev/sdc1             1.8T  927G  734G  56% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2005-06-13, hartera)
    # next machine
    ssh hgwdev
    #  the following command copies all the data from the table
    #  grp in the database danRer2 to the new database danRer3
    echo "create table grp (PRIMARY KEY(NAME)) select * from danRer2.grp" \
      | hgsql danRer3
    # if you need to delete that table:   !!! WILL DELETE ALL grp data !!!
    # NOTE: this drop is recorded as a live command; skip it when replaying
    # this section unless the grp table created above must be removed.
    echo 'drop table grp;' | hgsql danRer3

# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2005-06-14, hartera)

     ssh kkstore01
     cd /cluster/data/danRer3
     foreach c (`cat chrom.lst`)
       foreach agp ($c/chr$c.agp)
         if (-e $agp) then
           set fa = $c/chr$c.fa
           echo splitting $agp and $fa
           cp -p $agp $agp.bak
           cp -p $fa $fa.bak
           splitFaIntoContigs $agp $fa . -nSize=5000000
         endif
       end
     end

# MAKE LIFTALL.LFT (DONE, 2005-06-14, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    cat */lift/ordered.lft > jkStuff/liftAll.lft 

# SIMPLE REPEAT [TRF] TRACK  (DONE, 2005-06-14, hartera)
    # TRF can be run in parallel with RepeatMasker on the file server
    # since it doesn't require masked input sequence.
    # Run this on the kilokluster. Need to mask contig and chromosome 
    # sequences so run trf using contig sequences.
    # First copy over contig sequences to iscratch and then iSync to cluster.
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/contigsNoMask
    cd /cluster/data/danRer3
    foreach d (/cluster/data/danRer3/*/chr*_?{,?})
       set ctg = $d:t
       foreach f ($d/${ctg}.fa)
          echo "Copyig $f ..."
          cp $f /iscratch/i/danRer3/contigsNoMask/
       end
    end
    # 288 sequence files
    /cluster/bin/iSync

    ssh kk
    mkdir -p /cluster/data/danRer3/bed/simpleRepeat
    cd /cluster/data/danRer3/bed/simpleRepeat
    mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
# runTrf <input fasta path> <output bed path>
# Copies the input sequence into a scratch directory under /tmp, runs trfBig
# on it there, and copies the result back to the output path.
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
# the scratch directory is named after the output file name
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
# remove any earlier result, then copy back the file named by -bedAt
# (presumably the bed output written by trfBig in the scratch directory)
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
# clean up the scratch directory
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << keep emacs coloring happy
    chmod +x runTrf
                                                                                
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy
                                                                                
    ls -1S /iscratch/i/danRer3/contigsNoMask/chr*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    # 288 jobs
    para create jobList
    para try, check, push, check etc...
    para time
# Completed: 288 of 288 jobs
# CPU time in finished jobs:      70742s    1179.03m    19.65h    0.82d  0.002 y
# IO & Wait Time:                  1263s      21.05m     0.35h    0.01d  0.000 y
# Average job time:                 250s       4.17m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6722s     112.03m     1.87h    0.08d
# Submission to last job:         10037s     167.28m     2.79h    0.12d

    # lift up to chrom level
    liftUp simpleRepeat.bed /cluster/data/danRer3/jkStuff/liftAll.lft warn \
           trf/*.bed

    # Load into the database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/simpleRepeat
    hgLoadBed danRer3 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Loaded 757119 elements of size 16

# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/danRer3/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end

    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/danRer3
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end

# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2005-06-15, hartera)
    # When a new library is added for this version of repeatMasker, need to 
    # check in /cluster/bluearc/RepeatMasker/Libraries for a directory made 
    # up of a date e.g. 20050112 here and inside this are species directories
    # for which RepeatMasker has already been run. In this directory it creates
    # a specieslib of the danio repeats. If this exists, this is used for the
    # RepeatMasker run for that species so if new repeats are added to the
    # library, they will not get used unless this is deleted and a new
    # specieslib is created using the new library on the first run for danio.
    ssh kkstore01
    rm -r /cluster/bluearc/RepeatMasker/Libraries/20050112/danio/
    cd /cluster/data/danRer3
    #- Split contigs into 500kb chunks, at gaps if possible:
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -minGapSize=100
        cd ../..
      end
    end

    # For RepeatMasking, use RepeatMasker "open-3.0" with repeat library
    # version RepBase Update 9.11, RM database version 20050112 with the 
    # addition of the zebrafish unclassified repeats (zebunc.ref) - see above
    # section on getting this additional zebrafish RepeatMasker library. 
    #- Make the run directory and job list:
    cd /cluster/data/danRer3
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe
# RMZebrafish <directory> <fasta file name>
# Runs RepeatMasker on one sequence file. The file is copied from the given
# directory to a scratch directory under /tmp/danRer3, masked there, and the
# RepeatMasker output files are copied back to the given directory.

cd $1
pushd .
# scratch directory is named after the sequence file
/bin/mkdir -p /tmp/danRer3/$2
/bin/cp $2 /tmp/danRer3/$2/
cd /tmp/danRer3/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
# return to the directory given as the first argument
popd
# the .out file is always copied back; the other output files are copied
# only when they exist
/bin/cp /tmp/danRer3/$2/$2.out ./
if (-e /tmp/danRer3/$2/$2.align) /bin/cp /tmp/danRer3/$2/$2.align ./
if (-e /tmp/danRer3/$2/$2.tbl) /bin/cp /tmp/danRer3/$2/$2.tbl ./
if (-e /tmp/danRer3/$2/$2.cat) /bin/cp /tmp/danRer3/$2/$2.cat ./
# clean up the scratch directories, leaving them in place if not empty
/bin/rm -fr /tmp/danRer3/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3
'_EOF_'
    chmod +x jkStuff/RMZebrafish
    mkdir -p RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/danRer3/jkStuff/RMZebrafish \
                 /cluster/data/danRer3/$d $f \
               '{'check out line+ /cluster/data/danRer3/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end
    # Do the run
    ssh kk 
    cd /cluster/data/danRer3/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    para time
# Completed: 4069 of 4069 jobs
# CPU time in finished jobs:   13726314s  228771.90m  3812.87h  158.87d  0.435 y
# IO & Wait Time:                 45762s     762.70m    12.71h    0.53d  0.001 y
# Average job time:                3385s      56.41m     0.94h    0.04d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            4549s      75.82m     1.26h    0.05d
# Submission to last job:         56947s     949.12m    15.82h    0.66d
# This is slow. It should have taken about 5 hours.

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kkstore01
    cd /cluster/data/danRer3
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end
                                                                                
    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
        > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer3
    hgLoadOut danRer3 */chr*.fa.out -verbose=2
# bad rep range [689, 602] line 105524 of 16/chr16.fa.out 
# bad rep range [147, 146] line 124027 of 16/chr16.fa.out
# bad rep range [280, 258] line 754 of 17/chr17.fa.out 
# bad rep range [280, 258] line 76417 of 17/chr17.fa.out
# bad rep range [314, 311] line 99427 of 19/chr19.fa.out
# bad rep range [367, 366] line 88398 of 23/chr23.fa.out 
# bad rep range [41, 40] line 51509 of 25/chr25.fa.out
# bad rep range [1133, 1132] line 62610 of 9/chr9.fa.out
# bad rep range [6133, 6132] line 122359 of NA/chrNA.fa.out 
# bad rep range [6133, 6132] line 160183 of NA/chrNA.fa.out 
# bad rep range [292, 291] line 252829 of NA/chrNA.fa.out 
# bad rep range [751, 599] line 261276 of NA/chrNA.fa.out 
# bad rep range [360, 359] line 259794 of Un/chrUn.fa.out 
# bad rep range [360, 359] line 259796 of Un/chrUn.fa.out 
# bad rep range [360, 359] line 259798 of Un/chrUn.fa.out 
# bad rep range [1, -56] line 379516 of Un/chrUn.fa.out
# note: 16 records dropped due to repStart > repEnd

# check coverage of repeats masked
# featureBits -chrom=chr1 danRer1 rmsk
# 11589712 bases of 40488791 (28.624%) in intersection
# featureBits -chrom=chr1 danRer2 rmsk
# 26879295 bases of 61678023 (43.580%) in intersection
# featureBits -chrom=chr1 danRer3 rmsk
# 25822888 bases of 55805710 (46.273%) in intersection

# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES
# (DONE, 2005-06-15, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked.
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    # for the chromosomes:
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
# This warning is extremely rare -- if it indicates a problem, it is only with
# the repeat annotation and does not affect the masking:
# repeat- and trf-masking Un/chrUn.fa
# WARNING: negative rEnd: -56 chrUn:153329594-153329609 MOSAT_DR
    # for the contigs:
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
# same warning here too:
# repeat- and trf-masking contigs of chrUn
# WARNING: negative rEnd: -56 chrUn_26:1159145-1159160 MOSAT_DR
    # check percent sequence masked
    faSize /cluster/data/danRer3/1/chr1.fa
    # 55805710 bases (1047706 N's 54758004 real 28887275 upper 25870729 lower)
    # 46% is in lower case so masked
    # for danRer2:
    faSize /cluster/data/danRer2/1/chr1New.fa
    # 62208023 bases (3421437 N's 58786586 real 31874160 upper 26912426 lower)
    # 43% is in lower case so masked
    # Build nib files, using the soft masking in the fa
    mkdir nib
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE, 2005-06-15, hartera)
# Added link from danRer3.2bit file to the danRer3 gbdb directory
# (2005-06-17, hartera)
    # Make symbolic links from /gbdb/danRer3/nib to the real nibs
    ssh hgwdev
    cd /cluster/data/danRer3
    mkdir -p /gbdb/danRer3/nib
    foreach f (/cluster/data/danRer3/nib/chr*.nib)
      ln -s $f /gbdb/danRer3/nib
    end

# Load /gbdb/danRer3/nib paths into database and save size info
    # hgNibSeq creates chromInfo table
    hgNibSeq -preMadeNib danRer3 /gbdb/danRer3/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N danRer3 > chrom.sizes
    # take a look at chrom.sizes, should be 28 lines
    wc chrom.sizes
    # 28      56     409 chrom.sizes
    
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/danRer3/nib because hgBlat looks there:
    faToTwoBit */chr*.fa danRer3.2bit
    # add link to this 2bit file from gbdb danRer3 directory (2005-06-17)
    ln -s /cluster/data/danRer3/danRer3.2bit /gbdb/danRer3/
    # also make 2 bit files for chrUn and chrNA later on - need masked seq
    # make 2 bit files for chrUn and chrNA scaffolds (2005-06-17)
    ssh kkstore01
    cd /cluster/data/danRer3
    # make scaffolds files
    foreach c (NA Un)
       cd $c
       echo "Processing $c ..."
       mkdir scafSeqs
       awk '{if ($5 != "N") print $6;}' chr${c}.agp > scafSeqs/scaffolds.lst
       cd ..
    end 
    cd /cluster/data/danRer3/NA/scafSeqs
cat << '_EOF_' > getSeqs.csh
#!/bin/csh -fe
# Usage: getSeqs.csh <scaffoldName>
# Extracts one scaffold record from the Zv5 FASTA file into
# $dir/<scaffoldName>.fa. The script body is not indented so that "#!" is
# at the very start of the generated file.
set dir=/cluster/bluearc/danRer3/scaffolds
faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
'_EOF_'
     # << this line makes emacs coloring happy
     chmod +x getSeqs.csh
cat << '_EOF_' > gsub
#LOOP
getSeqs.csh $(path1)
#ENDLOOP
'_EOF_'
     # << this line makes emacs coloring happy 
     ssh kk
     cd /cluster/data/danRer3/NA/scafSeqs
     gensub2 scaffolds.lst single gsub jobList
     para create jobList 
     para try,check,push,check etc...
    
     ssh kkstore01
     cd /cluster/bluearc/danRer3/scaffolds
     # The "foreach f (*.size)" that stood here had no matching "end" and its
     # loop variable was never used, so it is disabled; each 2bit file is
     # built by a single faToTwoBit call over all of the scaffold FASTA files.
     # foreach f (*.size)
     # NOTE(review): getSeqs.csh writes its output directly into this
     # directory - confirm that the chrNA/ and chrUn/ subdirectories used
     # below match where the scaffold .fa files really are.
    faToTwoBit ./chrNA/scafSeqs/*.fa danRer3ChrNA.2bit
    # "scafSeqs *.fa" was missing the "/" between the directory and the glob
    faToTwoBit ./chrUn/scafSeqs/*.fa danRer3ChrUn.2bit

# MAKE GOLD AND GAP TRACKS (DONE, 2005-06-15, hartera)
# Add trackDb entry and html page for gold and gap tracks (2005-06-16, hartera)
    ssh hgwdev
    cd /cluster/data/danRer3
    # the gold and gap tracks are created from the chrN.agp file and this is
    # the scaffolds or supercontigs agp 
    hgGoldGapGl -noGl -chromLst=chrom.lst danRer3 /cluster/data/danRer3 .
    # featureBits danRer3 gold
    # 1630323462 bases of 1630323462 (100.000%) in intersection
    # featureBits danRer2 gold
    # 1560497282 bases of 1560497282 (100.000%) in intersection
    # featureBits danRer1 gold
    # 1459132082 bases of 1459132082 (100.000%) in intersection

    # featureBits danRer3 gap
    # 13709500 bases of 1630323462 (0.841%) in intersection
    # featureBits danRer2 gap
    # 28776000 bases of 1560497282 (1.844%) in intersection
    # featureBits danRer1 gap
    # 64174000 bases of 1459132082 (4.398%) in intersection
# Add trackDb.ra entries for gold and gap tracks and also create
# gap.html and gold.html pages.

# MAKE TRACKDB ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
    ssh hgwdev
    # Make trackDb table so browser knows what tracks to expect:
    mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
    cd ~/kent/src/hg/makeDb/trackDb/zebrafish
    cvs add danRer3
    cvs commit danRer3
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add danRer3 in all the right places and do
    make update
    make alpha
    cvs commit -m "Added danRer3." makefile
    
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2005-06-16, hartera)
    ssh hgwdev
    mkdir /cluster/data/danRer3/html
   # make a symbolic link from /gbdb/danRer3/html to /cluster/data/danRer3/html
    ln -s /cluster/data/danRer3/html /gbdb/danRer3/html
    # Add a description page for zebrafish
    cd /cluster/data/danRer3/html
    cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2/description.html .
    # Edit this for zebrafish danRer3
                                                                                
    # create a description.html page here
    cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
    # Add description page here too
    cp /cluster/data/danRer3/html/description.html .
    cvs add description.html
    cvs commit -m "First draft of description page for danRer3." \
        description.html
    cd ~/kent/src/hg/makeDb/trackDb
    make update
    make alpha

# MAKE HGCENTRALTEST ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
# UPDATE ENTRY TO ADD DANRER3 TO GENE SORTER (DONE, 2006-06-09, hartera)
    # Make dbDb and defaultDb entries so test browser knows about it:
    ssh hgwdev
    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism,  \
          defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk, hgPbOk, sourceName)  \
          values("danRer3", "May 2005", \
          "/gbdb/danRer3", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \
          37, "Zebrafish", "Danio rerio", \
          "/gbdb/danRer3/html/description.html", 0,  0, \
          "Sanger Centre, Danio rerio Sequencing Project Zv5");' \
    | hgsql -h genome-testdb hgcentraltest
    # set danRer3 to be the default assembly for Zebrafish
    echo 'update defaultDb set name = "danRer3" \
          where genome = "Zebrafish";' \
          | hgsql -h genome-testdb hgcentraltest
    # Update dbDb entry for danRer3 to add it to Gene Sorter 
    # (hartera, 2006-06-09)
    echo 'update dbDb set hgNearOk = 1 where name = "danRer3";' \
         | hgsql -h genome-testdb hgcentraltest

# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS AND ON BLUEARC
# (DONE, 2005-06-16, hartera)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer3/nib
    mkdir -p /iscratch/i/danRer3/nib
    cp -p /cluster/data/danRer3/nib/chr*.nib /iscratch/i/danRer3/nib
    # Pseudo-contig fa that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer3/trfFa
    mkdir /iscratch/i/danRer3/trfFa
    foreach d (/cluster/data/danRer3/*/chr*_?{,?})
      cp -p $d/$d:t.fa /iscratch/i/danRer3/trfFa
    end
    rm -rf /iscratch/i/danRer3/rmsk
    mkdir -p /iscratch/i/danRer3/rmsk
    cp -p /cluster/data/danRer3/*/chr*.fa.out /iscratch/i/danRer3/rmsk
    cp -p /cluster/data/danRer3/danRer3.2bit /iscratch/i/danRer3/
    /cluster/bin/iSync
    # add to the bluearc
    ssh kkstore01
    mkdir -p /cluster/bluearc/danRer3/nib
    cp -p /cluster/data/danRer3/nib/chr*.nib /cluster/bluearc/danRer3/nib
    mkdir -p /cluster/bluearc/danRer3/trfFa
    foreach d (/cluster/data/danRer3/*/chr*_?{,?})
      cp -p $d/$d:t.fa /cluster/bluearc/danRer3/trfFa
    end
    cp /cluster/data/danRer3/danRer3.2bit /cluster/bluearc/danRer3/

# ADD CONTIGS TRACK (DONE, 2005-06-16, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from 
# chunks (contigs) agp files.
    ssh kkstore01
    mkdir -p /cluster/data/danRer3/bed/ctgPos2
    cd /cluster/data/danRer3/bed/ctgPos2
    # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
         awk 'BEGIN {OFS="\t"} \
         {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
         /cluster/data/danRer3/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
    end
                                                                                
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ctgPos2
    hgsql danRer3 < ~/kent/src/hg/lib/ctgPos2.sql
    echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
         | hgsql danRer3
# create trackDb.ra entry and html page for ctgPos2 track.
    # Changed termRegEx for ctgPos2 in trackDb.ra so that it handles 
    # contigs named "Zv5_scaffold*". (2006-04-19, hartera)

# CREATE gc5Base WIGGLE TRACK (DONE, 2005-06-16, hartera)
# FIX LINK FOR WIB FILES TO POINT TO danRer3 ON store11 (2005-07-25, hartera)
    ssh kkstore01
    mkdir -p /cluster/data/danRer3/bed/gc5Base
    cd /cluster/data/danRer3/bed/gc5Base
    # The number of bases that hgGcPercent claimed it measured is calculated,
    # which is not necessarily always 5 if it ran into gaps, and then the
    # division by 10.0 scales down the numbers from hgGcPercent to the range
    # [0-100].  wigEncode now replaces wigAsciiToBinary and the previous
    # processing step between these two programs. The result file is *.wig.
    # Each value represents the measurement over five bases beginning with
    # <position>. wigEncode also calculates the zoomed set of data.
    # Uses the 2bit file in /cluster/data/danRer3 as sequence input.
                                                                                
    nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer3 \
        /cluster/data/danRer3 | \
        wigEncode stdin gc5Base.wig gc5Base.wib
    # load the .wig file back on hgwdev:
    ssh hgwdev
    cd /cluster/data/danRer3/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer3/wib/gc5Base \
                 danRer3 gc5Base gc5Base.wig
    # and symlink the .wib file into /gbdb
    # fix link as danRer3 is now in store 11 (2005-07-25, hartera)
    rm -r /gbdb/danRer3/wib/gc5Base
    mkdir -p /gbdb/danRer3/wib/gc5Base
    ln -s `pwd`/gc5Base.wib /gbdb/danRer3/wib/gc5Base

# MAKE 10.OOC, 11.OOC FILE FOR BLAT (DONE, 2005-06-17, hartera)
    # Use -repMatch=512 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~50% of the size of the human genome)
    ssh kkr1u00
    mkdir /cluster/data/danRer3/bed/ooc
    cd /cluster/data/danRer3/bed/ooc
    mkdir -p /cluster/bluearc/danRer3
    ls -1 /cluster/data/danRer3/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/danRer3/danRer3_11.ooc -repMatch=512
    # Wrote 50575 overused 11-mers to /cluster/bluearc/danRer3/11.ooc
    # For 10.ooc, repMatch = 4096 for human, so use 2048
    blat nib.lst /dev/null /dev/null -tileSize=10 \
      -makeOoc=/cluster/bluearc/danRer3/danRer3_10.ooc -repMatch=2048
    # Wrote 12574 overused 10-mers to /cluster/bluearc/danRer3/10.ooc 
    # keep copies of ooc files in this directory and copy to iscratch
    cp /cluster/bluearc/danRer3/*.ooc .
    cp -p /cluster/bluearc/danRer3/*.ooc /iscratch/i/danRer3/
    /cluster/bin/iSync

# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer3 (DONE, 2005-07-20, kuhn)
    # hgcentraltest is now on hgwdev                                            
    ssh hgwdev
   # DNA port is "0", trans prot port is "1"
 echo 'insert into blatServers values("danRer3", "blat2", "17778", "1", "0");    insert into blatServers values("danRer3", "blat2", "17779", "0", "1");' \
    | hgsql hgcentraltest
    # this enables blat and isPcr, isPcr is enabled by loading blat server
    # with tilesize=5 (ask for this when request blat servers from 
    # cluster admin).
    # if you need to delete those entries, run the two lines below; they are
    # left commented out so that pasting this whole section does not remove
    # the blatServers rows that were just inserted:
    # echo 'delete from blatServers where db="danRer3";' \
    # | hgsql hgcentraltest
    # to check the entries:
    echo 'select * from blatServers where db="danRer3";' \
    | hgsql hgcentraltest

# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2005-07-22, hartera)
# REMAKE THIS TRACK USING chrUn AND chrNA SCAFFOLDS (DONE, 2005-08-19, hartera)
# UPDATED (2006-09-27) - see separate section, UPDATE AFFY ZEBRAFISH TRACK.
    # array chip sequences already downloaded for danRer1
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyZebrafish
    # -p so that an already existing directory is not an error
    mkdir -p /cluster/bluearc/affy
    # the continuation backslash must be the last character on its line;
    # the two halves of this command had been joined onto one line
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /cluster/bluearc/affy/
    # Set up cluster job to align Zebrafish consensus sequences to danRer3
    ssh kkr1u00
    mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2005-08-19
    ln -s /cluster/data/danRer3/bed/affyZebrafish.2005-08-19 \
          /cluster/data/danRer3/bed/affyZebrafish
    cd /cluster/data/danRer3/bed/affyZebrafish
    mkdir -p /iscratch/i/affy
    cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
    /cluster/bin/iSync

    # the kilokluster is down, so run on the pitakluster
    ssh pk
    cd /cluster/data/danRer3/bed/affyZebrafish
    ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
    ls -1 /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa > genome.lst
    # for output:
    mkdir -p /san/sanvol1/danRer3/affy/pslChrom
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslChrom/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub

    gensub2 genome.lst affy.lst template.sub para.spec
    para create para.spec
    para try, check, push ... etc.
# para time
# Completed: 208 of 208 jobs
# CPU time in finished jobs:       1355s      22.59m     0.38h    0.02d  0.000 y
# IO & Wait Time:                  9988s     166.46m     2.77h    0.12d  0.000 y
# Average job time:                  55s       0.91m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              74s       1.23m     0.02h    0.00d
# Submission to last job:           217s       3.62m     0.06h    0.00d

    # then run the 2bit file of scaffolds
    ssh pk 
    cd /cluster/data/danRer3/bed/affyZebrafish
    mkdir scaffoldsNAandUnRun
    cd scaffoldsNAandUnRun
    ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
    foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/*.fa)
       ls -1 $f >> scafs.lst
    end
    mkdir -p /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn/$(root1)_$(root2).psl}\n#ENDLOOP' > template2.sub

    gensub2 scafs.lst affy.lst template2.sub para.spec
    para create para.spec
    para try, check, push ... etc.
# para time
# Completed: 14941 of 14941 jobs
# CPU time in finished jobs:      27574s     459.57m     7.66h    0.32d  0.001 y
# IO & Wait Time:                 47642s     794.03m    13.23h    0.55d  0.002 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              35s       0.58m     0.01h    0.00d
# Submission to last job:           339s       5.65m     0.09h    0.00d

    
    # need to do pslSort and lift up for each separate run
    cd /cluster/data/danRer3/bed/affyZebrafish
    # pslChrom/ and pslScaffoldsNAandUn/ are both subdirectories of
    # /san/sanvol1/danRer3/affy and are passed to pslSort below by relative
    # name, and the rsync further down picks up /san/sanvol1/danRer3/affy/*.psl,
    # so the work has to be done from that parent directory.
    cd /san/sanvol1/danRer3/affy
    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyZebrafish.psl
    # only use alignments that have at least
    # 95% identity in aligned region.
    # do not use minCover since a lot of sequence is in Un, NA and Finished
    # so genes may be split up so good to see all alignments
    # first do the chr1-25 and chrM alignments
    pslSort dirs raw.psl tmp pslChrom
    pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    # Processed 27408 alignments
    pslSort dirs rawNAandUn.psl tmp pslScaffoldsNAandUn
    pslReps -minAli=0.95 -nearTop=0.005 rawNAandUn.psl scafNAandUn.psl /dev/null
    # Processed 9888 alignments
    # lift up chrom contigs to chrom level
    liftUp affyZfishChroms.psl \
        /cluster/data/danRer3/jkStuff/liftAll.lft warn contig.psl
    liftUp affyZfishScafsNAandUn.psl \
      /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
      warn scafNAandUn.psl
    # sort and merge these files
    mkdir psl
    cp affyZfish* ./psl/
    pslSort dirs affyZebrafish.psl tmp1 psl
    
    # rsync these psl files 
    rsync -a --progress /san/sanvol1/danRer3/affy/*.psl \
         /cluster/data/danRer3/bed/affyZebrafish/
    ssh kkstore02
    cd /cluster/data/danRer3/bed/affyZebrafish
    # shorten names in psl file
    sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
    mv affyZebrafish.psl.tmp affyZebrafish.psl
    pslCheck affyZebrafish.psl
    # psl is good
    # load track into database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/affyZebrafish
    hgLoadPsl danRer3 affyZebrafish.psl
    # Add consensus sequences for Zebrafish chip
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
       /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /gbdb/hgFixed/affyProbes
                                                                                
    hgLoadSeq -abbr=Zebrafish: danRer3 \
              /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
    # Clean up
    rm batch.bak contig.psl raw.psl
    # moved affyZebrafish.html description and trackDb.ra track entry and
    # search for Affy Zebrafish track to
    # ~/kent/src/hg/makeDb/trackDb/zebrafish since it is common to all 
    # danRer assemblies. 

# LIFT FILES FROM SCAFFOLDS TO chrUn AND chrNA (DONE, 2005-07-27, hartera)
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/liftSupertoChrom
    cd /cluster/data/danRer3/liftSupertoChrom
    # lift files are already created when scaffoldFaToAgp was run for chrUn.fa
    # and chrNA.fa. These need to be edited as the last 500 Ns were removed 
    # from the agp file making the sequence 184125739 bp and not 184126239 bp
    # for chrUn, for chrNA, it is 253521007 bp instead of 253521507 bp and need 
    # to change chrUn to chrNA
    cp /cluster/data/danRer3/Un/tmp/chrUn.lft .
    cp /cluster/data/danRer3/NA/tmp/chrNA.lft .
    # edit to remove last lines of each file first
    # then use perl to change co-ordinates
    perl -pi.bak -e 's/184126239/184125739/' chrUn.lft
    perl -pi.bak -e 's/253521507/253521007/' chrNA.lft
    perl -pi.bak -e 's/chrUn/chrNA/' chrNA.lft
    # name the two inputs explicitly and overwrite rather than append: the
    # "*.lft" glob also matches liftNAandUnScaffoldsToChrom.lft itself once it
    # exists, and ">>" would add duplicate lift records on a re-run
    cat chrNA.lft chrUn.lft > liftNAandUnScaffoldsToChrom.lft
    # clean up 
    rm *.bak

# ENSEMBL GENES (DONE, 2005-07-29, hartera) 
    ssh hgwdev  
    mkdir -p /cluster/data/danRer3/bed/ensembl
    cd /cluster/data/danRer3/bed/ensembl
    # Get the Ensembl gene data from
    # http://www.ensembl.org/Multi/martview
    # Follow this sequence through the pages: (NOTE: this interface has changed
    # a little since danRer2)
    # Page 1) Select the Ensembl dataset (v32 here) and the 
    # Danio_rerio choice (ZFISH5 here). Hit next. 22877 entries total.
    # Ensembl 35 now (2005-11-23) and this is the same as for the version 32
    # downloaded as above. Ensembl 36 (Dec 2005) is the same as for 32 for
    # Zebrafish. Ensembl 38 (April 2006) Protein Coding genes is the same 
    # as for Ensembl 32. (Select Gene type as protein_coding on page 2).
    # Page 2) Then hit next.
    # Page 3) Choose the "Structures" Attribute Page from the pulldown menu
    # at the top. Make sure that under the GENE section, the Ensembl 
    # Attributes checked include the Ensembl Gene ID and Ensembl 
    # Transcript ID. Choose GTF as the output. Choose gzip compression.  
    # Hit export. Save as ensemblGene35.gtf.gz

    # the Ensembl gene predictions are mapped to chromosomes except for 
    # chrNA and chrUn. Use lift files for scaffolds to these chroms.
    # get chrUn and chrNA Ensembl records 
    ssh kkstore02
    cd /cluster/data/danRer3/bed/ensembl
    gunzip ensemblGene.gtf.gz
    awk '$1 ~ /^Zv5_NA[0-9]+/ || $1 ~ /^Zv5_scaffold[0-9]+/' ensemblGene.gtf \
                    > ensemblGenechrUns.gtf
    # get records for all other chroms
    awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
    wc -l *.gtf
    # 513421 ensemblGenechroms.gtf
    # 125319 ensemblGenechrUns.gtf
    # 638740 ensemblGene.gtf
    # total lines of files made equal to original file so ok
    # lift the scaffold-based chrNA/chrUn records to chrom coordinates.
    # (whitespace after the second backslash was removed - it stopped the
    # command from continuing onto the "warn ..." line)
    liftUp -type=.gtf ensemblGenechrUns.lifted \
     /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
     warn ensemblGenechrUns.gtf
     # Got 29880 lifts in 
     # /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
     sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
     cat ensemblGenechrUns.lifted >> ensGene.gtf
     # check file sizes -ok and some of the lifted co-ordinates
     # there were some erroneous lines with "1;" or "2;" - 8 lines total
     # Notified Ensembl and they fixed it so downloaded file again 
     # and reloaded into database
     # Also remove the suffix that denotes the transcript version number. 
     # This is not in the ensGtp or ensPep tables.
     perl -pi.bak -e 's/\.[0-9]+//'g ensGene.gtf
 
     # load into database
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ensembl
     hgsql -e 'drop table ensGene;' danRer3
     /cluster/bin/i386/ldHgGene danRer3 ensGene ensGene.gtf
     # Read 32143 transcripts in 638732 lines in 1 files
     # 32143 groups 27 seqs 1 sources 4 feature types
     # 32143 gene predictions

     # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
     # hgKnownToSuper.  Use ensMart to create it as above, except:
     # Page 3) Choose the "Features" box. In "Ensembl Attributes", check
     # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
     # Choose Text, tab-separated as the output format and gzip compression.  
     # Result name: ensGtp.
     gunzip ensGtp.tsv.gz
     # edit to remove first header line
     hgsql danRer3 < ~/kent/src/hg/lib/ensGtp.sql
     # (the header line must be removed from ensGtp.tsv before loading)
     echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
         | hgsql -N danRer3

         # Get the ensembl peptide sequences from
    # http://www.ensembl.org/Multi/martview
    # Choose Danio Rerio as the organism
    # Follow this sequence through the pages:
    # Page 1) Choose the Ensembl Genes choice. Hit next.
    # Page 2) Then hit next.
    # Page 3) Choose "Sequences" from the Attributes pulldown menu at the top.
    # Page 4) Choose Peptide as type of sequence to export and select 
    # Ensembl Gene ID from Gene Attributes and 
    # Ensembl Transcript ID and Ensembl Peptide Stable ID from 
    # Transcript Attributes as the output,
    # choose text/fasta and gzip compression,
    # name the file ensemblPep.fa.gz and then hit export.
    gunzip ensemblPep.fa.gz
    hgPepPred danRer3 ensembl ensemblPep.fa
    # added code to hgc.c so that the link to the Ensembl Protein
    # is also displayed on the description page.


# FOR MGC GENES:
#  - wait one day for nightly build to align and load them into the db
#  - rebuild trackDb

# SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn and chrNA INTO SCAFFOLDS
# (DONE, 2005-08-04, hartera)
# ADD SOFT-MASKED SCAFFOLDS TO ISERVERS FOR CLUSTER RUNS 
# (DONE, 2005-08-15, hartera) AND TO BLUEARC (DONE, 2005-08-19)
    ssh kkstore02
    cd /cluster/data/danRer3
    # for chrUn and chrNA, get masked sequence for soft and hard-masked 
    foreach c (Un NA)
      cd $c
      mkdir scaffoldsSoftMask scaffoldsHardMask
      awk 'BEGIN {FS="\t"}{if ($5 != "N") \
       print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' chr${c}.agp \
       >> ./scaffoldsSoftMask/faFragSoftMask.csh
      awk 'BEGIN {FS="\t"}{if ($5 != "N") \
        print "faFrag -mixed chr'${c}'.fa.masked",$2-1, $3, $6".fa.masked";}' \
        chr${c}.agp >> ./scaffoldsHardMask/faFragHardMask.csh
      cd ..
    end 

    # change permissions run scripts to get sequences
    foreach d (Un NA)
       chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh
       chmod +x $d/scaffoldsHardMask/faFragHardMask.csh
    end

    cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh
#!/bin/csh
foreach c (Un NA)
   set dir=/cluster/data/danRer3
   echo "Processing $c"
   cd $dir/$c/scaffoldsSoftMask
   cp ../chr${c}.fa .
   echo "Getting soft-masked sequences ..." 
   nice faFragSoftMask.csh >& faFrag.log
   echo "Getting hard-masked sequences ..." 
   cd $dir/$c/scaffoldsHardMask
   cp ../chr${c}.fa.masked .
   nice faFragHardMask.csh >& faFrag.log
end 
'_EOF_'
   chmod +x jkStuff/getMaskedScaffolds.csh
   nice ./jkStuff/getMaskedScaffolds.csh &
   # check a few sequences that they are correct
   # add name of scaffold to sequence fasta and cat together
   foreach c (Un NA)
      set dir = /cluster/data/danRer3
      foreach d (scaffoldsSoftMask scaffoldsHardMask)
         cd $dir/$c/$d
         foreach f (Zv5*)
           if ($d == "scaffoldsHardMask") then
              set b=$f:r
              set g=$b:r
              set sc=scaffoldMasked${c}.fa
           else
              set g=$f:r
              set sc=scaffold${c}.fa
           endif 
           perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:]+/>$g/" $f
           cat $f >> $sc
           rm *.bak
         end
         cp scaffold* $dir/$c/
      end
   end
   # check sizes of final FASTA file with all sequences. check a few
   # sequence files to see that they are correct - ok 
   # Add soft-masked scaffolds to the iservers for cluster runs 
   # (2005-08-15, hartera)
   ssh kkr1u00
   mkdir -p /iscratch/i/danRer3/scaffoldsSoftMask
   cd /cluster/data/danRer3
   foreach c (NA Un)
      foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
      cp -p $f /iscratch/i/danRer3/scaffoldsSoftMask
      end
   end
   /cluster/bin/iSync
   # Add soft-masked scaffolds to the bluearc for cluster runs 
   # (2005-08-19, hartera)
   ssh kkr1u00
   cd /cluster/data/danRer3/
   mkdir -p /cluster/bluearc/scratch/danRer3/scaffoldsSoftMask
   foreach c (NA Un)
      foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
         rsync -a --progress $f \
         /cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/
      end 
   end 

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2005-08-05, hartera)
    ssh kkstore02
    cd /cluster/data/danRer3
    #- Build the .zip files
    cat << '_EOF_' > jkStuff/gzipAll.csh
rm -rf gzip
mkdir gzip
# chrom AGP's
tar cvzf gzip/chromAgp.tar.gz [0-9A-Z]*/chr*.agp
# chrom RepeatMasker out files
tar cvzf gzip/chromOut.tar.gz */chr*.fa.out
# soft masked chrom fasta
tar cvzf gzip/chromFa.tar.gz */chr*.fa
# soft masked chrNA and chrUn scaffolds
tar cvzf gzip/scaffoldUnsFa.tar.gz NA/scaffoldNA.fa \
    Un/scaffoldUn.fa
# hard masked chrom fasta
tar cvzf gzip/chromFaMasked.tar.gz */chr*.fa.masked
# hard masked chrNA and chrUn scaffolds
tar cvzf gzip/scaffoldUnsFaMasked.tar.gz \
    NA/scaffoldMaskedNA.fa \
    Un/scaffoldMaskedUn.fa
# chrom TRF output files
cd bed/simpleRepeat
tar cvzf ../../gzip/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..

# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=danRer3 -native GenBank mrna \
        /cluster/data/danRer3/gzip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=danRer3 -xeno GenBank mrna \
        /cluster/data/danRer3/gzip/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=danRer3 -native refseq mrna \
/cluster/data/danRer3/gzip/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=danRer3 -native GenBank est \
/cluster/data/danRer3/gzip/est.fa
                                                                                
cd /cluster/data/danRer3/gzip
# gzip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
gzip mrna.fa
gzip xenoMrna.fa
gzip refMrna.fa
gzip est.fa
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x ./jkStuff/gzipAll.csh
    csh ./jkStuff/gzipAll.csh |& tee ./jkStuff/gzipAll.log
    #- Look at gzipAll.log to make sure all file lists look reasonable.
    # Make upstream files and copy the .gz files to
    # hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/danRer3/gzip
    # make upstream files for zebrafish RefSeq
    featureBits danRer3 refGene:upstream:1000 -fa=upstream1000.fa
    gzip upstream1000.fa
    featureBits danRer3 refGene:upstream:2000 -fa=upstream2000.fa
    gzip upstream2000.fa
    set gp = /usr/local/apache/htdocs/goldenPath/danRer3
    mkdir -p $gp/bigZips
    cp -p *.gz $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f (../*/chr*.fa)
       cp $f $gp/chromosomes
    end
    foreach c (NA Un)
       cd /cluster/data/danRer3/$c
       # copy the uncompressed scaffold FASTA: no earlier step gzips
       # scaffold${c}.fa in place, and the "foreach f (*.fa)" loop that
       # follows gzips every .fa file in $gp/chromosomes
       cp scaffold${c}.fa $gp/chromosomes
    end
    cd $gp/bigZips
    md5sum *.gz > md5sum.txt
    cd $gp/chromosomes
    # gzip the chromosome and scaffold FASTAs individually
    foreach f (*.fa)
      gzip $f
    end
    md5sum *.gz > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*
    # copy README.txt's from danRer2 and update

# MAKE NIB FILES AND 2BIT FILE FOR SOFT MASKED chrUn AND chrNA SCAFFOLDS
# (DONE, 2005-08-06, hartera)
# ADD chrUn AND chrNA SCAFFOLDS 2BIT FILE TO BLUEARC (DONE, 2005-08-19, hartera)

    ssh kkstore02
    cd /cluster/data/danRer3
    mkdir scaffoldsNAandUnNib
    # Build nib files, using the soft masking in the fa
    foreach c (NA Un)
       echo "Processing $c"
       foreach f ($c/scaffoldsSoftMask/Zv5*.fa)
         faToNib -softMask $f scaffoldsNAandUnNib/$f:t:r.nib
       end
    end
    # check correct number of nib files in directory: 14941
    # there are 14676 chrNA scaffolds and 265 chrUn scaffolds
    # copy chromosome 1-25 and chrNA and chrUn scaffolds nibs to a directory
    # on iscratch and iSync for use in cluster runs
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/chromandScafNib
    cp -p /cluster/data/danRer3/nib/chr[0-9]*.nib \
       /iscratch/i/danRer3/chromandScafNib
    foreach f (/cluster/data/danRer3/scaffoldsNAandUnNib/Zv5*.nib)
       cp -p $f /iscratch/i/danRer3/chromandScafNib
    end
    ssh kkstore02
    # make a 2 bit file of all the scaffolds for chrNA and chrUn
    # for blastz cluster runs
    cd /cluster/data/danRer3/
    cat NA/scaffoldNA.fa Un/scaffoldUn.fa > danRer3NAandUnScaffolds.fa
    grep '>' danRer3NAandUnScaffolds.fa | wc -l
    # 14941
    faToTwoBit danRer3NAandUnScaffolds.fa danRer3NAandUnScaf.2bit
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/NAandUnScafs
    cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
       /iscratch/i/danRer3/NAandUnScafs
    /cluster/bin/iSync
    
    # get sizes of scaffolds for the .len file used by blastz
    ssh kolossus
    mkdir -p /panasas/store/danRer3/NAandUnScafSizes
    cd /cluster/data/danRer3
cat << '_EOF_' > jkStuff/getNAandUnScafSizes.csh
#!/bin/csh -fe
foreach c (NA Un)
  set sizeDir=/panasas/store/danRer3/NAandUnScafSizes
  cd /cluster/data/danRer3/$c/scaffoldsSoftMask
  foreach f (Zv5*.fa)
     set g=$f:r
     faSize detailed=on $f >> $sizeDir/NAandUnScafs.sizes
  end
end
'_EOF_'
    chmod +x jkStuff/getNAandUnScafSizes.csh
    nice jkStuff/getNAandUnScafSizes.csh >& size.log &
    # took about 1 minute
    wc -l /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
    # 14941 /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
    # so correct number of scaffolds
    cp /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes \
       /cluster/data/danRer3
    # add 2 bit to bluearc for cluster runs (2005-08-19, hartera)
    ssh kkr1u00
    mkdir -p /cluster/bluearc/scratch/danRer3
    cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
       /cluster/bluearc/scratch/danRer3/

# BLASTZ SWAP FOR MOUSE (mm6) (DONE, 2005-08-10, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# REMAKE AXTNET AND COPY TO DOWNLOADS. REMAKE MAFNET (DONE, 2005-08-17, hartera)
# DROPPED THE CHAIN AND NET TABLES FROM HGWDEV AS THERE WERE 3 SETS OF 
# MOUSE ALIGNMENTS: mm6, mm7 and mm8 (DONE, 2006-03-28, hartera)
    ssh kkr1u00
    # blastz requires lineage-specific repeats
    # Treat all repeats as lineage-specific
    # if not done already, get lineage-specific repeats
    mkdir -p /iscratch/i/mm6/linSpecRep.notInZebrafish
    foreach f (/panasas/store/mm6/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/mm6/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
    end

    mkdir -p /iscratch/i/danRer3/linSpecRep.notInMouse
    foreach f (/iscratch/i/danRer3/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/danRer3/linSpecRep.notInMouse/$f:t:r:r.out.spec
    end
    /cluster/bin/iSync

    # NOTE: the "mouse/human/etc." lineage-specific repeat files are now in
    # /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
    # however, the files for chrNA and chrUn were missing, so I'm
    # adding them here.  (2005-12-19 kate)
    ssh kkstore02
    cd /cluster/data/danRer3
    cp -p Un/chrUn.fa.out  \
        /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrUn.out.spec
    cp -p NA/chrNA.fa.out \
        /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrNA.out.spec

    # do swap of mm6 vs. danRer3 chain and net alignments to 
    # create danRer3 vs. mm6. see makeMm6.doc for details.
    ssh kk
    cd /cluster/data/mm6/bed/blastz.danRer3
    mkdir -p /panasas/store/danRer3vsmm6Out
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -stop download -blastzOutRoot /panasas/store/danRer3vsmm6Out \
        -swap -chainMinScore=5000 >& doSwap.log &
    # Start: Aug 10 16:30
    # Finish: Aug 10 16:54
    # Blastz parameters are as for mm6 vs. danRer3 - see makeMm6.doc
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=1
  # do cleanup step and specify a different file server as can not 
  # access panasas from kkstore02.
  nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -continue cleanup -fileServer eieio \
        -blastzOutRoot /panasas/store/danRer3vsmm6Out \
        -swap -chainMinScore=5000 >& doSwapCleanUp.log &
  # make html files and trackDb.ra entry for chain and net tracks.
  # check README.txt for downloads.
# featureBits -chrom=chr1 danRer3 refGene:cds chainMm6Link -enrichment
# refGene:cds 0.688%, chainMm6Link 8.193%, both 0.543%, cover 78.94%, 
# enrich 9.64x
# featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.642%, chainMm5Link 4.499%, both 0.492%, cover 76.60%, 
# enrich 17.02x
# featureBits -chrom=chr2 danRer3 refGene:cds chainMm6Link -enrichment 
# refGene:cds 0.705%, chainMm6Link 8.219%, both 0.557%, cover 79.04%, 
# enrich 9.62x
# featureBits -chrom=chr2 danRer2 refGene:cds chainMm5Link -enrichment 
# refGene:cds 0.739%, chainMm5Link 4.539%, both 0.579%, cover 78.37%, 
# enrich 17.26x
# looks good, although enrichment is lower than for danRer2 and mm5, there are 
# more chains in the score <10000 range for danRer3 than for danRer2 but 
# this does not make up for all the extra chains in danRer3 over danRer2. 
# Maybe there are more high scoring alignments to the chrUn and chrNA chains 
# due to the scaffolds being used for the alignments.
# danRer3 has extra sequence compared to danRer2. However, danRer3 chr2 is
# 48.2 Mb and danRer2 chr2 is 52 Mb, so in this case the chrom is smaller.
# featureBits -chrom=chrNA danRer3 refGene:cds chainMm6Link -enrichment
# refGene:cds 0.449%, chainMm6Link 10.952%, both 0.350%, cover 77.94%, 
# enrich 7.12x
# featureBits -chrom=chrNA danRer2 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.499%, chainMm5Link 4.176%, both 0.372%, cover 74.60%, 
# enrich 17.86x

   # netToAxt was processing nets incorrectly so remake these with 
   # new version of netToAxt and transfer to downloads dir. 
   ssh kkstore02
   cd /cluster/data/danRer3/bed/blastz.mm6.swap
   rm -r axtNet
   # Make axtNet for download: one .axt per danRer3 seq.
   # remake noClass.net
   # Make nets("noClass", i.e. without rmsk/class stats which are added later):
   cd axtChain
chainPreNet danRer3.mm6.all.chain.gz /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout \
| chainNet stdin -minSpace=1 /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout /dev/null \
| netSyntenic stdin noClass.net

   # create net for each chrom again
   netSplit noClass.net net
   # also split up chains again
   mkdir chain
   zcat danRer3.mm6.all.chain.gz | chainSplit chain stdin
   ssh hgwdev
   cd /cluster/data/danRer3/bed/blastz.mm6.swap
   mkdir axtNet
   foreach f (axtChain/net/*.net)
     netToAxt $f axtChain/chain/$f:t:r.chain \
    /cluster/bluearc/danRer3/nib /panasas/store/mm6/nib stdout \
     | axtSort stdin stdout \
     | gzip -c > axtNet/$f:t:r.danRer3.mm6.net.axt.gz
   end

   # cleanup 
   ssh kkstore02 
   cd /cluster/data/danRer3/bed/blastz.mm6.swap/axtChain
   rm noClass.net
   rm -r net
   rm -r chain
   # remake mafNet from the new axtNet
   cd /cluster/data/danRer3/bed/blastz.mm6.swap
   rm -r mafNet
   # Make mafNet for multiz: one .maf per danRer3 seq.
   mkdir mafNet
   # Convert each per-sequence axtNet file to a gzipped MAF with danRer3 as
   # target and mm6 as query. The sizes files come from the mm6 run directory:
   # S2.len is passed as the target (danRer3) sizes and S1.len as the query
   # (mm6) sizes - presumably because danRer3 was SEQ2 in the original
   # mm6 vs. danRer3 run; see makeMm6.doc to confirm.
   # NOTE: a backslash must be the very last character on a continued line;
   # trailing whitespace after it breaks the continuation.
   foreach f (axtNet/*.danRer3.mm6.net.axt.gz)
     axtToMaf -tPrefix=danRer3. -qPrefix=mm6. $f \
    /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len \
     stdout \
     | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
   end

   # copy the new axtNet files to downloads and replace old ones
   ssh hgwdev
   rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
   cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6
   mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
   ln -s /cluster/data/danRer3/bed/blastz.mm6.swap/axtNet/*.axt.gz axtNet/
   # remake md5sum.txt 
   rm md5sum.txt
   md5sum *.gz */*.gz > md5sum.txt
   # Dropped mouse mm6 chain and net tables from hgwdev as there were 3 sets 
   # of mouse alignments for danRer3: mm6, mm7 and mm8 (hartera, 2006-03-29)
   hgsql -e 'drop table netMm6;' danRer3
   foreach c (`cat /cluster/data/danRer3/chrom.lst`)
      hgsql -e "drop table chr${c}_chainMm6;" danRer3
      hgsql -e "drop table chr${c}_chainMm6Link;" danRer3
   end

# BLASTZ FOR FUGU (fr1) (DONE, 2005-08-18, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# RECREATE DOWNLOADS AS THE FUGU DOWNLOADS DIRECTORY HAS BEEN DELETED
# (DONE, 2005-11-17, hartera)
  ssh kk
  mkdir /cluster/data/danRer3/bed/blastz.fr1.2005-08-13
  cd /cluster/data/danRer3/bed
  ln -s blastz.fr1.2005-08-13 blastz.fr1
# use parameters for fr1 in makeDanRer2.doc. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q 
# matrix as Fugu is quite distant from zebrafish. Blastz uses 
# lineage-specific repeats but there are none for these two species.
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3)
# soft-masked chroms, 1-25 and M
SEQ1_DIR=/iscratch/i/danRer3/chromNib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr1)
# soft-masked scaffolds in 2bit format
SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer3/bed/blastz.fr1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF
    cp /cluster/data/danRer3/chrom.sizes ./S1.len
    # make S2.len for fr1 scaffolds
    twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
    wc -l *.len
    # 28 S1.len
    # 20379 S2.len
    # make output directory
    mkdir -p /cluster/bluearc/danRer3vsfr1Out
    # do blastz and create chains for fr1 scaffolds on danRer3 chr1-25 and chrM 
    # chickenHumanTuned.gap scoring matrix is now used by default 
    # by axtChain.
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
       -stop chainMerge >& do.log &
    # Start: Aug 13 10:48 
    # Finish: Aug 13 13:35
    # then run the danRer3 NA and Un scaffolds against fugu scaffolds 
    mkdir NAandUnScaffolds
    cd NAandUnScaffolds
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3)
# soft-masked scaffolds for chrNA and chrUn in 2 bit format
SEQ1_DIR=/iscratch/i/danRer3/NAandUnScafs/danRer3NAandUnScaf.2bit
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr1)
# soft-masked scaffolds in 2bit format
SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF
    twoBitInfo /cluster/data/danRer3/danRer3NAandUnScaf.2bit ./S1.len
    # make S2.len for fr1 scaffolds
    twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
    wc -l *.len
    # 14941 S1.len
    # 20379 S2.len
    # make output directory
    mkdir -p /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds
    # do blastz and create chains for fr1 scaffolds on danRer3 
    # chrNA and chrUn scaffolds
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds \
       -chainMinScore=5000 -stop chainMerge >& do.log & 
    # Start: Aug 13 14:05
    # Finish: Aug 14 20:58
    # The log file says it is finished. chainSplit was not run as SEQ1 does
    # not have < 100 sequences. Need to do liftUp before running chainSplit.
    cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain/run
    # Lifting up chains:
    # need to lift these chains up to chrom level for Fugu for chrom run and 
    # for danRer3 and Fugu for the NA and Un scaffolds run.
    # first for Fugu in the danRer3 chrom run
    ssh kkstore02
    cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
    mkdir liftedChain
    foreach f (chain/*.chain)
       set c=$f:t:r
       echo $c
       liftUp -chainQ liftedChain/${c}.lifted.chain \
             /cluster/data/fr1/Un/lift/ordered.lft warn $f
    end
    # lift up for danRer3 scaffolds run.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain
    # first lift Fugu fr1 query, there are no split chains here as there
    # were not < 100 sequences for the target.
 zcat danRer3.fr1.all.chain.gz | liftUp -chainQ danRer3.fr1.liftedQall.chain \
          /cluster/data/fr1/Un/lift/ordered.lft warn stdin
    # then liftUp target coords for danRer3 
    liftUp danRer3.fr1.liftedQandTall.chain \
      /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
      warn danRer3.fr1.liftedQall.chain
    # gzip lifted danRer3.fr1 chain file
    gzip danRer3.fr1.liftedQandTall.chain
    # merge the chains from the danRer3 chrom run and the danRer3
    # NA and Un scaffolds run. chains are sorted by score and IDs are uniqued.
    cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
    mv danRer3.fr1.all.chain.gz danRer3.fr1.chroms.chain.gz
    set blastz=/cluster/data/danRer3/bed/blastz.fr1
    # copy over lifted chains for danRer3 scaffolds vs fr1 into ./liftedChain
    # so they are merged with the per-chrom lifted chains below.
    # NOTE: the backslash must be the last character on the line; a trailing
    # space after it stops ./liftedChain from being read as part of the cp.
    cp $blastz/NAandUnScaffolds/axtChain/danRer3.fr1.liftedQandTall.chain.gz \
       ./liftedChain
    gunzip ./liftedChain/*.gz
    nice chainMergeSort liftedChain/*.chain \
         | nice gzip -c > danRer3.fr1.all.chain.gz
    # then split up into chains again
    mv chain chromChain
    mkdir chain
    nice zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
    # then pick up the doBlastzChainNet.pl script at the net step.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/blastz.fr1
    cp DEF DEF.chroms
    # edit DEF file to include all the nib files for danRer3 and the 
    # nib file for the chrUn of Fugu fr1. Since all the coords have now
    # been lifted to chrom level, these are now needed.
    # SEQ1_DIR=/iscratch/i/danRer3/nib
    # SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
    # use kkr1u00 for computationally intensive steps as kolossus is down.
    # need to create new S2.len for whole chrUn for Fugu
    mv S2.len S2.scaffolds.len
    cp /cluster/data/fr1/chrom.sizes S2.len 
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
       -workhorse kkr1u00 -continue net >& doNet.log &
    # crashed at cleanup step when trying to access kkstore02 
    # The authenticity of host 'kkstore02 (128.114.50.155)' can't be
    # established.  Re-run from this step.
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
       -workhorse kkr1u00 -continue cleanup >& doNet2.log &
    # netToAxt was processing nets incorrectly so remake these with 
    # new version of netToAxt. 
    # and transfer to downloads dir.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/blastz.fr1
    rm -r axtNet
    # Make axtNet for download: one .axt per danRer3 seq.
    # remake noClass.net
    # Make nets("noClass", i.e. without rmsk/class stats which are added later):
    cd axtChain
    chainPreNet danRer3.fr1.all.chain.gz \
/cluster/data/danRer3/bed/blastz.fr1/S1.len /cluster/data/danRer3/bed/blastz.fr1/S2.len stdout \
| chainNet stdin -minSpace=1 /cluster/data/danRer3/bed/blastz.fr1/S1.len \
/cluster/data/danRer3/bed/blastz.fr1/S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
    # create net for each chrom again
    netSplit noClass.net net
    # also split up chains again
    mkdir chain
    zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.fr1
    mkdir axtNet
    foreach f (axtChain/net/*.net)
       netToAxt $f axtChain/chain/$f:t:r.chain \
       /cluster/bluearc/danRer3/nib /cluster/bluearc/fugu/fr1/chromNib stdout \
       | axtSort stdin stdout \
       | gzip -c > axtNet/$f:t:r.danRer3.fr1.net.axt.gz
    end
    # cleanup 
    ssh kkstore02 
    cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
    rm noClass.net
    rm -r net
    rm -r chain
    # remake mafNet from the new axtNet
    cd /cluster/data/danRer3/bed/blastz.fr1
    rm -r mafNet
    mkdir mafNet
    foreach f (axtNet/*.danRer3.fr1.net.axt.gz)
      axtToMaf -tPrefix=danRer3. -qPrefix=fr1. $f \
     /cluster/data/danRer3/bed/blastz.fr1/S1.len /cluster/data/danRer3/bed/blastz.fr1/S2.len \
     stdout \
     | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
    end

    # copy the new axtNet files to downloads and replace old ones
    ssh hgwdev
    rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
    cd /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1
    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
    ln -s /cluster/data/danRer3/bed/blastz.fr1/axtNet/*.axt.gz axtNet/
    # remake md5sum.txt 
    rm md5sum.txt
    md5sum *.gz */*.gz > md5sum.txt

    # Check README in downloads section and add a note about how the 
    # unordered chroms were split up into scaffolds.
    # Add trackDb entry for chain and net tracks to 
    # trackDb/zebrafish/danRer3/trackDb.ra 
    # Do swap to get danRer3 chains on Fugu, fr1 - see makeFr1.doc
# featureBits -chrom=chr2 danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.705%, chainFr1Link 8.960%, both 0.645%, cover 91.53%, 
# enrich 10.22x
# featureBits -chrom=chr2 danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.739%, chainFr1Link 4.537%, both 0.620%, cover 83.90%, 
# enrich 18.49x
# featureBits -chrom=chrNA danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.449%, chainFr1Link 7.129%, both 0.399%, cover 88.78%, 
# enrich 12.45x
# featureBits -chrom=chrNA danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.499%, chainFr1Link 3.901%, both 0.409%, cover 81.90%, 
# enrich 20.99x
    # Run directory files are already on /cluster/data. Remake downloads
    # for fugu alignments since these have been removed from
    # the downloads directory. (hartera, 2005-11-17)
    ssh hgwdev 
    # remake downloads using doBlastzChainNet.pl script
    cd /cluster/data/danRer3/bed/blastz.fr1
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
        -continue download -stop download `pwd`/DEF >& doDownload.log &
    # Check README in downloads section and add a note about how the 
    # unordered chroms were split up into scaffolds.

# VEGA
    # get transcripts in transcripts_coords from e-mail from Mario Caccamo
    # at Sanger 06/16/05.
    # also README for Vega
    ssh kkstore01
    mkdir -p /cluster/data/danRer3/bed/vegaGene
    cd /cluster/data/danRer3/bed/vegaGene 

# AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN (DONE, 2005-08-22, markd)
    # align with revised genbank process
    cd ~kent/src/hg/makeDb/genbank
    cvs update -d etc
    # edit etc/genbank.conf to add danRer3, had to run on pk, due to kk
    # being down.  Set temporary locations for server files

# danRer3 (zebrafish)
# Lift file partitions unplaced sequence pseudo-chroms (disabled)
danRer3.serverGenome = /cluster/data/danRer3/danRer3.2bit
##danRer3.clusterGenome = /iscratch/i/danRer3/danRer3.2bit
##danRer3.ooc = /iscratch/i/danRer3/danRer3_11.ooc
danRer3.clusterGenome = /san/sanvol1/scratch/danRer3/danRer3.2bit
danRer3.ooc = /san/sanvol1/scratch/danRer3/danRer3_11.ooc
##danRer3.align.unplacedChroms = chrNA chrUn
##danRer3.lift = /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
danRer3.lift = no
danRer3.downloadDir = danRer3
danRer3.mgcTables.default = full
danRer3.mgcTables.mgc = all

    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial danRer3 &

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad  danRer3&

    # enable daily alignment and update of hgwdev
    # (same genbank source directory as used for the initial setup above)
    cd ~kent/src/hg/makeDb/genbank
    cvs update -d etc
    # add danRer3 to these two files (edit by hand, they are not commands):
    #     etc/align.dbs
    #     etc/hgwdev.dbs
    cvs commit
    make etc-update

# TIGR GENE INDEX (DONE, 2005-08-24, hartera)
# Data from Razvan Sultana (rsultana@jimmy.harvard.edu or rsultana@tigr.org)
# Includes data for chr1-25 and chrM, NOT chrNA and chrUn. Asked for these
# on scaffolds and not on the virtual chroms - harder to generate. 
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/tigr
    cd /cluster/data/danRer3/bed/tigr
    wget --timestamping \
ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer3_chr1-25.tgz
    tar xvzf TGI*.tgz
    # this is data for just chr1-25 and chrM. Data for NA and Un are to follow.
    ls chr1_*
    # chr1_drosophTCs  chr1_g_gallusTCs  chr1_mouseTCs  chr1_zfishTCs
    # chr1_elegansTCs  chr1_humanTCs     chr1_ratTCs
    # so species are fly, chicken, mouse, zebrafish, C. elegans, human and rat
    foreach f (*g_gallus*)
       set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'`
       mv $f $f1
    end 

    foreach f (*drosoph*)
    set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'`
       mv $f $f1
    end

    # For each organism prefix, convert its per-chrom TC files to .gff:
    #  - tail -n +2 drops the first line of each input file
    #    ("tail +2" is obsolete syntax rejected by current tail versions)
    #  - the first substitution renames THC to TC, the second prefixes the
    #    TC id with the organism name, passed to perl via the environment
    #    variable O.
    foreach o (Dmelano chicken elegans human mouse rat zfish)
      echo $o
      setenv O $o
      foreach f (chr*_$o*s)
        tail -n +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ssh hgwdev
    cd /cluster/data/danRer3/bed/tigr
    hgsql danRer3 -e "drop table tigrGeneIndex"

    nice ldHgGene -exon=TC danRer3 tigrGeneIndex *.gff
    # Read 75388 transcripts in 288032 lines in 182 files
    # 75388 groups 26 seqs 1 sources 1 feature types
    # 75388 gene predictions
    checkTableCoords danRer3 tigrGeneIndex
    /cluster/bin/scripts/runGeneCheck /cluster/data/danRer3/bed/tigr
    # no CDS in these gene predictions so fix this:
    hgsql danRer3 -e "update tigrGeneIndex set cdsStart = txStart;"
    hgsql danRer3 -e "update tigrGeneIndex set cdsEnd = txEnd;"
    # compress all files
    gzip chr*

# MAKE Human Proteins track  (DONE 2005-09-21 braney)
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/blastDb
    cd /cluster/data/danRer3/blastDb
    cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list
    for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
    ln -s `cat list` .
    for i in *.fa
    do
	/projects/compbio/bin/i686/formatdb -i $i -p F
    done
    rm *.log *.fa list
    cd ..
    for i in `cat blastDb/chrom.list`; do cat  $i/chr*/*.lft  ; done > jkStuff/subChr.lft
    rm blastDb/chrom.list

    mkdir /cluster/data/danRer3/scaffoldBlastDb
    cd /cluster/data/danRer3/scaffoldBlastDb
    cat ../Un/scaffoldsSoftMask/*.fa ../NA/scaffoldsSoftMask/*.fa |  faSplit sequence stdin 500 scaf
    for i in *.fa
    do
	/projects/compbio/bin/i686/formatdb -i $i -p F
    done
    rm *.log *.fa

    mkdir -p /san/sanvol1/scratch/danRer3/comboBlastDb
    cd /cluster/data/danRer3/blastDb
    for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done
    cd /cluster/data/danRer3/scaffoldBlastDb
    for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done

    mkdir -p /cluster/data/danRer3/bed/tblastn.hg17KG
    cd /cluster/data/danRer3/bed/tblastn.hg17KG
    echo  /san/sanvol1/scratch/danRer3/comboBlastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > query.lst  

    # we want around 250000 jobs
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 37365/(250000/3539) = 528.938940

    mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa
    split -l 529 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done

    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
# blastSome <blastDb> <queryFa> <outPsl>
#   $1 - blast database to search (-d)
#   $2 - query fasta file (-i)
#   $3 - output psl file; only created when every step succeeds
# Runs tblastn, converts the result to psl, lifts the coordinates and
# checks the psl.  Exits 0 only when $3 has been written; on any failure
# all intermediate files are removed and the exit status is 1.
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
# Try each E-value cutoff in turn and stop at the first blastall run
# that exits successfully.
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        # three lifts in sequence: subChr.lft, then liftAll.lft on the
        # target side, then protein.lft on the query side (-pslQ)
        liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2
        liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4

        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3  $f.4
            exit 0
        fi
        # pslCheck failed: fall through to the cleanup below so that no
        # intermediates are left in /tmp and the job reports failure.
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/danRer3/bed/tblastn.hg17KG
    para create blastSpec
    para push

# Completed: 203170 of 203170 jobs
# CPU time in finished jobs:   17875092s  297918.20m  4965.30h  206.89d  0.567 y
# IO & Wait Time:               4092508s   68208.46m  1136.81h   47.37d  0.130 y
# Average job time:                 108s       1.80m     0.03h    0.00d
# Longest finished job:            1778s      29.63m     0.49h    0.02d
# Submission to last job:         64970s    1082.83m    18.05h    0.75d

    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para push

# Completed: 71 of 71 jobs
# CPU time in finished jobs:      89115s    1485.25m    24.75h    1.03d  0.003 y
# IO & Wait Time:                 35631s     593.85m     9.90h    0.41d  0.001 y
# Average job time:                1757s      29.28m     0.49h    0.02d
# Longest finished job:           15587s     259.78m     4.33h    0.18d
# Submission to last job:         23380s     389.67m     6.49h    0.27d

    ssh kkstore02
    cd /cluster/data/danRer3/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    liftUp -nohead -type=.psl stdout /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft carry u.*.psl m60* | \
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > /cluster/data/danRer3/bed/tblastn.hg17KG/blastHg17KG.psl
        
    ssh hgwdev
    cd /cluster/data/danRer3/bed/tblastn.hg17KG
    hgLoadPsl danRer3 blastHg17KG.psl
# 21063005 bases of 1630323462 (1.292%) in intersection

    # back to kkstore02
    rm -rf blastOut
# End tblastn

# BACENDS TRACK (DONE, 2005-09-28, hartera)
# Track display is very slow on large regions. Split all_bacends table by
# chromosome (DONE, 2006-04-19, hartera)
# REDO BACENDS FOR PAIRS, SINGLES, BAD PAIRS AND ALL BACENDS TABLES
# (see separate section on REDO BACENDS, 2006-05-01 - 2006-05-08, hartera)    
    ssh kkstore01
    # BAC ends sequence files provided by Mario Caccamo at Sanger
    # mc2@sanger.ac.uk
    mkdir -p /cluster/data/danRer3/bed/bacends
    cd /cluster/data/danRer3/bed/bacends

    wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/zf_bacends.fa.gz
    wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/DH_bacends.fa.gz
    wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/bacend_placement.txt.gz
    gunzip *.gz  
    # DH_bacends.fa are from the new library from a doubled haploid zebrafish
    # zf_bacends.fa are from the existing libraries used in danRer2 and danRer1
    # Several reads are present for some of the BAC ends and these have
    # names like p1kaSP6 or q1kaT7 for duplicated reads and p1kSP6w or q1kT7w
    # for multiple reads. In the trace repository, the most recent sequence
    # is stored and the 'a' or 'w' is dropped from the name.
    # for the DH_bacends.fa from the CHORI73 library, the names are 
    # experiment file name                  trace_name
    # ========================              ================
    # CHORI73_139g06.p1kSP6                 CHORI73_139G6SP6
    # CHORI73_165b21.q1kT7                  CHORI73_165B21T7
    # The trace name is that stored in the trace archive with leading zeros
    # dropped and ".p1k" or ".q1k" and lower case changed to upper. 
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    # check list of prefixes in zf_bacends.fa
    grep '>' zf_bacends.fa > zf.names
    perl -pi.bak -e 's/>//' zf.names
    perl -pi.bak -e 's/^([A-Za-z]+)[0-9]+.+/$1/' zf.names
    sort -u zf.names
    # bZ
    # zC
    # zK
    # zKp
    # in DH_bacends.fa, all are CHORI73_
    # For DH_bacends.fa, need to clean up, change names to Trace archive
    # format as above. Then choose most recent sequence, those that are bad
    # with lots of Ns should be removed at the alignment stage as they will 
    # not pass the Blat or pslReps criteria. 
   #  cat zf_bacends.fa DH_bacends.fa >> Zv5Bacends.fa
  #  faSize Zv5Bacends.fa
    # 680121953 bases (11160014 N's 668961939 real 668961939 upper 0 lower) 
    # in 729101 sequences in 1 files
    # Total size: mean 932.8 sd 242.6 min 26 (CHORI73_189m04.p1kSP6) 
    # max 5717 (CHORI73_255a17.q1kT7) median 882
    # N count: mean 15.3 sd 75.7
    # U count: mean 917.5 sd 242.2
    # L count: mean 0.0 sd 0.0
    wc -l *.fa
    # 6412741 DH_bacends.fa
    # 14700258 Zv5Bacends.fa
    # 8287517 zf_bacends.fa
    grep '>' DH_bacends.fa | wc -l
    # 304252
    grep '>' zf_bacends.fa | wc -l
    # 424849
    # for DH_bacends.fa there are replicate reads. If duplicate plates 
    # have been made (i.e. read names like ..p1kaSP6 or ..q1kaT7) or plates 
    # have been sequenced multiple times (i.e. read names like ..p1kSP6w or 
    # ..q1kT7w), the Sanger trace repository has the most recent read and 
    # dropped the 'a' or 'w' from the trace name.
    # some are not in the repository. They had bad quality reads with a lot 
    # of Ns or runs of the same base. These should be dropped in the 
    # alignment filtering. 
    
    # now download sequence files from Sanger ftp site - these are the 
    # ones from the Sanger sequence repository
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/bacends/seqs
    cd /cluster/data/danRer3/bed/bacends/seqs
    # get contents of ftp directory
    wget --timestamp \
 ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/
    # from index.html, grep lines with cloneEnd 
    grep "cloneEnd" index.html > cloneEnds
    awk 'BEGIN {FS="\""} {print "wget --timestamp",$2;}' cloneEnds \
        > getCloneEnds.csh
    chmod +x getCloneEnds.csh 
    cat getCloneEnds.csh
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025270298.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025273988.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025278580.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035416745.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035417824.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1040215846.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1048006071.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1114727127.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115222417.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115226483.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115230498.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115234585.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115238038.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115240957.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039514906.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039603426.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039604741.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1040231265.fasta.gz
    nice getCloneEnds.csh >& bac.log &
    # check log to see everything downloaded ok and then remove
    rm bac.log index.html
    # unzip files
    gunzip *.gz
    # cat together CHORI73 FASTA files
    cat sanger-zfish-CHORI*.fasta > CHORI73_bacends.fa
    grep '>' CHORI73_bacends.fa > CHORI73.names
    perl -pi.bak -e 's/>//' CHORI73.names
    sort CHORI73.names | uniq > CHORI73.names.sort
    wc -l CHORI73.names.sort
    # 265235 CHORI73.names.sort
    cat sanger-zfish-cloneEnd*.fasta > zfish_bacends.fa
    cat mpgeb-zfish-cloneEnd*.fasta > zfishmpgeb_bacends.fa
    grep '>' zfish_bacends.fa | wc -l
    # 164302
    grep '>' zfishmpgeb_bacends.fa | wc -l
    # 264633
    cp CHORI73.names.sort /cluster/data/danRer3/bed/bacends/
    # compared this list of sequence names for zf_bacends.fa and got more
    # sequences in the zf_bacends.fa - checked and some are in the trace
    # repository and some are not. 
    # for CHORI_73 there are 394 extra sequences in the downloaded file
    # and over 7000 in the original file sent by Mario. Just use the original 
    # file here as the sequences will probably be filtered out if there 
    # are bad alignments. get list of sequences for which there are more than 
    # 2 ends. Some end sequences have multiple reads. 
    cd /cluster/data/danRer3/bed/bacends
    # look at file of CHORI73_ sequences sent by Mario at Sanger:
    grep '>' DH_bacends.fa > DH.names
    perl -pi.bak -e 's/>//' DH.names 
    perl -pi.bak -e 's/(CHORI73_[0-9a-z]+)\.[a-z0-9]+.+/$1/' DH.names
    sort DH.names | uniq -c | sort -nr > DH.names.counts 
    awk '{if ($1 > 2) print $2;}' DH.names.counts > DH.names.morethan2
    # translate to upper case and remove leading zeros
    cat DH.names.morethan2 | tr '[a-z]' '[A-Z]' > DH.names.morethan2.upper
    # remove leading 0
    perl -pi.bak -e 's/(CHORI73_[0-9]+[A-Z])0([0-9]+)/$1$2/' \
        DH.names.morethan2.upper
    sort DH.names.morethan2.upper | uniq > DH.names.morethan2.upper.sort
    wc -l *.sort
    # 265235 CHORI73.names.sort
    # 6020 DH.names.morethan2.upper.sort
    comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort | wc
    # 5299
    # so 721 are not in this list so they are probably not in the repository
    # but align these anyway.
    # for those that are then use the versions in CHORI73.names
    comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort \
        > CHORI73.names.touse
    comm -13 CHORI73.names.sort DH.names.morethan2.upper.sort \
        > DHmorethan2.DHonly
    awk '{if ($1 <= 2) print $2;}' DH.names.counts > DH.names.2orless
    # this is list of sequences to get from DH_bacends.fa
    # need to back translate the list in DHmorethan2.DHonly
    cat DHmorethan2.DHonly | tr '[A-Z]' '[a-z]' > DHtmp
    sed -e 's/chori/CHORI/' DHtmp > DHmorethan2.DHonly.format
    # need to put leading zeros back and "." at the end to help
    # pattern matching with grep.
cat << '_EOF_' > addZeros.pl
#!/usr/bin/perl -w
# addZeros.pl <namesFile>
# Reads clone names of the form CHORI73_<digits><lowercase letter><digits>
# and prints one grep pattern per name: a single-digit trailing number gets
# its leading zero restored, and an escaped "." is appended to every name.
use strict;

my ($file);
$file = $ARGV[0];

open (FILE, $file) || die "Can not open $file: $!\n";

while (<FILE>)
{
chomp;
my ($l,$id);
$l = $_;
# trailing number already has two or more digits: only append "\."
if ($l =~ /^CHORI73_[0-9]+[a-z][0-9]{2,}/)
   {
   print "$l\\.\n";
   }
# single-digit trailing number: insert the leading zero, then append "\."
elsif($l =~ /^(CHORI73_[0-9]+[a-z])([0-9]{1})/)
  {
  $id = $1 . "0" . $2 . "\\.";
  print "$id\n";
  }
# lines matching neither pattern produce no output
}
close FILE;
'_EOF_'
    chmod +x addZeros.pl
    perl addZeros.pl DHmorethan2.DHonly.format > DHmorethan2.DHonly.format2
    wc -l DHmorethan2.DHonly*
    # 721 DHmorethan2.DHonly
    # 721 DHmorethan2.DHonly.format
    # 721 DHmorethan2.DHonly.format2
    # need to get full sequence names
    grep '>' DH_bacends.fa > DHBacs.fullnames
    perl -pi.bak -e 's/>//' DHBacs.fullnames
    perl -pi.bak -e 's/(CHORI73_[0-9a-z]+\.[a-z0-9A-Z]+) bases.+/$1/' \
         DHBacs.fullnames
    grep -f DHmorethan2.DHonly.format2 DHBacs.fullnames \
            > DHmorethan2.DHonly.fullnames
    wc -l DHmorethan2.DHonly.fullnames
    # 2352 DHmorethan2.DHonly.fullnames
    sort DHmorethan2.DHonly.fullnames > DHmorethan2.DHonly.fullnames.sort
    # do the same for clones with 2 or fewer sequences (DH.names.2orless)
    # to get the full names
cat << '_EOF_' > getFullNames.pl
#!/usr/bin/perl -w
# getFullNames.pl - list the full sequence names that belong to a set of
# clone names.
# Usage: perl getFullNames.pl <full names file> <clone names file>
#   <full names file>   one full sequence name per line; the part before
#                       the first "." is taken as the clone name
#   <clone names file>  one clone name per line
# Output (STDOUT): every full name whose clone name is in the second file,
# grouped in the order of the clone names file.
use strict;

my ($file, $patterns, %idsHash);
$file = $ARGV[0];
$patterns = $ARGV[1];
open (my $fileFh, '<', $file) || die "Can not open $file: $!\n";
open (my $patFh, '<', $patterns) || die "Can not open $patterns: $!\n";

# index every full name by the clone name in front of the first "."
while (my $l = <$fileFh>)
{
chomp $l;
if ($l =~ /^(CHORI73_[0-9a-z]+)\./)
   {
   push(@{$idsHash{$1}}, $l);
   }
}
close $fileFh;

# print all full names stored for each requested clone name
while (my $line = <$patFh>)
{
chomp $line;
next unless exists($idsHash{$line});
foreach my $i (@{$idsHash{$line}})
   {
   print "$i\n";
   }
}
close $patFh;
'_EOF_'
    chmod +x getFullNames.pl
    perl getFullNames.pl DHBacs.fullnames DH.names.2orless \
         > DH.fullnames.2orless
   
    # do the same for CHORI73.names.touse to get full names
    awk '{print $1"SP6"}' CHORI73.names.touse > CHORI73.namesSP6.touse
    awk '{print $1"T7"}' CHORI73.names.touse > CHORI73.namesT7.touse
    cat CHORI73.namesSP6.touse CHORI73.namesT7.touse \
        > CHORI73.namesSP6andT7.touse
    wc -l CHORI73.names*
    # 265235 CHORI73.names.sort
    # 10598 CHORI73.namesSP6andT7.touse
    # 5299 CHORI73.namesSP6.touse
    # 5299 CHORI73.namesT7.touse
    # 5299 CHORI73.names.touse

    grep '>' CHORI73_bacends.fa > CHORI73.fullnames
    perl -pi.bak -e 's/>//' CHORI73.fullnames
    grep -f CHORI73.namesSP6andT7.touse CHORI73.fullnames \
         > CHORI73.fullnames.touse
    # so get all the sequence records together in one file
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    mkdir bacSeqs
    # get all sequences from DH_bacends.fa that have 2 or less for the clone.
    # This might include cases where there are duplicate reads for one end
    # only but these will go into the singles track anyway.
    faSomeRecords DH_bacends.fa DH.fullnames.2orless ./bacSeqs/DHBacs.2orless.fa
    # get all sequences with more than 2 sequences for that clone but
    # with no sequence in the new downloaded BAC ends sequence file that 
    # has only one sequence for each BAC end.
    faSomeRecords DH_bacends.fa DHmorethan2.DHonly.fullnames.sort \
         ./bacSeqs/DHBacs.2ormore.orig.fa
    # get all sequences for BAC ends where there are more than 2 reads for
    # the ends of one clone, so there are replicate reads for at least one end.
    # use the sequence in the downloaded CHORI73 set of clone ends for these.
    faSomeRecords CHORI73_bacends.fa CHORI73.fullnames.touse \
         ./bacSeqs/CHORI73.fromDH.morethan2.fa
    cd bacSeqs
    # translate to upper case and remove leading zeros
    cat DHBacs.2orless.fa | tr '[a-z]' '[A-Z]' > DHBacs.2orless.format.fa
    cat DHBacs.2ormore.orig.fa | tr '[a-z]' '[A-Z]' \
        > DHBacs.2ormore.orig.format.fa
    # remove leading 0 and just use name as FASTA header
    # need to leave in a or w as in p1kaSP6 or q1kaT7 or p1kSP6w or q1kT7w
    # these will distinguish replicate reads from the same sequence and will
    # be removed later when the best alignment is selected.
    perl -pi.bak -e \
    's/(CHORI73_[0-9]+[A-Z]{1})0?([0-9]+)\.(P1K|Q1K)(ASP6|SP6|SP6W|AT7|T7|T7W) BASES.+/$1$2$4/' \
        DHBacs*format.fa
    cat CHORI73.*.fa DHBacs*.format.fa > CHORI73BACends.fa
    grep '>' CHORI73BACends.fa | wc -l
    # 295722
    # then combine these with the zf_bacends.fa from Sanger which contain
    # the rest of the BAC end sequences.
    cat ../zf_bacends.fa CHORI73BACends.fa > Zv5BACends.fa
    grep '>' Zv5BACends.fa | wc -l
    # 720571
    faSize Zv5BACends.fa 
    # 674252474 bases (10674972 N's 663577502 real 663577502 upper 0 lower) in 
    # 720571 sequences in 1 files Total size: mean 935.7 sd 239.8 
    # min 26 (CHORI73_189M4SP6) max 5403 (zC259G13.zb) median 882
    # N count: mean 14.8 sd 72.4
    # U count: mean 920.9 sd 239.6
    # L count: mean 0.0 sd 0.0
    # check Zv5BACends.fa has unique sequence names
    grep '>' Zv5BACends.fa | sed 's/>//' > names
    sort names | uniq -c | sort -nr > names.count
    # all unique names so cleanup
    rm names names.count *.bak
    # Now the BAC end sequences file has been made, align the sequences 
    # to danRer3 using Blat.

    ssh pk
    # problems running these on kk using input from bluearc - slowed down
    # kkstore02 with heavy load. So move everything to the san as it 
    # scales better than the bluearc especially from the pk. run directory 
    # is on san also.  
    cd /cluster/data/danRer3/bed/bacends/bacSeqs
    # first split up bacends sequence and add to directory on the san
    mkdir -p /san/sanvol1/scratch/danRer3/bacends/Zv5bacends
    # split up sequence for cluster runs
    faSplit sequence Zv5BACends.fa 20 \
            /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/bacends
    # get all the chrom contig files onto the san
    mkdir -p /san/sanvol1/scratch/danRer3/trfFaChroms
    rsync -a --progress /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa \
         /san/sanvol1/scratch/danRer3/trfFaChroms/

    cd /cluster/data/danRer3/bed/bacends
    mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsRun
    ln -s /san/sanvol1/scratch/danRer3/bacends/chromsRun
    # make directory for output, do not have output going to /cluster/data dir
    # as it is very large.
    mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsPsl
    ln -s /san/sanvol1/scratch/danRer3/bacends/chromsPsl
    # also copy over the 11.ooc file for danRer3 if not there already
    cp -p /cluster/bluearc/danRer3/danRer3_11.ooc \
       /san/sanvol1/scratch/danRer3/ 
    # make input file lists
    cd /cluster/data/danRer3/bed/bacends/chromsRun
    ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
    # do blat just for chr1-25 and chrM
    ls -1S /san/sanvol1/scratch/danRer3/trfFaChroms/*.fa > seqs.lst
    # 64 bit blat used for pk. This version of blat recently had a bug fix
    # so should give the same result as i386 blat on kk. use absolute path for
    # output dir rather than symlink as that would increase I/O.
# use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/chromsPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
   # << this line keeps emacs coloring happy
    /cluster/bin/i386/gensub2 seqs.lst bacends.lst template jobList
    /cluster/bin/i386/para create jobList
    /cluster/bin/i386/para try, check, push, check, ...
# /cluster/bin/i386/para time
# Completed: 4160 of 4160 jobs
# CPU time in finished jobs:     746878s   12447.96m   207.47h    8.64d  0.024 y
# IO & Wait Time:                 11166s     186.11m     3.10h    0.13d  0.000 y
# Average job time:                 182s       3.04m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             915s      15.25m     0.25h    0.01d
# Submission to last job:          5100s      85.00m     1.42h    0.06d

    # run jobs to do blat of NA and Un scaffolds vs BAC end sequences
    ssh pk
    # copy scaffolds to the san
    mkdir -p /san/sanvol1/scratch/danRer3/scaffoldsSoftMask
    foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
      rsync -a --progress $f /san/sanvol1/scratch/danRer3/scaffoldsSoftMask/
    end 
    cd /cluster/data/danRer3/bed/bacends
    mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
    ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
    # make directory for output, do not have output going to /cluster/data dir
    # as it is very large.
    mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
    ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
    # make input file lists
    cd /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun
    ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
    # do blat just for NA and Un scaffolds
    # The loop below appends to scafs.lst, so remove any list left over from
    # an earlier attempt first; otherwise every scaffold would be listed twice.
    rm -f scafs.lst
    foreach f (/san/sanvol1/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
       echo $f >> scafs.lst
    end
    # 64 bit blat used for pk. This version of blat recently had a bug fix
    # so should give the same result as i386 blat on kk. use absolute path for
    # output dir rather than symlink as that would increase I/O.
# use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
   # << this line keeps emacs coloring happy
    /cluster/bin/i386/gensub2 scafs.lst bacends.lst template jobList
    /cluster/bin/i386/para create jobList
    /cluster/bin/i386/para try, check, push, check, ...
# para time
# Completed: 298820 of 298820 jobs
# CPU time in finished jobs:    1232495s   20541.58m   342.36h   14.26d  0.039 y
# IO & Wait Time:                923511s   15391.85m   256.53h   10.69d  0.029 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1008s      16.80m     0.28h    0.01d
# Submission to last job:         37494s     624.90m    10.41h    0.43d
 
    ssh kolossus
    cd /cluster/data/danRer3/bed/bacends
    # need to sort psl files, filter and liftUp
    # first do the chr1-25 and chrM alignments
    nice pslSort dirs rawChroms.psl tmp chromsPsl >& chromSort.log
    # Time taken: 2 hours 42 minutes 
    pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
            rawChroms.psl bacEndsChroms.psl /dev/null >& pslRepsChroms.log
    # Took 19 minutes
    # then lift up NA and Un scaffolds to chrom level
    nice pslSort dirs rawNAandUn.psl tmp scaffoldsNAandUnPsl \
         >& scafsNAandUnSort.log
    # took 1 hour 50 minutes
    pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
            rawNAandUn.psl  bacNAandUnScafs.psl /dev/null >& pslRepsNAandUn.log
    # took 18 minutes
    # lift results:
    liftUp bacEnds.liftedChroms.psl /cluster/data/danRer3/jkStuff/liftAll.lft \
           warn bacEndsChroms.psl
    liftUp bacEnds.liftedNAandUn.psl \
      /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
           warn bacNAandUnScafs.psl
     
    # sort and merge these files
    mkdir liftedPsl
    mv *.lifted*.psl ./liftedPsl/
    nice pslSort dirs bacEnds.psl tmp1 liftedPsl >& pslSortAll.log
    # Took 4 minutes
    pslCheck bacEnds.psl >& pslCheck.log
    # there are 520 BAC ends with overlapping block errors - 1385 alignments

    # NOTE(review): the commands below read raw.psl, which none of the steps
    # shown above create (the pslSort outputs are rawChroms.psl and
    # rawNAandUn.psl), and the first pslReps writes bacEnds.psl, the same name
    # as the merged file produced by the pslSort of liftedPsl above. The FASTA
    # name Zv5Bacends.fa also differs in case from Zv5BACends.fa, and the
    # count recorded here (729101) differs from the 720571 recorded when that
    # file was built. This looks like the record of an earlier comparison of
    # pslReps settings - confirm before re-running any of it.
    # use pslReps parameters used for mm6
    pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl \
            bacEnds.psl /dev/null
    # those for hg17
    pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
            raw.psl  bacEnds2.psl /dev/null
    # see how many align in each case
    # (column 10 of each psl line is counted per distinct value; the numbers
    # below are the line counts of the resulting lists)
    awk '{print $10;}' bacEnds.psl | sort | uniq -c \
        | sort -nr > bacEnds.qNames.sort
    awk '{print $10;}' bacEnds2.psl | sort | uniq -c \
        | sort -nr > bacEnds2.qNames.sort
    wc -l bacEnds*qNames.sort
    # 549086 bacEnds2.qNames.sort
    # 519773 bacEnds.qNames.sort
    grep '>' Zv5Bacends.fa | wc -l
    # 729101
    # so 71% of sequences aligned in bacEnds.psl
    # and 75% of sequences aligned in bacEnds2.psl
    # use textHistogram to look at number of alignments
    # bacEnds.psl has 374002 with only 1 alignment
    # bacEnds2.psl has 362364 with only 1 alignment
    # bacEnds.psl - most alignments for 1 sequence is 515, 
    # for bacEnds2.psl - most alignments for 1 sequence is 1272
    # when these are split up into bacEndPairs, bacEndPairsBad and 
    # bacEndSingles, the number of alignments per sequence is reduced
    # so use bacEnds2.psl
     
     # Process BAC end alignments
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/bacends/pairs
     mkdir -p /cluster/data/danRer3/bed/bacends/bacends.1
     # Downloaded BAC ends accessions from SRS
     # Go to http://srs.sanger.ac.uk
     # Go to "Select Databanks" tab and check DBGSS
     # Go to "Query Form" tab
     # Select Organism as field and enter "Danio*" as search term
     # Select AllText as field and enter "*Sanger*" as search term
     # Select AllText as field and enter "T7|SP6" as search term
     # Select a view
     # Download as BACEndAccs.txt to the bacends.1 directory 
     cd /cluster/data/danRer3/bed/bacends/bacends.1
     cp /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/getBacEndInfo.pl .
     # Build one list per end type from the tab-separated BACEndAccs.txt:
     # when column 7 mentions the end type, write column 3 with the end type
     # appended, a tab, and column 4. Then join the two lists.
     awk -F'\t' -v OFS='\t' '$7 ~ /SP6/ {print $3"SP6",$4}' \
         BACEndAccs.txt > BACEndSP6.accs
     awk -F'\t' -v OFS='\t' '$7 ~ /T7/ {print $3"T7",$4}' \
         BACEndAccs.txt > BACEndT7.accs
     cat BACEndSP6.accs BACEndT7.accs > BACEndExtNames.accs
     # change external names to internal names
cat << '_EOF_' > extToIntNames.pl
#!/usr/bin/perl -w
# extToIntNames.pl - filter from STDIN to STDOUT that replaces the external
# clone library prefix found on a line with the internal prefix listed in
# %cloneHash. Lines that contain no prefix with a mapping are not printed.
use strict;

my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {
   CH211-  zC
   DKEY-   zK
   DKEYP-  zKp
   RP71-   bZ
   BUSM1-  dZ
   CH73-   CHORI73_
};

while (<STDIN>) 
{
my ($l, $c, $intPref);
$l = $_;
foreach $c (@clonePrefixes)
   {
   if ($l =~ /\Q$c\E/)
       {
       # Look the prefix up in upper case: the hash only has upper-case
       # keys, so without this a line with "ch211-" was silently dropped.
       # NOTE(review): "CHORI-" has no entry in %cloneHash, so such lines
       # are still not printed - confirm that this is intended.
       if (exists($cloneHash{uc($c)}))
          {
          # get internal name
          $intPref = $cloneHash{uc($c)};
          $l =~ s/\Q$c\E/$intPref/; 
          print $l;
          # prefix replaced: stop here so the line is printed only once
          last;
          }
       }
   }
}
'_EOF_'
     chmod +x extToIntNames.pl
     perl extToIntNames.pl < BACEndExtNames.accs > BACEnd_accessions.txt
     # get BAC clone accessions from Genbank. They can be obtained from EMBL
     # through SRS but harder to separate the BAC end accessions from the
     # BAC clone accessions:
     # go to http://www.ncbi.nlm.nih.gov
     # 1) select "Nucleotide" as the search database.
     # 2) Search string: Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL]
     # Those sequences with "genomic survey" in the title appear to be 
     # BAC clone end accessions. Here, we want only BAC clone accessions.
     # 3) There are 628991 sequences (2005-09-19). Select File from Send To 
     # pulldown menu and name file "BACClones.gbAccs.txt".
     # create script to parse out clone ID and the accession:
cat << '_EOF_' > getAccsandIdsFromGb.pl
#!/usr/bin/perl -w
# getAccsandIdsFromGb.pl - read a listing on STDIN and print one
# tab-separated line per clone found:
#    internal clone ID, accession, external clone ID (upper case)
# The accession is taken from a line of the form "<number>: <ACCESSION>";
# the clone ID is taken from a later line containing "clone" followed by
# a name that starts with one of @clonePrefixes.
use strict;

my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {  
   CH211-  zC
   DKEY-   zK
   DKEYP-  zKp 
   RP71-   bZ
   BUSM1-  dZ
   CH73-   CHORI73_
};

my $found = "FALSE";
my $acc = "";
my $id = "";
while (<STDIN>)
{
my ($l, $intId, $extPref, $intPref);
$intPref = "";
$extPref = "";

chomp;
$l = $_;
if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/)
   {
   # first line of a record: remember its accession for the clone line
   $acc = $1;
   $found = "FALSE";
   }
elsif ($l =~ /clone/)
   {
   $id = "";
   # check for clone name in this line
   foreach my $p (@clonePrefixes)
      {
      if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/)
         {
         $id = $1;
         # translate to upper case
         $id =~ tr/a-z/A-Z/;
         # Upper-case the prefix too. $id is upper case by now and the keys
         # of %cloneHash are upper case, so a "ch211-" prefix would otherwise
         # miss both the hash lookup and the substitution below, and the
         # external name would be printed as the internal ID.
         $extPref = uc($p);
         $found = "TRUE";
         }
      }
   }
if ($found eq "TRUE")
   {
   # NOTE(review): "CHORI-" has no entry in %cloneHash; $intPref then stays
   # empty and the prefix is simply removed from the internal ID - confirm
   # that this is intended.
   if (exists($cloneHash{$extPref}))
      {
      $intPref = $cloneHash{$extPref};
      }
   $intId = $id;
   # translate this to internal ID
   $intId =~ s/$extPref/$intPref/;
   print "$intId\t$acc\t$id\n";
   # print at most once per clone line
   $found = "FALSE";
   }
}
'_EOF_'
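     # The next two commands are commented out; they appear to record an
     # earlier approach using a script called getAccsandIds.pl:
     # chmod +x getAccsandIds.pl
     # perl getAccsandIds.pl < BACClones.accs.txt > BACClonesIdsandAccs.txt
     # That took 36 minutes. Its output file had the internal BAC clone name,
     # the accession and ... (this sentence is cut off in the original notes).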
     chmod +x getAccsandIdsFromGb.pl
     # CHORI73_ is a new prefix, this is for the internal name of 
     # BAC clones from the CHORI73 doubled haploid library.
     nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
          > BACClonesIdsandAccs.txt &
     
     # Took under 3 minutes. The output file here has internal BAC clone name, 
     # Genbank accession and external BAC clone name.
     grep '>' ../bacSeqs/Zv5BACends.fa | sed -e 's/>//' > allBacEnds.names
     # modify getBacEndInfo.pl for these sequence names so rename as
     # getBacEndInfov2.pl 
     # need to make pairs file
     perl getBacEndInfov2.pl allBacEnds.names BACEnd_accessions.txt \
          > bacEnds.log
     # check that all the BAC end sequence names from allBacEnds.names
     # appear in either bacEndPairs.txt or bacEndSingles.txt
     wc -l bacEnd*
     # 159319 bacEndAccs.aliases
     # 333356 bacEndPairs.txt
     # 19788 bacEndSingles.txt
     # bacEndAccs.aliases contains sequence read names and their
     # Genbank accessions. 
     awk 'BEGIN {OFS="\n"} {print $1, $2;}' bacEndPairs.txt \
         | sed -e 's/,/\n/g' > bacPrs.names
     awk '{print $1;}' bacEndSingles.txt | sed -e 's/,/\n/g' > bacSingles.names
     cat bacPrs.names bacSingles.names | sort > bacEnds.names.sort
     sort allBacEnds.names > allBacEnds.names.sort
     wc -l *.sort
     # 720571 allBacEnds.names.sort
     # 720571 bacEnds.names.sort
     # so all the BAC ends from the FASTA file have been accounted for either
     # as pairs or singles.
     # process BAC end alignments
     cd /cluster/data/danRer3/bed/bacends/pairs
     set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
     # try different parameters
      /cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
     -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
     -mismatch -verbose ../bacEnds.psl $bacDir/bacEndPairs.txt \
     all_bacends bacEnds
     wc -l bacEnds.*
     # 426 bacEnds.long
     # 14875 bacEnds.mismatch
     # 229139 bacEnds.orphan
     # 164778 bacEnds.pairs
     # 0 bacEnds.short
     # 100 bacEnds.slop
     # 409318 total
     # there are less slop (190) more pairs (90967) and orphans (229139)
     # and less mismatch (18083) and less long (980) than for danRer2
     # size of sequence should be 100-200 kb but since assembly is not 
     # complete there are misassemblies so the distance between pairs could be
     # larger. If -max=200000 -slopval=10000 -hardMax=500000 is used, then
     # there are 18377 bacEnds.long, 250243 bacEnds.orphan, 
     # and 131209 bacEnds.pairs and over 3000 less just drop out.413243 total
     # try -max=300000 -slopval=10000 -hardMax=500000
     # wc -l bacEnds.*
     # 3343 bacEnds.long
     # 11731 bacEnds.mismatch
     # 243500 bacEnds.orphan
     # 154981 bacEnds.pairs
     #  0 bacEnds.short
     # 509 bacEnds.slop
     # 414064 total
     # try -min=25000 -max=350000 -slopval=10000 -hardMax=500000 as for human
     # wc -l bacEnds.*
     # 1725 bacEnds.long
     # 12081 bacEnds.mismatch
     # 242235 bacEnds.orphan
     # 156444 bacEnds.pairs
     # 616 bacEnds.short
     # 1017 bacEnds.slop
     # 414118 total
     # this would be good to use but for direct comparison between danRer2 
     # and danRer3, it would be good to use the same parameters as before
     # so stick with those above: 
     # -min=2000 -max=650000 -slopval=10000 -hardMax=800000 
     # create header required by "rdb" tools

     # NOTE: there are overlapping BAC clone ends for danRer3. Some of these
     # are only a few kb apart (from beginning of one to end of the other)
     # so use stricter pslPairs parameters as for human and mouse.
     ssh kkstore02
     # -p: the pairs directory was already made before the first pslPairs
     # run above, and a plain mkdir fails when the directory exists.
     mkdir -p /cluster/data/danRer3/bed/bacends/pairs
     cd /cluster/data/danRer3/bed/bacends/pairs
     set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
     # Same inputs as the first run, but with the tighter -min, -max and
     # -hardMax values; the bacEnds.* files are then counted with wc.
     /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
        -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
        -mismatch -verbose ../bacEnds.psl \
        $bacDir/bacEndPairs.txt all_bacends bacEnds
     wc -l bacEnds.*

     echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
          > ../header
     echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
     # make pairs bed file
     cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
               | headchg -del > bacEndPairs.bed
     # also need to process bacEndSingles.txt into a database table
     # for singles in bacEndSingles.txt, create a dummy file where they
     # are given zJA11B12T7 as dummy sequence pair. If the single is a forward
     # sequence, put the dummy sequence in the second column, if the single is
     # a reverse sequence put in first column. use a perl script to do this.
     cd /cluster/data/danRer3/bed/bacends
     set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
     mkdir singles
     cd singles
     cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
     perl formatSingles.pl $bacDir/bacEndSingles.txt > \
                           $bacDir/bacEndSingles.format
     # then run pslPairs on this formatted file
     /cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
     -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
     -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
     all_bacends bacEnds
     wc -l bacEnds.*
     # 0 bacEnds.long
     # 0 bacEnds.mismatch
     # 11439 bacEnds.orphan
     # 0 bacEnds.pairs
     # 0 bacEnds.short
     # 0 bacEnds.slop
     # there are 11439 orphans here and 229139 from pair analysis so 
     # a total of 240578 orphans
     cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
     wc -l bacEnds.singles
     # 240578 bacEnds.singles
     # make singles bed file
     cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
                  | headchg -del > bacEndSingles.bed
     cp bacEndSingles.bed ../pairs
     cd ../pairs
     # all slop, short, long, mismatch and orphan pairs go into
     # bacEndPairsBad.bed. The orphans are also in bacEndSingles, so a second
     # file without them (bacEndPairsBadNoOrphans.bed) is made further down.
     cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
        bacEnds.orphan | row score ge 300 | sorttbl chr start \
        | headchg -del > bacEndPairsBad.bed
     # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans 
     # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans

     cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
        | row score ge 300 | sorttbl chr start \
        | headchg -del > bacEndPairsBadNoOrphans.bed
     # use extractPslLoad later to get all_bacends.psl for database

     # There are rows where the alignments were the same but the lfNames are 
     # different. This is due to the presence of multiple reads for the 
     # same BAC end sequence. Sometimes they are slightly different lengths 
     # so the alignments are a little different. It would be good to 
     # consolidate all of these. Firstly, the identical rows were merged into 
     # one with a list of all the lfNames corresponding to that alignment.
     
     ssh kkstore02
     #echo "create database bacsDr3_rah;" | hgsql danRer3
     cd /cluster/data/danRer3/bed/bacends/pairs
     #hgLoadBed bacsDr3_rah bacEndPairs bacEndPairs.bed \
    #       -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
     # Loaded 163174 elements of size 11
     # create a bacEndSingles table like bacEndPairs if not created already
     # hgLoadBed bacsDr3_rah bacEndSingles bacEndSingles.bed \
       #          -sqlTable=../singles/bacEndSingles.sql -notItemRgb
     # Loaded 212775 elements of size 11
     # NOTE - this track isn't pushed to RR, just used for assembly QA
     # Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file
    # hgLoadBed bacsDr3_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \
     #      -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
     # Loaded 15169 elements of size 11
     # Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
     # name, different lfNames and same alignments.
     mkdir -p /cluster/data/danRer3/bed/bacends/duplicates
     cd /cluster/data/danRer3/bed/bacends/duplicates
     mkdir -p /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
     cd /cluster/data/danRer3/bed/bacends/duplicates
     ln -s /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
     # write program to do this for linked feature series (lfs) which
     # is the type of data structure used for BAC ends.
     # Need a bed file sorted by chrom and chromStart 
     cd overlapRun
     # Sort on column 1 as text, then on column 2 as a number. A key of
     # -k1,2 would compare the start coordinate as text, so that for example
     # 100000 would sort before 2000. Rows with equal keys are still ordered
     # by the rest of the line, so identical rows stay next to each other.
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
        sort -k1,1 -k2,2n /cluster/data/danRer3/bed/bacends/pairs/${f}.bed \
             > ${f}.lfs
     end
     wc -l *.lfs
     # 15169 bacEndPairsBadNoOrphans.lfs
     # 163174 bacEndPairs.lfs
     # 212775 bacEndSingles.lfs
 
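     # remove replicate rows where names match and the overlapping fraction
     # of the region (chromEnd - chromStart) is at least the -minOverlap
     # value. NOTE: this comment originally said 0.999, but the command
     # below uses -minOverlap=1.0.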
     ssh kolossus
     cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         echo "Processing $f"
         nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \
               ${f}.bed -name -minOverlap=1.0 -notBlocks
     end
     # Started: Tue Sep 27 21:51 Finished: Sep 28 06:29 
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
     # check the numbers of lines are correct
    
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
             | sort | uniq -c | sort -nr > ${f}.uniqCount
     end
     wc -l *
     # 163116 bacEndPairs.bed
     # 163174 bacEndPairs.lfs
     # 163116 bacEndPairs.uniqCount
     # 15163 bacEndPairsBad.bed
     # 15169 bacEndPairsBad.lfs
     # 15163 bacEndPairsBad.uniqCount
     # 212754 bacEndSingles.bed
     # 212775 bacEndSingles.lfs
     # 212754 bacEndSingles.uniqCount
     # numbers of lines after uniqueing by coords, name and score is the
     # same as that after using lfsOverlap to remove these lines so correct.
     cd /cluster/data/danRer3/bed/bacends/duplicates
     mv ./overlapRun/* .
     rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
     # Use perl script to choose 2 BAC ends to represent each BAC clone.
     # since there are often more than one read for each BAC end in this set,
     # 2 were chosen for each BAC pair or 1 for the singles. This was based on
     # the ones that had the largest region aligned (using lfSizes).
     # copy perl script over that was used for danRer2
     cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
        pickLfNamesv2.pl 
     # edit so that regular expression for matching BAC end names is the 
     # same as that used in ../bacends.1/getBacEndInfov2.pl
     # need to sort by chrom, chromStart

     # Write a sorted copy of each de-duplicated bed file as <name>Sort.bed.
     # NOTE(review): a sort key written as -k1 runs from column 1 to the end
     # of the line, so -k2 and -k3 add nothing and the start coordinate is
     # compared as text, not as a number. pickLfNamesv2.pl is not shown in
     # this file - confirm which ordering it needs before changing the keys.
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
     end
     # run perl script: input bed file, pairs or singles, name of output file
     perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
     mv error.log log.pairs
     # log.pairs lists the 18 cases where alignments for a BAC clone use
     # a different pair of sequence reads for the ends than the previous
     # alignment for ends for that BAC clone. These were all checked and in
     # each case, the extra alignments are almost identical or overlap for
     # the most part so it does not matter if the extra alignments are 
     # removed.
     # run script for singles:
     perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
     mv error.log log.singles
     # log.singles has 34 cases where alignments for a BAC clone use 
     # different sequence reads for either the T7 or SP6 BAC end.
     # singles may include both BAC ends for a clone in the case
     # where they aligned to different chromosomes or a long way apart on 
     # the same chromosome (orphans). mostly those that have a different read
     # align to an almost identical or largely overlapping region.
     # some sequences appear to be different: CH211-98J20 - zC98J20.yb and
     # zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
     # like it has low complexity sequence, this is discarded and zKp107B4.yb 
     # is kept. zKp107B4.za and zKp107B4.zb only align in the first ~ 59bp.
     # zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp 
     # on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
     # In these cases, the 2 sequences align to different regions.
     perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
          badPairs2lfNames.bed
     mv error.log log.badPairs
     # only 3 alignments have a different pair of ends to other alignments
     # but alignment region is almost the same in each case.
    
     # for each of these new bed files, checks were made that there are
     # only 2 BAC ends per alignments for pairs and 1 for singles.
     # For each pair, there should only be 2 ends which can appear either
     # way round depending on the orientation and there should be 1 end for
     # the beginning (suffix T7, t7 or z) and one end for the end
     # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
     # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
     # orientation. For singles, there should be a single BAC end for each
     # alignment and for each BAC clone, a sequence for either or both types
     # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
     # alignments.
     # Finally overlaps in BAC clone names were checked. All BAC clones
     # represented in each of the pairs, badPairs and singles bed files are
     # unique to that file. Between all three bed files, 300323 BAC clones
     # have alignments. 512886 clone ends are aligned in these three bed files. 
     
     # NOTE: using sort and uniq on hgwdev produces tab delimited output
     # after merging rows with the same BAC name, the scoring is now
     # wrong in the bed files.
     # Scores should be 1000 if there is 1 row for that name, else
     # 1500/number of rows for that sequence name - calculated by pslPairs.
     # Correct the scores.
                                                                                
     mkdir -p /cluster/data/danRer3/bed/bacends/scores
     cd /cluster/data/danRer3/bed/bacends/scores
     # copy over correctScores2.pl and checkScores.pl scripts from danRer2 and 
     # edit both scripts so that the hits file is split on spaces, not on tabs
     cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/correctScores2.pl .
     cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/checkScores.pl .
     awk '{print $4}' ../duplicates/pairs2lfNames.bed \
                 | sort | uniq -c > pairs.hits
     perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \
                           > bacEndPairsGoodScores.bed
     # same for singles
     awk '{print $4}' ../duplicates/singles1lfName.bed \
                 | sort | uniq -c > singles.hits
                                                                                
     perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \
                 noBin > bacEndSinglesGoodScores.bed
                                                                                
     # and for badPairs
     awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
                 | sort | uniq -c > badPairs.hits
     perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \
                 noBin > bacEndPairsBadGoodScores.bed
     # check that the scores are now correct  
     awk '{print $4, $5}' bacEndPairsGoodScores.bed \
         | sort | uniq -c > pairs.count
     perl checkScores.pl < pairs.count
     # all the BAC clones should be in good.txt and none in bad.txt
     # wc -l should give same number of lines in good.txt as in pairs.hits
     # repeat for other bed files
     awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
         | sort | uniq -c > badPairs.count
     perl checkScores.pl < badPairs.count
     awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
         | sort | uniq -c > singles.count
     perl checkScores.pl < singles.count
     # for the singles, 6 ended up in bad.txt because their scores 
     # were 214.285714285714 which is correct for 7 alignments. rounding the
     # score caused the discrepancy.
     ssh hgwdev
     cd /cluster/data/danRer3/bed/bacends/scores
     # copy over table definition from danRer2
     cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
        ../singles/
     # Now load database tables:
     hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
               -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
     # Loaded 163098 elements of size 11
     hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScores.bed \
               -sqlTable=../singles/bacEndSingles.sql -notItemRgb
     # Loaded 212720 elements of size 11
     # 212720 record(s), 0 row(s) skipped, 50 warning(s) loading bed.tab
     # warnings are unknown but all of bed file loaded and the number
     # of warnings is small so ignore
     hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
               -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
     # Loaded 15160 elements of size 11
     # load BAC end sequences into seq table so alignments may be viewed
     # symlink to the Zv5BACends.fa sequences in the danRer3 bacSeqs directory
     mkdir -p /gbdb/danRer3/bacends
     ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
                                /gbdb/danRer3/bacends/Zv5BACends.fa
     hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa

     # create file for loading all_bacends table
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends/scores
     # for all_bacends table, just load the alignments for those sequences
     # represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
     # bacEnds.load.psl is the file of alignments
     # get all the names of sequences 
     foreach f (*GoodScores.bed)
       echo $f
       awk '{print $11;}' $f >> allBacEnds.names
     end
     wc -l allBacEnds.names
     # 390978 allBacEnds.names
     # this is the total number of lines in the *GoodScores.bed files
     perl -pi.bak -e 's/,/\n/g' allBacEnds.names
     sort allBacEnds.names | uniq > allBacEnds.names.uniq
     wc -l allBacEnds.names.uniq
     # 512886 allBacEnds.names.uniq
     # get alignments for just the BAC ends that are in the database tables
     # make bacEnds.load.psl
     cd /cluster/data/danRer3/bed/bacends/scores 
     extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
            bacEndPairsBadGoodScores.bed bacEndSinglesGoodScores.bed | \
            sorttbl tname tstart | headchg -del > bacEnds.load.psl
    # check that alignments are present for all BAC ends in 
    # allBacEnds.names.uniq
    awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
    comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
    # 512886
    ssh hgwdev
    cd /cluster/data/danRer3/bed/bacends/scores
    # load all_bacends table
    hgLoadPsl danRer3 -table=all_bacends bacEnds.load.psl
    # load of all_bacends did not go as planned: 7584708 record(s), 
    # 0 row(s) skipped, 526 warning(s) loading psl.tab
    
    # (hartera, 2006-04-19)
    # Display is very slow for BAC ends on large regions. Try splitting
    # all_bacends by chromosome.
    ssh hgwdev
    mkdir /cluster/data/danRer3/bed/bacends/all_bacends
    cd /cluster/data/danRer3/bed/bacends/all_bacends
    # Write one PSL file per chromosome: for every name listed in chrom.lst,
    # keep the rows of bacEnds.load.psl whose column 14 (the PSL target
    # name) is chr<name> and save them as chr<name>.bacEnds.load.psl in the
    # current directory.
    set loadPsl = /cluster/data/danRer3/bed/bacends/scores/bacEnds.load.psl
    foreach chromName (`cat /cluster/data/danRer3/chrom.lst`)
        echo "Processing $chromName ..."
        # hand the chromosome name to awk as a variable rather than
        # splicing it into the awk program text
        awk -v target="chr${chromName}" '$14 == target' $loadPsl \
            > chr${chromName}.bacEnds.load.psl
    end
    # rename old table
    hgsql -e 'alter table all_bacends rename allBacendsOld;' danRer3
    # load new tables
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
       hgLoadPsl danRer3 -table=chr${c}_all_bacends chr${c}.bacEnds.load.psl
    end
    # There are still warnings on loading, most (510) are for chrUn.
    # This improves the performance a lot.
    # The chrom-parsing code is confused by the double underscores in the
    # chrN_all_bacends tables so change the names to chrN_allBacends
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
       hgsql -e "alter table chr${c}_all_bacends rename chr${c}_allBacends;" \
             danRer3
    end
    # Then add correct table name to each of the bacEnd* tables
    foreach t (bacEndPairs bacEndPairsBad bacEndSingles)
       hgsql -e "update $t set pslTable = 'allBacends';" danRer3
    end
    # corrected termRegex for some bacCloneXRef searches in trackDb.ra so 
    # that they work correctly (bacPairsIntName, bacSinglesIntName, 
    # bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)

# CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES 
# (bacEndAlias, bacCloneAlias and bacCloneXRef) (DONE, 2005-10-06, hartera)
# RECREATE TABLES AFTER REMAKING THE SINGLES AND PAIRS TABLES 
# (see REDO BACENDS SECTION) (DONE, 2006-06-08, hartera)
# REPLICATE ROWS IN TABLES SO REMOVE AND RELOAD (DONE, 2006-08-04, hartera)
    # Process data and create bacEndAlias table 
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends/bacends.1
    #  make bacEndAlias table with Genbank accessions for ends
    # need to run getBacEndInfo.pl for the BAC end names in the 
    # BAC tables.
    # in the pairs directory, there is the allBacEnds.names.uniq file
    # so use this.
    # Already made the bacEndAccs.aliases file with getBacEndInfov2.pl 
    # This has none of the BAC ends whose names end in ASP6 or AT7 as 
    # these are all from the CHORI73 library and they do not have BAC end
    # accessions in Genbank at the moment. This contains accessions for 
    # all BAC ends even those without alignments.
    hgsql danRer3 < $HOME/kent/src/hg/lib/bacEndAlias.sql
    echo "load data local infile 'bacEndAccs.aliases' into table \
         bacEndAlias" | hgsql danRer3
    ssh kkstore02
    # get the latest versions of the clonemarkers, contig names and markers
    # files from Sanger
    mkdir -p /cluster/data/danRer3/bed/bacends/cloneandStsAliases
    cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
    wget --timestamp \
      ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README
    wget --timestamp \
      ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/clonemarkers.27.07.05.txt
    wget --timestamp \
      ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/ctgnames.27.07.05.txt
    wget --timestamp \
         ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/markers.27.07.05.txt
    wc -l *27.07.05.txt
    # 29885 clonemarkers.27.07.05.txt
    # 167858 ctgnames.27.07.05.txt
    # 12250 markers.27.07.05.txt
    # Recreate tables as bacEndPairs, bacEndSingles, bacEndPairsBad and
    # chrN_allBacends tables have changed (2006-06-08, hartera)
    # get list of BAC end names, lfNames
    cp /cluster/data/danRer3/bed/bacends/scoresAndCoords/allBacEnds.names.uniq .
    # get list of BAC clone names 
    foreach f (bacEndPairs bacEndPairsBad bacEndSingles)
      awk '{print $4}' \
      /cluster/data/danRer3/bed/bacends/scoresAndCoords/${f}GoodScores.bed >> bacs.names
    end
    sort -u bacs.names > bacs.names.uniq
    wc -l *.uniq
    # 512321 allBacEnds.names.uniq
    # 300290 bacs.names.uniq

    # from psl file
    awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
    # edit to remove first few lines with no names
    sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
    wc -l bacEndsPsl.names.uniq
    # 545920 bacEndsPsl.names.uniq
    # this is all the BAC ends that originally had alignments
    # Add an alias table for BAC clones
    # bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
    # Add a xref table to give external clone registry names, internal names
    # sanger name, relationship between STS and BAC clone (method of finding
    # STS), UniSTS ID, chromosomes(s) to which BAC clone is mapped by BLAT,
    # Genbank accession and STS primer sequences
    # bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
    set dir=/cluster/data/danRer3/bed/bacends/
    awk 'BEGIN {OFS="\t"}{print $4, $1}' \
     $dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom
    awk 'BEGIN {OFS="\t"}{print $4, $1}' \
    $dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom
    sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
    # use a list of internal names,Genbank accessions, and BAC clone names
    # use BACClonesIdsandAccs.txt.
    # get list of UniSTS IDs using aliases to search alias file
    # print Sanger name, alias and UniSTS ID, use find_markers3.pl
cat << '_EOF_' > find_markers3.pl
# find_markers3.pl - pair Sanger STS names with UniSTS IDs via marker aliases.
#
# usage:   perl find_markers3.pl <aliasFile> <markersFile>
# example: perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
#
# aliasFile:   tab-separated; column 1 is an ID and column 2 is a
#              semi-colon separated list of aliases for that ID.
# markersFile: '|'-separated; field 2 is a semi-colon separated list of
#              Sanger names, field 3 a semi-colon separated list of aliases.
# output:      one "<Sanger name><TAB><ID>" line on stdout for every
#              Sanger name / ID combination of each markers line that has
#              a match. Only the first alias on a markers line that has a
#              hit in aliasFile is used for that line.
use strict;
my $verbose = 0;
die "usage: find_markers3.pl aliasFile markersFile\n" unless (@ARGV == 2);
my ($aliasFile, $markersFile) = @ARGV;
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";

# store the list of IDs recorded for each alias in aliasFile
my %aliasHash;
while (<ALIAS>)
{
   chomp;
   my ($id, $al) = split /\t/;
   # a line without a second column contributes no aliases
   next unless (defined $al);
   foreach my $as (split(/;/, $al))
      {
      push (@{$aliasHash{$as}}, $id);
      }
}
close ALIAS;

while (<MARKERS>) {
    chomp;
    # field 1 is not needed; fields after the aliases are ignored
    my (undef, $sangerNames, $aliases) = split /\|/;
    next unless (defined $aliases);
    if ($verbose > 3) { print "aliases $aliases \n"; }
    # IDs of the first alias with a hit; stays empty if there is no hit
    my @ids;
    ALIAS: foreach my $s (split /;/, $aliases) {
        # skip aliases that are empty or consist only of digits
        next ALIAS unless ($s =~ /\D/);
        if ($verbose > 5) { print "this $s \n"; }
        next ALIAS unless (exists($aliasHash{$s}));
        # an alias only counts as a hit if its first ID is a true value
        next ALIAS unless ($aliasHash{$s}[0]);
        @ids = @{$aliasHash{$s}};
        if ($verbose) { print "this $s found @ids \n"; }
        last ALIAS;
    }
    # print, not printf: the names are data and must never be used as a
    # format string (a % in a name would corrupt the output)
    foreach my $sn (split(/;/, $sangerNames)) {
        foreach my $i (@ids) {
            print "$sn\t$i\n";
        }
    }
}
close MARKERS;
'_EOF_'
    chmod +x find_markers3.pl
    perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-09-29/UniSTS.aliases \
         markers.27.07.05.txt > sangerandUniSTSId.txt
    # No need to reformat this for zfishBacClonesandSts
    # FPC contig information (i.e. FPC contig number) from ctgnames file is
    # not included in the tables as these are dynamic and constantly
    # changing with the assembly.
    # FILE OF BAC CLONE ACCESSIONS
    # http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out

    # copy over file of BAC internal names, accessions and external names 
    cp /cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt .
    # use zfishBacClonesandSts to create tab files for loading into
    # bacCloneAlias and bacCloneXRef tables
    # make output directory (any existing copy is removed first)
    # NOTE(review): the directory made here is on /cluster/bluearc but the
    # program below is given ./out - presumably ./out is a link to this
    # directory; confirm before re-running.
    rm -r /cluster/bluearc/danRer3/bacEnds/out
    mkdir -p /cluster/bluearc/danRer3/bacEnds/out
    # edit zfishBacClonesandSts.c to add prefixes for CHORI73 library:
    # CHORI73_ for internal name, CH73- for external name
    # The one-liner below rewrites the first zH<digits> on each line of the
    # *.27.07.05.txt files (ctgnames, clonemarkers and markers) as
    # CHORI73_<digits>, keeping the unedited files as .bak copies.
    perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.07.05.txt 
    # keep the unedited ctgnames and clonemarkers files
    # NOTE(review): the two backups get different suffixes
    # (.27.07.05.orig and .27.07.05.txt.orig).
    mv ctgnames.27.07.05.txt.bak ctgnames.27.07.05.orig
    mv clonemarkers.27.07.05.txt.bak clonemarkers.27.07.05.txt.orig
    # no change to markers file so remove .bak file
    rm markers.27.07.05.txt.bak 
    # arguments: the three Sanger files, the BAC clone name/chrom list,
    # the BAC clone IDs and accessions, the Sanger name/UniSTS ID list and
    # the output directory; stdout is saved in ./out/zfishBacs.out
    # NOTE(review): this is started in the background (&), so it must have
    # finished before the sort and cp commands below are run.
    nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.07.05.txt \
      clonemarkers.27.07.05.txt markers.27.07.05.txt \
      bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
      sangerandUniSTSId.txt ./out > ./out/zfishBacs.out &
    # output is in /cluster/bluearc/danRer3/bacends/out so copy over
    # sort alias tab file by sangerName
    sort -k2 ./out/bacAlias.tab > bacAlias.sort.tab
    cp ./out/bacXRef.tab .
    wc -l *.tab
    # 110961 bacAlias.sort.tab
    # 540800 bacXRef.tab
 
    ssh hgwdev 
    cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
    hgsql -e 'drop table bacCloneAlias;' danRer3
    hgsql -e 'drop table bacCloneXRef;' danRer3
     
    hgLoadSqlTab danRer3 bacCloneAlias \
          $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab
    hgLoadSqlTab danRer3 bacCloneXRef \
          $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab
# edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs
# and bacEndSingles tracks as for danRer1. copy over html from danRer2
# for bacEndPairs and bacEndSingles tracks.
    # The tables contain replicated (duplicate) rows, so remove these rows
    # and reload the tables (hartera, 2006-08-04)
    ssh hgwdev 
    cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases

    sort bacAlias.sort.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq
    sort bacXRef.tab | uniq > bacXRef.tab.uniq
    wc -l *.tab.uniq
    # 57656 bacAlias.sort.tab.uniq
    # 356453 bacXRef.tab.uniq
 
    # Drop old tables and reload:
    hgsql -e 'drop table bacCloneAlias;' danRer3
    hgsql -e 'drop table bacCloneXRef;' danRer3
    
    hgLoadSqlTab danRer3 bacCloneAlias \
          $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq
    hgLoadSqlTab danRer3 bacCloneXRef \
          $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq

# BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2005-10-06, hartera)
# REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES - both ok.
# (DONE, 2006-06-12, hartera)
# REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2006-08-04, hartera)
    # The following tests were carried out to check that all the data
    # in the bacCloneAlias and bacCloneXRef tables is correct.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
    cp ./testTablesNew/*.pl .
    rm -r testTablesNew
    mkdir -p testTablesNew
    cd testTablesNew
                                                                                
# Check that the correct aliases are associated with their Sanger STS names
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
        ../markers.27.07.05.txt > sNameandaliases
    # write script to get one Sanger name and one alias on each line
    cp ../*.pl .
    perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
    sort sNameandaliases.format | uniq > sNameandaliases.sort
    # get Sanger names and aliases from database
    hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer3 \
          | sort | uniq > alias.db.sort
    wc -l alias.db.sort
    # 57656 alias.db.sort
    diff sNameandaliases.sort alias.db.sort
    # No difference between data file and data from database so ok
    # Check Sanger STS names correspond in bacAlias and bacCloneXRef tables
    # get Sanger names from alias table
    hgsql -N -e 'select sangerName from bacCloneAlias;' danRer3 \
             | sort | uniq > sName.alias.sort
    wc -l sName.alias.sort
    # 15309 sName.alias.sort
    # get Sanger names from xRef table
    hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
          is not null;' danRer3 | sort | uniq > sName.xRef.sort
    wc -l sName.xRef.sort
    # 15522 sName.xRef.sort
    comm -23 sName.alias.sort sName.xRef.sort
    # nothing unique to alias file so all Sanger names in the alias table are
    # also in the xRef table
    comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
    wc -l sNamexRefNotAlias
    # 213 sNamexRefNotAlias
    awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.07.05.txt | sort | uniq \
        > clonemarkers.sNames.sort
    # get Sanger names from markers file
    awk 'BEGIN {FS="|"}{print $2}' ../markers.27.07.05.txt > markers.sNames
    # remove semi-colons and sort
    sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
    # sanger names unique to markers file
    comm -13 clonemarkers.sNames.sort markers.sNames.sort
    # there are none
    comm -23 clonemarkers.sNames.sort markers.sNames.sort \
         > sNames.clonemarkersOnly
    wc -l sNames.clonemarkersOnly
    # 213 sNames.clonemarkersOnly
    diff sNames.clonemarkersOnly sNamexRefNotAlias
    # No difference so all the extra Sanger Names in the xRef 
    # table are from the clonemarkers file and these have no aliases in 
    # the markers file so they are not in the alias table so this is all ok.
  
# Check that Sanger STS names and primers are associated correctly
    cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases/testTablesNew
    # get sanger names and primers from markers file
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
        ../markers.27.07.05.txt > sNameandPrimers
    # use script to reformat and write with one Sanger name per line
    chmod +x getSangerandPrimers.pl
    perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
    sort sNameandPrimers.format > sNameandPrimers.format.sort
    wc -l sNameandPrim*
    # 12250 sNameandPrimers
    # 15309 sNameandPrimers.format
    # 15309 sNameandPrimers.format.sort
    # get Sanger names and primers from database
    hgsql -N -e \
      'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
      where sangerName is not null and leftPrimer is not null and \
      rightPrimer is not null;' danRer3 | sort | uniq \
      > sNamesandprimers.fromdb.sort
    wc -l sNamesandprimers.fromdb.sort
    # 15309 sNamesandprimers.fromdb.sort
    diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
    # No difference so ok.

# Check that UniSTS IDs and Sanger STS names are associated correctly
   # get Sanger names and UniSTS IDs from the database
   hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
       uniStsId is not null;' danRer3 | sort | uniq > sNameUniSTS.fromdb.sort
   wc -l sNameUniSTS.fromdb.sort
   #  5634 sNameUniSTS.fromdb.sort
   # Need to reformat the sNameUniSTS.fromdb.sort
   chmod +x formatUniSts.pl
   perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
        > sNameUniSTS.fromdb.format.sort
   # get Sanger names from data file and see how many UniSTS IDs there are
   # for each name
   awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
       > sangerandUniSTSId.count
   # the most is 3 
   # 3 etID9786.21
   # 3 etID9056.23
   # 3 etID9042.2
   # 3 etID8627.2
   # 3 etID8281.9
   # 3 etID11096.5
   sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
   diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
       > sangerandUniSTSIdvsdb
   # No difference between data from original file and that in database so ok

# Check that chrom mappings and external BAC clone names are correct
   # get extNames and chroms they map to from the database
   hgsql -N -e 'select name, chroms from bacCloneXRef where \
         chroms is not null;' danRer3 | sort | uniq \
         > nameandchromsfromdb.sort
   # reformat nameandchromsfromdb.sort
   perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
        > nameandchromsfromdb.format.sort
   # compare extNames and chroms from db to those in data file
   cp ../bacClones.namesandchrom .
   sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq
   diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
   # no difference - all ok

# Check Genbank accessions and internal BAC clone names
   hgsql -N -e 'select intName,genbank from bacCloneXRef where \
         genbank is not null;' danRer3 | sort | uniq \
         > intNamesandAccs.fromdb.sort
   # this should be a subset of zfish_accsMerged.txt - not all BAC clones
   # listed here appear in either our BAC ends tracks or the markers files.
   awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \
       | sort -u > BACClonesIntandAccs.sort
   comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort
   # there is nothing in the database that is not in BACClonesIntandAccs.sort
   comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \
            > onlyinzfishAccs
   wc -l onlyinzfishAccs
   # 86 onlyinzfishAccs
   hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
         danRer3 | sort | uniq > intNamesNoAcc.fromdb.sort
   awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort
   comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
        > indbNoAccsandAccs.out
   # none of these names are common to both so all accessions from
   # BACClonesIdsandAccs.txt are in the database for the internal names stored
   # where there are accessions available.

# Test Sanger STS names, internal names and external names are all correct
# Test Sanger STS name and internal BAC clone names are associated correctly
   # get internal names and Sanger names from data file
   awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.07.05.txt \
       | sort | uniq > intNameandSanger.sort
   hgsql -N -e 'select intName, sangerName from bacCloneXRef \
       where sangerName is not null;' danRer3 \
       | sort | uniq > intNameandSanger.fromdb.sort
   diff intNameandSanger.sort intNameandSanger.fromdb.sort
   # No difference between data from file and that from database so ok

# Check BAC clone internal name and relationship fields
   # get internal names and relationships from data file
   awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.07.05.txt \
       | sort | uniq > intNameandRelation.sort
   # get internal names and relationships from database
   hgsql -N -e 'select intName, relationship from bacCloneXRef \
       where relationship != 0;' danRer3 \
       | sort | uniq > intNameandrelation.fromdb.sort
   # differences unique to database file
   comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
       > intNameRelation.indbonly
   # differences unique to data file
   comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
       > intNameRelation.incloneMarkersonly
   wc -l intNameRelation*
   # 4650 intNameRelation.incloneMarkersonly
   # 4650 intNameRelation.indbonly
  
   awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
   awk '{print $1}' intNameRelation.incloneMarkersonly \
       > intNameRelation.incloneMarkersonly.names
   diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
   # There is no difference in the internal names between the two lists.
   # The only place these lists should differ is the second column, which
   # should be 3 for every row found only in the database. This is because
   # all the relationship entries that were blank in the clonemarkers file
   # were changed to 3 when entered into the database.
   awk '{print $2}' intNameRelation.indbonly | sort | uniq
   # 3 - correct so all ok
   # all the differences should be that those that are blank in clonemarkers
   # are 3 in the database.
   # check that those that have 0 in the database bacCloneXRef relationship
   # field are not in the list from clonemarkers
   # select these internal names with 0 relationship from the database
   hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
         danRer3 | sort | uniq > intNameNoRelation.fromdb.sort
   # get all the internal names from the data file
   awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt \
       | sort | uniq > intNamefromCloneMarkers.sort
   comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
   # nothing in common between these two files as expected so there are
   # no internal names in the db with 0 in the relationship field that
   # appear in the clonemarkers file.

# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
   # get intName and extName from ctgnames file
   awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.07.05.txt \
       | sort | uniq > intNameandextNamefromCtgNames.sort
   # get intName and extName from database
   hgsql -N -e 'select intName,name from bacCloneXRef;' danRer3 \
       | sort | uniq > intNameandextName.fromdb.sort
   wc -l intNameandextName*
   # 340039 intNameandextName.fromdb.sort
   # 167858 intNameandextNamefromCtgNames.sort
   comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
        > intandextindbAndCtgNames
   wc -l intandextindbAndCtgNames
   # 167858 intandextindbAndCtgNames
   # there are 167858 name pairs common between the file and the database
   # and this is the same number of name pairs as in the data file
   diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
   # no difference between those name pairs from the data file and those that
   # are common between the data file and the database so all internal and
   # external names from ctgNames file are in the database
   # get the list of extra ones from db
   comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
        > intandextNamesindbNotinCtgNames
   wc -l intandextNamesindbNotinCtgNames
   # 172181 intandextNamesindbNotinCtgNames
   # get list of internal names from the clonemarkers file
   awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt | sort | uniq \
       > clonemarkers.intName.sort
   wc -l clonemarkers.intName.sort
   # 13471 clonemarkers.intName.sort
   # compare these intNames to those from the database not in the ctgnames file
   comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
   # none of these clone markers internal names are in this list so they
   # must all be in the ctgnames file too. These extra internal names will be
   # translations of external names found in the list of mappings of BAC clones
   # to chroms.

# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
   # get all extNames from bacClones.namesandchrom.uniq and from ctgnames
   awk '{print $1}' ../bacClones.namesandchrom.uniq > \
       extNames.ctgnamesandbacClones
   awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.07.05.txt \
       >> extNames.ctgnamesandbacClones
   wc -l extNames.ctgnamesandbacClones
   # 510169 extNames.ctgnamesandbacClones
   sort extNames.ctgnamesandbacClones | uniq \
        > extNames.ctgnamesandbacClones.sort
   wc -l extNames.ctgnamesandbacClones.sort
   # 340039 extNames.ctgnamesandbacClones.sort
   # get extNames from the database
   hgsql -N -e 'select name from bacCloneXRef;' danRer3 | sort | uniq \
         > extNames.fromdb.sort
   wc -l extNames.fromdb.sort
   # 340039 extNames.fromdb.sort
   comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
         > extNames.fromdbandfiles
   wc -l extNames.fromdbandfiles
   # 340039 extNames.fromdbandfiles
   # find extNames in common from data files and database
   diff extNames.fromdb.sort extNames.fromdbandfiles
   # no difference, all extNames from files are in db

# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
   # get internal names from ctgnames and clonemarkers files
   awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.07.05.txt \
       > intNames.ctgnamesandclonemarkers
   awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.07.05.txt \
       >> intNames.ctgnamesandclonemarkers
   wc -l intNames.ctgnamesandclonemarkers
   # 197743 intNames.ctgnamesandclonemarkers
   sort intNames.ctgnamesandclonemarkers | uniq \
        > intNames.ctgnamesandclonemarkers.sort
   wc -l intNames.ctgnamesandclonemarkers.sort
   # 167858 intNames.ctgnamesandclonemarkers.sort
   # get internal names from database
   hgsql -N -e 'select intName from bacCloneXRef;' danRer3 | sort | uniq \
        > intNames.fromdb.sort
   wc -l intNames.fromdb.sort
   # 340039 intNames.fromdb.sort
   # some of these intNames are derived from the corresponding extNames
   # all of the intNames from the file should be in the db
   comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
        > intNames.fromdbandfiles
   wc -l intNames.fromdbandfiles
   # 167858 intNames.fromdbandfiles
   diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
   # no difference, all intNames from files are in db
                                                                                
# Check that all translations are correct between BAC clone
# external and internal names.
   # write script to get the prefixes from internal and external names
   chmod +x getNamePrefixes.pl
   hgsql -N -e 'select name, intName from bacCloneXRef;' danRer3 \
         | sort | uniq > extandintNames.fromdb.sort
   perl getNamePrefixes.pl < extandintNames.fromdb.sort \
         > extandintNames.prefixes
   sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
   # these all look good
   # BUSM1   dZ
   # CH211   zC
   # CH211   zc
   # CH73    CHORI
   # CT7     bP
   # DKEY    zK
   # DKEY    zk
   # DKEYP   zKp
   # RP71    bZ
   # XX      bY
   # zk is an internal name prefix for the external name prefix, DKEY-. There
   # is only one example where this is used (DKEY-81G7) and this is in the
   # ctgnames file and is in the bacCloneXRef table so that is ok.
   # All data looks good in these tables now.

# BLASTZ TETRAODON (tetNig1) (DONE, 2005-10-20, hartera)
# REMADE DOWNLOADS FOR net, all.chain AND over.chain AS THEY HAD BEEN DELETED.
# MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
# ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
    # Tetraodon is quite distant from zebrafish, more distant than human/chicken
    # so use the HoxD55.q matrix for the Blastz alignments.
    # Blastz requires lineage-specific repeats but there are none
    # available between these two fish species 
    
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/blastz.tetNig1.2005-10-11
    cd /cluster/data/danRer3/bed
    ln -s blastz.tetNig1.2005-10-11 blastz.tetNig1
    cd /cluster/data/danRer3/bed/blastz.tetNig1
    # create a 2bit file for danRer3 with all chroms (1-25 and M) and the
    # scaffolds for NA and Un if it does not exist already
    cd /cluster/data/danRer3
    faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
               Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
    ssh hgwdev
    # Stage the blastz inputs on the san and set up the run and output
    # directories.
    # move the 2 bit file for danRer3 to the san if not there already
    mkdir -p /san/sanvol1/scratch/danRer3/
    mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
       /san/sanvol1/scratch/danRer3/
    # also copy over the danRer3 2 bit file for all chroms and the
    # lift file for NA and Un scaffolds to chrNA and chrUn.
    # The destination is sanvol1 (not sanvol) so that it matches the
    # SEQ1_DIR and SEQ1_LIFT paths in the DEF file.
    cp /cluster/data/danRer3/danRer3.2bit /san/sanvol1/scratch/danRer3/
    cp /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
       /san/sanvol1/scratch/danRer3/

    # also copy over tetraodon sequences to the san
    # see makeTetNig1.doc for making tetNig1ChrContigsRandomScafs.2bit
    mkdir -p /san/sanvol1/scratch/tetNig1/contigs
    cp /cluster/bluearc/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit \
       /san/sanvol1/scratch/tetNig1/contigs/
    # make output and run directories
    mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
    mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
    # link the san run and output directories into the blastz.tetNig1
    # directory (each link takes the name of its target directory)
    cd /cluster/data/danRer3/bed/blastz.tetNig1
    ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
    ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
# use tetraodon sequence in contigs for dynamic masking - see below
# for dynamic masking: M=50. Each time a base is hit at least 50 times, it
# is masked out.
# Blastz danRer3 chroms and scaffolds vs tetNig1 ordered chrom contigs and 
# scaffolds from random chromosomes. lift up the tetNig1 contigs to chrom 
# level. Then make the chains and then liftUp all the scaffolds to chrom 
# level before sorting and merging chains and then netting.
    # get all contigs from mapped ordered chroms and make 2bit file
    # see makeTetNig1.doc

    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000

# QUERY - Tetraodon (tetNig1)
# soft-masked 500 kb contigs for chroms, scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0

BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF
    cp /cluster/data/danRer3/chrom.sizes ./S1.len
    twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
               chromsUnNAScafs.sizes
    twoBitInfo \
/san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit ./S2.len
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
  -bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -stop cat \
  -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
  `pwd`/DEF >& do.log &
   # PID 32339  Start: Tue Oct 11 14:55
   # use Hiram's script to kill 4 empty shell commands on Thurs Oct 13th
   # /cluster/bin/scripts/findEmpty.sh -r to find
   # /cluster/bin/scripts/findEmpty.sh -K to kill
# Fri Oct 14 10:41
# Checking finished jobs
# crashed: 32
# running: 20
# ranOk: 3716
# failed 4 times: 32
# total jobs in batch: 3768
# check problems:

# 141 jobs crashed on host: kkr10u19.kilokluster.ucsc.edu
# Just removed this machine with parasol remove machine as over 9000 jobs 
# crashed for opossum run on this machine.
# run again with para push -retries=20
# By 16:00 on Fri Oct 14, all jobs finished but 2 failed 4 times so repush
# with para push -retries=20.
# para time
# Completed: 3768 of 3768 jobs
# CPU time in finished jobs:   12465019s  207750.32m  3462.51h  144.27d  0.395 y
# IO & Wait Time:                873594s   14559.90m   242.66h   10.11d  0.028 y
# Average job time:                3540s      59.00m     0.98h    0.04d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           19777s     329.62m     5.49h    0.23d
# Submission to last job:        264857s    4414.28m    73.57h    3.07d
    ssh pk
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/run.blastz
    para time > run.time
    # run doBlastzChainNet.pl to continue with cat step since the script
    # crashed when some of the jobs failed 4 times.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
  -bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -continue cat -stop cat \
  -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
  `pwd`/DEF >& doCat.log &
    # Took about 7 minutes.
    # Now need to liftUp the contigs for tetNig1 to chrom-level but
    # not the scaffolds. All the scaffolds will be lifted after the 
    # chaining step.
    ssh kolossus
    # liftUp contigs for tetraodon query: 
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
    mv pslParts pslPartsNotLifted
    mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun/liftedPsl
    set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
    # use carry for "how" as this will carry items not in liftSpec to dest
    # file without translation. lift file is only for contigs not scaffolds.
    # use nohead option otherwise psl header added at the top of each file.
    # need to add the blastz params header
    zcat ./pslPartsNotLifted/part958.lst.psl.gz | head -3 > header

    # first lift to pseudo-contig level and then to chroms
  # For each unlifted part: $f:r drops the .gz extension and :t keeps only
  # the file name, so $g is e.g. part958.lst.psl
  foreach f (./pslPartsNotLifted/*.psl.gz) 
     set g=$f:r:t
     # first pass: lift the tetraodon (query) side using 500kbcontigs.lft
     zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
  /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft carry stdin
     # second pass: lift the first-pass output again using liftAll.lft
     liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
  /cluster/data/tetNig1/jkStuff/liftAll.lft carry $dir/liftedPsl/${g}.lifted.psl
     # put the saved 3-line header back on top of the lifted psl
     cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
     # remove the two intermediate files, leaving only $g
     rm $dir/liftedPsl/${g}.lifted*
  end
    # check a couple of files and see that they have the correct number of lines
    # then move the contents of this directory to pslParts
    mkdir $dir/pslParts
    foreach f ($dir/liftedPsl/*.psl)
       gzip $f 
       mv ${f}.gz $dir/pslParts/
    end
    # carry on with doBlastzChainNet.pl from the chaining step
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
    cp DEF DEF.tetraContigs
    # edit DEF file so that tetNig1 now has a 2bit file of the chroms and 
    # scaffolds for randoms in the CTGDIR and also there is a lift file
    # for the scaffolds.
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
                                                                                
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
                                                                                
# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000
                                                                                
# QUERY - Tetraodon (tetNig1)
# soft-masked chroms, and scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
                                                                                
BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
                                                                                
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
TMPDIR=/scratch/tmp
                                                                                
#DEBUG=1
'_EOF_'
    # if it does not exist already, make the file of sizes for the tetNig1
    # chroms and scaffolds.
    twoBitInfo \
/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit \
    /san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
    # Also, need to change the sequence sizes file for tetNig1 to the 
    # chrom sizes and not the scaffolds and contigs sizes.
    cp S2.len S2contigsAndScafs.len
    cp /cluster/data/tetNig1/chrom.sizes S2.len
    # then run doBlastzChainNet.pl script again
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
         -bigClusterHub=pk \
         -smallClusterHub=pk \
         -workhorse=pk \
         -fileServer=kolossus \
         -continue chainRun \
         -chainMinScore=5000 \
         `pwd`/DEF >& doChains.log &
    # Start: Fri Oct 14 17:47 Finished: Oct 14 17:57
    # crashed as one job failed after 4 retries, problem is that 
    # part958.lst.psl.gz is not recognized as a psLayout file. It is empty
    # except for parameter comment lines so it can be ignored.
    # Also, need to change the sequence sizes file for tetNig1 to the 
    # chrom sizes and not the scaffolds and contigs sizes.
    ssh pk
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/run/
    para time > run.time
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
    # crashes while doing chainMerge so add a flag into DEF file to indicate
    # that the genomes are in scaffolds so there is a large number of chain 
    # files. Changed doBlastzChainNet.pl so that if this flag is seen then 
    # the chain files are concatenated and then chainSort is used to sort 
    # the resulting chain file by score and chainMergeSort is used to renumber 
    # the chain IDs so that they are unique. chainMergeSort expects chain 
    # files sorted by score as input.
    # add this line to the DEF file: GENOME_IN_SCAFFOLDS=1 
    nice ./doBlastzChainNet.pl \
         -bigClusterHub=pk \
         -smallClusterHub=pk \
         -workhorse=pk \
         -fileServer=kolossus \
         -continue chainMerge \
         -chainMinScore=5000 \
         `pwd`/DEF >& doChainMergeNet.log &
    # Start: Wed Oct 19 12:52 Finish: Oct 19 13:13   
    # Add a trackDb.ra entry for chainTetNig1 and netTetNig1 and add html 
    # pages. Modify track descriptions to describe the process using 
    # scaffolds for danRer3 chrNA and chrUn and the fact that dynamic 
    # masking was used for the Blastz alignments. Edit the README for 
    # the downloads to add in information about using scaffolds for Blastz 
    # for danRer3 chrNA and chrUn and for tetNig1 random unordered chroms, 
    # and how the tetNig1 genome was aligned as a file of contigs for chroms
    # and scaffolds for randoms for the Blastz alignments and so that
    # each danRer3 chunk was aligned with the whole of the tetraodon 
    # genome to take advantage of dynamic masking (M=50).
    # Finally, run a doBlastzChainNet.pl swap for this to create danRer3 
    # chains and net tracks on tetNig1 - see makeTetNig1.doc.
 
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.746%, chainTetNig1Link 7.167%, both 0.672%, cover 90.17%, 
# enrich 12.58x
# featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.750%, chainTetNig1Link 4.463%, both 0.621%, cover 82.84%, 
# enrich 18.56x
# so better coverage for danRer3 but less enrichment than for danRer2.

# Make the download files for all.chain, over.chain and net again as these
# files have been removed. Put the files on /cluster/data rather than the 
# san so that they are not moved again. (hartera, 2005-11-17)
    ssh kolossus
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
    chainMergeSort ./run/chain/*.chain | nice gzip -c \
                   > danRer3.tetNig1.all.chain.gz
    # copy over.chain file from bedOver directory to axtChain directory
    cp /cluster/data/danRer3/bed/bedOver/danRer3.tetNig1.over.chain.gz \
       /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/
    # recreate net file
    # make noClass.net
    #Make nets ("noClass", i.e. without rmsk/class stats which are added later) 
    chainPreNet danRer3.tetNig1.all.chain.gz ../S1.len ../S2.len \
         stdout | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout \
         /dev/null | netSyntenic stdin noClass.net 
    # memory usage 251383808, utime 562 s/100, stime 41
    # create net file 
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
    netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
    # compress net file
    gzip danRer3.tetNig1.net

    # Move these files to /cluster/data and remake download links as the 
    # san is not a permanent storage space.
    # chromsAndScafsRun in blastz.tetNig1 is the symlink to the san run
    # directory made when the run was set up. Remove the link first, as mv
    # will not replace a symlink (a non-directory) with a directory.
    rm /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
    mv /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun \
       /cluster/data/danRer3/bed/blastz.tetNig1/
    # Then change the symlinks in the downloads directory to point to the files
    # on /cluster/data
    cd /usr/local/apache/htdocs/goldenPath/danRer3/vsTetNig1/axtNet
    set runDir=/cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
    rm *.gz
    foreach f ($runDir/axtNet/*.axt.gz)
      ln -s $f .
    end
    cd ..
    rm *.gz
    foreach f ($runDir/axtChain/*.gz)
      ln -s $f
    end
    # remake the md5sum file
    rm md5sum.txt
    md5sum *.gz */*.gz > md5sum.txt
    
    # Test Runs for chr2 and chrUn
    cd /cluster/data/danRer3/bed/blastz.tetNig1
    mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
    ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
    # create blastz output directory
    mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
    ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
    mkdir /san/sanvol1/scratch/danRer3/chrUnand2
    cd /san/sanvol1/scratch/danRer3/chrUnand2
    cp ../nib/chr2.nib ../nib/chrUn.nib .
    rsync -a --progress /cluster/bluearc/tetNig1/contigs/tetNig1Contigs.2bit \
       /san/sanvol1/scratch/tetNig1/contigs/
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run

cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM
SEQ1_DIR=/san/sanvol1/scratch/danRer3/chrUnand2
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target
SEQ1_CHUNK=500000
SEQ1_LAP=500

# QUERY - Tetraodon (tetNig1)
# soft-masked 500 kb contigs for chroms, scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0

BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF
   
    cp /cluster/data/danRer3/chrom.sizes ./S1.len 
    twoBitInfo \
    /san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit ./S2.len
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
      -bigClusterHub=pk \
      -smallClusterHub=pk \
      -workhorse=pk \
      -fileServer=kolossus \
      -stop cat \
      -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
      -chainMinScore=5000 \
      `pwd`/DEF >& do.log &
      # PID: 4890 Start: Thu Sep 29 14:50
      # ran quickly, 30 mins
      # crashed as some jobs crashed and failed after 4 retries so 
      # push them again. 
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
      -bigClusterHub=pk \
      -smallClusterHub=pk \
      -workhorse=pk \
      -fileServer=kolossus \
      -continue cat \
      -stop cat \
      -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
      -chainMinScore=5000 \
      `pwd`/DEF >& doCat.log &
    # Took a couple of minutes
    # need to lift up the contigs to chrom level for tetNig1 
    # liftUp contig files for tetraodon query: 
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run
    # set the unlifted output aside first: the loop below reads from
    # pslPartsNotLifted, so that directory must exist before the list is made
    mv pslParts pslPartsNotLifted
    # if file is empty, then liftUp gets stuck reading commented lines
    # so make a list of files which contain alignment data and not just
    # commented lines starting with # (blastz parameters)
    # The file name is printed once per non-comment line, hence the
    # sort | uniq afterwards.
    foreach f (./pslPartsNotLifted/*.psl.gz)
        zcat $f | awk '{if ($1 !~ /#/) print "'$f'";}' >> pslParts.lst
    end
    sort pslParts.lst | uniq > pslPartsNotEmpty.lst 
    mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run/liftedPsl
    set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
    # use carry for "how" as this will carry items not in liftSpec to dest
    # file without translation. lift file is only for contigs not scaffolds.
    # use nohead option otherwise psl header added at the top of each file.
    # need to add the blastz params header
    zcat \
    ./pslPartsNotLifted/chrUn.nib:chrUn:99500000-100000500.psl.gz \
    | head -3 > header

    # first lift to pseudo-contig level and then to chroms
    # Only the files listed in pslPartsNotEmpty.lst are lifted.
    # NOTE(review): the comment before the header step says "carry" is used
    # for "how", but both liftUp calls here pass "warn" - confirm which was
    # intended for this test run.
  foreach f (`cat pslPartsNotEmpty.lst`) 
     # $f:r drops the .gz extension and :t keeps only the file name
     set g=$f:r:t
     # first pass: lift the tetraodon (query) side using 500kbcontigs.lft
     zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
  /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft warn stdin
     # second pass: lift the first-pass output again using liftAll.lft
     liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
  /cluster/data/tetNig1/jkStuff/liftAll.lft warn $dir/liftedPsl/${g}.lifted.psl
     # put the saved 3-line header back on top of the lifted psl
     cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
     # remove the two intermediate files, leaving only $g
     rm $dir/liftedPsl/${g}.lifted*
  end
    mv liftedPsl pslParts
    # need to gzip these again
    foreach f (./pslParts/*.psl)
       gzip $f
    end
    # then carry on with chaining for these danRer3 NA and Un scaffolds
    # tetNig1.2bit has full chroms for ordered chroms
    # and randoms as scaffolds
    cp DEF DEF.contigs
    # copy over 2bit file with chroms for tetNig1 if not
    # there already.
    mv S2.len S2.contigs
    twoBitInfo \
    /san/sanvol1/scratch/tetNig1/tetNig1.2bit ./S2.len
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
      -bigClusterHub=pk \
      -smallClusterHub=pk \
      -workhorse=pk \
      -fileServer=kolossus \
      -continue chainRun \
      -stop net \
      -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
      -chainMinScore=5000 \
      `pwd`/DEF >& doNet.log &
    # PID 1117    Start: Thu Sep 29 16:20 Finished: 16:24
    # crashed: says it can't find [danRer3.tetNig1.]all.chain[.gz] but it 
    # is there.
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
      -bigClusterHub=pk \
      -smallClusterHub=pk \
      -workhorse=pk \
      -fileServer=kolossus \
      -continue net \
      -stop net \
      -chainMinScore=5000 \
      `pwd`/DEF >& doNet2.log &
    # Took 1 minute
    # TO DO: load tables
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain/chain
    foreach f (*.chain)
       set c=$f:r
       hgLoadChain danRer3 ${c}_chainTetNig1NoScafs $f
    end
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain
    # add gap/repeat stats to net file using db tables
    netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
    # load nets
    netFilter -minGap=10 danRer3.tetNig1.net \
              | hgLoadNet -verbose=0 danRer3 netTetNig1NoScafs stdin

    # then need to load chains and net into browser with a different name
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.742%, chainTetNig1Link 7.166%, both 0.670%, cover 90.26%, 
# enrich 12.60x
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
# refGene:cds 0.742%, chainTetNig1NoScafsLink 7.171%, both 0.670%, cover 90.30%, enrich 12.59x
# featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.497%, chainTetNig1Link 6.175%, both 0.441%, cover 88.68%, enrich 14.36x
# featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
# refGene:cds 0.497%, chainTetNig1NoScafsLink 6.179%, both 0.441%, cover 88.67%, enrich 14.35x
# Rows in chainTetNig1Link:
#     tetNig1	tetNig1NoScafs
# chr2	308576	303236
# chrUn	1133922 1114061

#nets:
# featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.742%, netTetNig1 62.053%, both 0.715%, cover 96.34%, enrich 1.55x
# featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1NoScafs -enrichment
# refGene:cds 0.742%, netTetNig1NoScafs 63.095%, both 0.717%, cover 96.63%, enrich 1.53x
# featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.497%, netTetNig1 48.803%, both 0.477%, cover 95.87%, enrich 1.96x
# featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1NoScafs -enrichment
# refGene:cds 0.497%, netTetNig1NoScafs 49.207%, both 0.478%, cover 96.01%, enrich 1.95x
#  Rows in netTetNig1
# 	tetNig1		tetNig1NoScafs
# chr2	17370		17415
# chrUn	56259		56360

# featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.739%, chainTetNig1Link 4.463%, both 0.617%, cover 83.44%, 
# enrich 18.69x
# featureBits -chrom=chr2 danRer3 refGene:cds chainNoHoxD55TetNig1Link -enrichment
# refGene:cds 0.668%, chainNoHoxD55TetNig1Link 4.815%, both 0.587%, 
# cover 87.95%,enrich 18.27x

# featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55TetNig1Link -enrichment
# refGene:cds 0.668%, chainHoxD55TetNig1Link 7.846%, both 0.612%, cover 91.71%, enrich 11.69x
# HoxD55.q with mm6 parameters but H=2500:
# featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55v2TetNig1Link -enrichment
# refGene:cds 0.668%, chainHoxD55v2TetNig1Link 7.400%, both 0.601%, 
# cover 90.10%,enrich 12.18x

# if H=2000 is used, one job does not finish for blastz after a day.
# makes little difference if use mm6 parameters
#  Database   	Table			Number of chains
#  danRer2	chr2_chainTetNig1		21176
#  danRer3	chr2_chainNoHoxD55TetNig1	16076
#  danRer3	chr2_chainHoxD55TetNig1		23951
#  danRer3	chr2_chainHoxD55v2TetNig1	21378
# also there are more lower scoring chains with HoxD55 alone than for 
# no HoxD55 or using the mm6 parameters with HoxD55. However, using HoxD55
# seems to increase the number of higher scoring chains.

# BLASTZ, CHAIN AND NET FOR OPOSSUM (monDom2) (DONE, 2005-10-18, hartera)
# MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
# ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/blastz.monDom2.2005-10-07
    cd /cluster/data/danRer3/bed
    ln -s blastz.monDom2.2005-10-07 blastz.monDom2
    # create a 2 bit for danRer3 with all chroms (1-25 and M) and the
    # scaffolds for NA and Un.
    cd /cluster/data/danRer3
    faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
               Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
    ssh hgwdev
    mkdir -p /san/sanvol1/scratch/danRer3/
    mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
       /san/sanvol1/scratch/danRer3/
    # make output and run directories
    mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
    mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
    cd /cluster/data/danRer3/bed/blastz.monDom2
    ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
    ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
    cd chromsAndScafsRun
    cat << '_EOF_' > DEF
# zebrafish (danRer3) vs opossum (monDom2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
                                                                                
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
                                                                                
# TARGET - zebrafish (danRer3) soft-masked chroms 1-25 and chrM, and
# scaffolds for NA and Un
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Opossum (monDom2)
# soft-masked sequence in scaffolds
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
                                                                                
BASE=/san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
                                                                                
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
                                                                                
#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF
    cp /cluster/data/danRer3/chrom.sizes S1.len
    twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
               chromsUnNAScafs.sizes
    cp /cluster/data/monDom2/chrom.sizes S2.len
    # now do the run
  nice /cluster/bin/scripts/doBlastzChainNet.pl \
  -bigClusterHub=pk \
  -smallClusterHub=pk \
  -workhorse=pk \
  -fileServer=kolossus \
  -stop cat \
  -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
  -chainMinScore=5000 \
  `pwd`/DEF >& do.log &
    # chromsAndScafs PID 19811      Start: Fri Oct  7 15:16
    # Friday Oct 14th 10:30 - 
# Checking finished jobs
# crashed: 3271
# ranOk: 90399
# failed 4 times: 3271
# total jobs in batch: 93670
# more than 9000 crashed on one machine: kkr10u19.kilokluster.ucsc.edu
# so remove this machine.
# run again with para push -retries=20

    # still 7 jobs crashed so repush again with para push -retries=20
    # Now try using the SEQ1_LIMIT option in the DEF file to limit the 
    # number of sequences in a partition file to 30. Before, there would 
    # be a lot of small sequences in a partition file that would take a long
    # time to run.
    # finished around 21:40 Fri Oct 14 Took about 7 days, maybe a little less 
    # as a number of jobs crashed last night.
    # carry on from the cat step to the end
    ssh pk
    cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun/run.blastz
    para time > run.time
# para time
# Completed: 93670 of 93670 jobs
# CPU time in finished jobs:   55738486s  928974.77m 15482.91h  645.12d  1.767 y
# IO & Wait Time:               1276213s   21270.22m   354.50h   14.77d  0.040 y
# Average job time:                 609s      10.14m     0.17h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1470s      24.50m     0.41h    0.02d
# Submission to last job:        627367s   10456.12m   174.27h    7.26d

    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
  nice /cluster/bin/scripts/doBlastzChainNet.pl \
  -bigClusterHub=pk \
  -smallClusterHub=pk \
  -workhorse=pk \
  -fileServer=kolossus \
  -continue cat \
  -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
  -chainMinScore=5000 \
  `pwd`/DEF >& doCatChainNet.log &
  # Took 13 minutes to cat then chain. It had 70 jobs crash at the chaining
  # step. These are empty files - when axtChain opens them using 
  # pslxFileOpenWithMeta (in psl.c) it aborts as the file is empty apart from
  # meta data and therefore not psLayout format. Ignore these crashed jobs
  # for now and then modify psl.c so it will skip over these empty files.
  # Next, the script crashed on the chainMergeSort step
  # since there are too many chains due to opossum being scaffold-based. 
  # chainMergeSort opens all the files at once.
  # Added a flag to the DEF file to show if an assembly is scaffold-based:
  # GENOME_IN_SCAFFOLDS=1 
  # and then modify doBlastzChainNet.pl so that if it sees this flag, then
  # chains are merged into one file then run chainSort to sort the file 
  # and then chainMergeSort to change the IDs so they are unique.
  # chainMergeSort assumes that the input files are sorted already.
  nice ./doBlastzChainNet.pl \
  -bigClusterHub=pk \
  -smallClusterHub=pk \
  -workhorse=pk \
  -fileServer=kolossus \
  -continue chainMerge \
  -chainMinScore=5000 \
  `pwd`/DEF >& doChainMergeNet.log &
  # Start: Tue Oct 18 12:55 Finished: 15:02
# add trackDb.ra entries for monDom2 chain and net tracks and add html for
# these tracks too. Modified html pages to describe the process using 
# scaffolds for chrUn and chrNA for danRer3.
# Modify the downloads README.txt to include a description of the process
# of running blastz with scaffolds for the chrUn and chrNA unordered chroms.
# Finally run the swap for this to get danRer3 chains and net tracks 
# on monDom2 - see makeMonDom2.doc. 
    # Move the run directory files to /cluster/data and remake download links
    # as the san is not a permanent storage space (hartera, 2005-11-17)
    ssh hgwdev 
    # chromsAndScafsRun in blastz.monDom2 is the symlink to the san run
    # directory made when the run was set up. Remove the link first, as mv
    # will not replace a symlink (a non-directory) with a directory.
    rm /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
    mv /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun \
       /cluster/data/danRer3/bed/blastz.monDom2/
    # then change the symlinks in the downloads directory to point to the files
    # on /cluster/data
    cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMonDom2/axtNet
    set runDir=/cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
    rm *.gz
    foreach f ($runDir/axtNet/*.axt.gz)
      ln -s $f .
    end
    cd ..
    rm *.gz
    foreach f ($runDir/axtChain/*.gz)
      ln -s $f
    end
    # remake the md5sum file
    rm md5sum.txt
    md5sum *.gz */*.gz > md5sum.txt
   
# RADIATION HYBRID (RH) MAP TRACK (DONE, 2005-09-06, hartera)
    # Data from Leonard Zon's lab at the Childrens Hospital, Boston
    # Provided by Anhua Song: asong@enders.tch.harvard.edu
    # Updated data provided on 2006-02-23
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    # download data from e-mail to this directory
    # new sequences (2006-02-23) are available
    unzip rhSequenceSubmit022306.zip
    # sequences are in rhSequenceSubmit022306/rhSequenceSubmitSeq022306.txt
    # primer information is in rhSequenceSubmit022306/rhSequenceSubmit022306.txt
    # Work inside the unzipped directory: the files renamed here live there,
    # and the blat run later copies rhMap.fa from rhSequenceSubmit022306/.
    cd rhSequenceSubmit022306
    mv rhSequenceSubmitSeq022306.txt rhMap022306.fa
    mv rhSequenceSubmit022306.txt rhMapPrimers022306.txt
    # first remove ^M from end of lines
    dos2unix rhMap022306.fa
    dos2unix rhMapPrimers022306.txt
    grep '>' rhMap022306.fa | wc -l
    # 11514
    wc -l rhMapPrimers022306.txt
    # 13438 rhMapPrimers022306.txt
    grep '>' rhMap022306.fa > rhMap.names
   
    # remove '>' from names and grab first field
    perl -pi.bak -e 's/>//' rhMap.names
    awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
        > rhMap.namesOnly.sort
    awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq \
        > rhMapPrimers.namesOnly.sort
    wc -l *.sort
    # 11514 rhMap.namesOnly.sort
    # 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
    # There are no replicates this time for rhMap sequences but there are for
    # the primers set:
    awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq -c \
        | sort -nr > rhMapPrimers.names.count
    # These replicates are blank lines so there are no replicates
    # Total 11514 sequences in rhMap, but 13436 primer sets

    # 11527 rhMap.namesOnly.sort
    # 13436 rhMapPrimers.namesOnly.sort

    # get a list of headers from the FASTA file
    grep '>' rhMap022306.fa > rhMap.headers
    awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
    # BAC_END
    # EST
    # GENE
    # SSLP
    # STS
    # 5 types of sequence
    awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
    # BACends
    # Custom
    # Insertion_Mutant
    # Insertion_Mutants
    # MGH
    # NCBI
    # Sanger SG
    # Sequencing_Project
    # ThisseClone
    # Thisse_Clone
    # other_zfEst
    # wu_zfEst
    # wz
    # Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
    # So there are 11 different sources.
    awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
    # CHBG
    # MPIEB
    
    # There are 2 sequences with problem primers. E-mailed Peter Song about
    # these and he suggested deleting those primers:
    # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
    # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A| 
    # edit rhMap022306.fa and rhMapPrimers022306.txt and delete these primers.
    # need to reformat FASTA headers so they are in the format: 
    # NAME.SOURCE.TYPE.ORIGIN
    # Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
    # so change these to have the same name. Also shorten Sanger SG to Shotgun.

    # Apply all the source renames in a single pass so that
    # rhMap022306.fa.bak keeps the unedited file (separate -pi.bak runs each
    # overwrite the previous backup).
    # The optional "s" renames Insertion_Mutant and Insertion_Mutants alike;
    # replacing Insertion_Mutant on its own first would turn
    # Insertion_Mutants into "InsertMuts".
    perl -pi.bak \
         -e 's/Insertion_Mutants?/InsertMut/;' \
         -e 's/Sanger SG/Shotgun/;' \
         -e 's/ThisseClone/Thisse/;' \
         -e 's/Thisse_Clone/Thisse/;' \
         -e 's/Sequencing_Project/Seqproj/;' \
         rhMap022306.fa
   
    # use a script to reformat the names for the FASTA headers to the format 
    # >NAME.SOURCE where name is the first field separated by "|" and source
    # is the 9th field. The source is used to make the name unique. Some
    # of these names are BAC ends that occur in the BAC ends track so there
    # are name clashes in the seq table if the names are not made unique.
    # Also make the name upper case as for those for the danRer1 and danRer2
    # RH map. 
cat << '_EOF_' > rhFix
#!/usr/bin/awk -f 

#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    split(toupper($0), a, "\\|");
    print a[1]"."a[9];
    next;
}

/^[0-9]+ / {
    $0 = $2;
}

{
    print $0;
}

'_EOF_'
# << keep emacs coloring happy
    chmod +x rhFix
    # run via ./ so the script just written here is the one used, even when
    # "." is not in the path
    ./rhFix rhMap022306.fa > rhMap.fa
    # Blat sequences vs danRer3 genome
    ssh pk
    mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
    # make output directory
    mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
    ln -s /san/sanvol1/scratch/danRer3/rhMap/psl .
    # copy input to the san
    cp \
  /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \
    /san/sanvol1/scratch/danRer3/rhMap/
    # do the blat run to align RH map sequences to danRer3 and do separate
    # runs for chroms and scaffolds from chrUn and chrNA
    ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst
    ls -1S /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/chr[0-9M]*.fa \
          > genome.lst
    # use the individual scaffolds for chrUn and chrNA alignments
    foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/Zv5_*.fa)
        ls -1S $f >> genome.lst
    end
    wc -l genome.lst
    # 15149 genome.lst
    cp -p /cluster/data/danRer3/bed/ooc/danRer3_10.ooc \
          /san/sanvol1/scratch/danRer3
# try same parameters as for BAC ends
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc {check out line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    # gensub2 genome.lst rhmap.lst gsub spec
    gensub2 genome.lst rhMap.lst gsub spec
    para create spec
    para try, check, push, check etc.
# para time
# Completed: 15149 of 15149 jobs
# CPU time in finished jobs:      16326s     272.09m     4.53h    0.19d  0.001 y
# IO & Wait Time:                 41360s     689.34m    11.49h    0.48d  0.001 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              61s       1.02m     0.02h    0.00d
# Submission to last job:           263s       4.38m     0.07h    0.00d
    
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
    # Make & check the psl table
    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create rhmap.psl
    pslSort dirs raw.psl tmp psl
    pslReps -nearTop=0.0001 -minAli=0.80 -minCover=0.20 raw.psl \
            contig.psl /dev/null
    # There are 11514 sequences in total in rhMap.fa
    # Experimented with different parameters:
    # little difference if STS markers BLAT parameters were used 
    # i.e. -ooc=11.ooc and -stepSize=5.
    # For Blat parameters used above (-ooc=10.ooc and -tileSize=10), try
    # different pslReps parameters using minCover=0.40 and nearTop=0.0001:
    # minAli=0.96, 83%, most aligned sequence has 11 alignments.
    # minAli=0.90, 88% align, most aligned seq has 11 alignments
    # minAli=0.80, 88%, 10120 sequences aligned. 
    # at minAli=0.50, there are still 10120 sequences aligned so those that
    # are not aligning must have very low sequence identity. Took a look at 
    # some that are not aligning e.g. 2217C, 2791C and these are not passing
    # the minCover=0.40 criterion. Some sequences have Ns in them too
    # e.g. ZC92E13.YBF so has a lot of short alignments that do not pass
    # the minCover parameter. Lowering minCover increases the number of 
    # sequences aligned:
    # minAli=0.80, minCover=0.20, there are 10850 (94%) of sequences aligned. 
    # minAli=0.90, minCover=0.20, there are 10837 (94%) of sequences aligned
    # with 21 fewer alignments than for minAli=0.80. 
    # Most alignments for one sequence is 99, second most is 11. There are 
    # about 1851 sequences with more than 1 alignment (many of these 
    # have 2 alignments) while for minAli=0.80 and minCover=0.40, there were
    # 1266 sequences with more than 1 alignment. With lower minCover, more
    # sequences align, but there are more sequences with higher numbers of
    # multiple alignments. At minCover=0.0, there is 1 sequence with 1353
    # alignments, the second largest number of alignments for 1 sequence
    # is 532, then 329 etc. So use minAli=0.80 and minCover=0.20 to get the
    # most sequences aligned without having sequences aligning too many times. 
    # at minAli=0.80 and minCov=0.20, there are 10850 sequences aligned (94%). 
    # 88% of sequences were aligned for danRer2.
    # merge together liftAll and scaffolds lift then lift psl to chrom level.
    # Build the combined lift file: scaffold-to-chrom lifts for chrNA/chrUn
    # followed by liftAll.lft.  Each "\" must be the last character on its
    # line or the continuation is lost.
    cat /cluster/data/danRer3/liftSuperToChrom/liftNAandUnScaffoldsToChrom.lft \
        /cluster/data/danRer3/jkStuff/liftAll.lft \
        > /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
    # Lift contig.psl to rhMap.psl using the combined lift file.
    liftUp rhMap.psl \
           /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft \
           warn contig.psl
    # Got 30168 lifts
    pslCheck rhMap.psl
    # psl is ok
    # Load sequence alignments into database.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
    # drop old table and reload (hartera, 2006-03-26)
    echo "drop table rhMap;" | hgsql danRer3
    hgLoadPsl danRer3 rhMap.psl
    # cleanup
    rm -r /san/sanvol1/scratch/danRer3/rhMap/psl 
    rm psl para.results batch batch.bak spec
    rm -r err  
    gzip *.psl
    # Copy sequences to gbdb if they are not already there.
    mkdir -p /gbdb/danRer3/rhMap
    ln -s \
      /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \
      /gbdb/danRer3/rhMap/rhMap022306.fa 
    # then add sequences to database:
    # reloaded (hartera, 2006-03-26)
    hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap022306.fa
    # Note: first time these sequences were loaded there was a problem 
    # 2215 are not loaded into database, these all
    # have names with extensions like .YB, .YC etc. so remove from extFile
    # and seq. Sequences with the same IDs are already in the seq table
    # for the BAC ends tracks so need to make these RH map names unique.
    hgsql -e 'delete from seq where extFile = 736113;' danRer3
    hgsql -e 'delete from extFile where id = 736113;' danRer3
    hgsql -e 'update history set errata = "Removed sequences. Error so not all asequences loaded." where ix = 23;' danRer3 
    
    # Check that all the headers from rhMap.headers are also in the primers
    # file which seems to contain the same headers from the FASTA file
    # as well as additional markers.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
    perl -pi.bak -e 's/>//' rhMap.headers
    sort rhMap.headers > rhMap.headers.sort
    sort rhMapPrimers022306.txt > rhMapPrimers.sort
    wc -l *.sort
    # 11514 rhMap.headers.sort
    # 13437 rhMapPrimers.sort
    comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
    # 11514 in common
    # so all FASTA headers from rhMap022306.fa are in the primers file
    # Get headers again from rhMap.fa file as the names of the sources have
    # been changed. Parse out information from headers to add to an rhMapInfo
    # table so that this information can be displayed on the details page for
    # the RH map markers.
    # Fields: 1 - name, 2 - linkage group (chrom), 3 - position number on the 
    # RH map for that linkage group, 4 - distance (in cR) from the 
    # top of a linkage group, 4 - position number in entire RH map (ordered 
    # from LG1 to LG25, 5 - type of marker (SSLP, BAC_END, EST, GENE, STS),
    # 9 - source, 10 - institute that mapped the marker, 11 - 5' forward primer,
    # 12 - 3' reverse primer.
    # Sort headers by linkage group and by position
    grep '>' rhMap022306.fa > rhMap.headers2
    # then use the rhMap.headers2 file to extract the marker information
    # and to reformat the names for the FASTA headers to the format 
    # >NAME.SOURCE where name is the first field separated by "|" and source
    # is the 9th field so that names in the rhMap and rhMapInfo tables are 
    # the same. The source is used to make the name unique. 
cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f 

# Turn each FASTA header into one tab-separated rhMapInfo row.  Example header:
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    # drop the first ">", then upper-case the line and split it on "|"
    sub(/>/, "")
    split(toupper($0), col, "\\|")
    # columns written: NAME.SOURCE, "LG" + field 2, then fields 3, 4, 5,
    # 9, 10, 11 and 12
    printf "%s.%s\tLG%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
        col[1], col[9], col[2], col[3], col[4], col[5],
        col[9], col[10], col[11], col[12]
    next
}
'_EOF_'
# << keep emacs coloring happy
    chmod +x getRhInfo
    getRhInfo rhMap.headers2 > rhMapInfo.tab
    # Sort headers by linkage group (LG) and by position
    sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
    wc -l rhMapInfoSorted.tab
    # 11514 rhMapInfoSorted.tab
 
    ssh hgwdev 
    # Create a table with RH map item information including type, source,
    # origin and primer sequences.
    # Write the autoSql definition of the rhMapInfo table into the kent
    # source tree.  The columns are in the same order as the columns of
    # rhMapInfoSorted.tab, which is loaded into this table below.
    cat << 'EOF' > ~/kent/src/hg/lib/rhMapInfo.as
table rhMapInfo
"Radiation Hybrid map information"
(
string name;		"Name of Radiation Hybrid (RH) map marker"
string linkageGp;	"Linkage group to which the marker was mapped"
uint position;  	"Position number in RH map for this linkage group"
uint distance;  	"Distance from the top of linkage group (cR)"
string markerType;      "Type of marker"
string source;    	"Source of marker"
string mapSite;   	"Institution that mapped the marker"
string leftPrimer; 	"Forward primer sequence"
string rightPrimer; 	"Reverse primer sequence"
)
'EOF'
# << happy emacs
    # create .sql, .c and .h files using autoSql.  Run it from the directory
    # that holds rhMapInfo.as so that the relative file name and the
    # ../inc destination below resolve.
    cd ~/kent/src/hg/lib
    autoSql rhMapInfo.as rhMapInfo
    mv rhMapInfo.h ../inc
    # rhMapInfo.sql - name is the primary key
    # commit rhMapInfo.as, .sql, .c and .h files to CVS.   
    # create and load table (Reloaded: hartera, 2006-03-26)
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306 
    echo "drop table rhMapInfo;" | hgsql danRer3
    hgsql danRer3 < ~/kent/src/hg/lib/rhMapInfo.sql
    hgsql -e \
    'load data local infile "rhMapInfoSorted.tab" into table rhMapInfo' danRer3
    
    # edit danRer3/trackDb.ra to add rhMap track and the search spec.  
    # add and edit rhMap.html to describe the info data.
    # edit ~/kent/src/hg/hgc/hgc.c so that the rhMapInfo data is displayed 
    # on the details page for each marker - edit doRHmap function.
    # Add a rule to all.joiner to check that all names in rhMap also appear 
    # in rhMapInfo.
    # commit these to CVS.
    # Changed termRegex for  rhMap search in trackDb.ra so that it works 
    # for all IDs. (2006-04-19, hartera)

# SELF BLASTZ, CHAIN, NET, AXTNET, MAFNET AND DOWNLOADS
# (DONE, 2005-12-02, hartera)
    ssh pk
    mkdir -p /cluster/data/danRer3/bed/blastzSelf.2005-11-30
    cd /cluster/data/danRer3/bed
    ln -s blastzSelf.2005-11-30 blastzSelf
    cd /cluster/data/danRer3/bed/blastzSelf
    # make run directory on the san
    mkdir -p /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
    ln -s /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
    # make 2 bit file of chr1-25 and chrM
    cd /cluster/data/danRer3
    faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
        /san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
    cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
    twoBitInfo /san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit S1.len
    cp S1.len S2.len
    cat << '_EOF_' > DEF
# zebrafish vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
                                                                                
BLASTZ=blastz.v7.x86_64
BLASTZ_L=5000
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0
                                                                                
# TARGET: Zebrafish danRer3
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
SEQ1_IN_CONTIGS=0
SEQ1_LIMIT=30
SEQ1_CHUNK=500000
SEQ1_LAP=5000
                                                                                
# QUERY: Zebrafish danRer3
SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
SEQ2_SELF=1
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1800000000
SEQ2_LAP=0

BASE=/san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
                                                                                
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
'_EOF_'
    chmod +x DEF
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
         -bigClusterHub=pk \
         -smallClusterHub=pk \
         -workhorse=pk \
         -fileServer=kolossus \
         -chainMinScore=5000 \
         -chainLinearGap=medium \
         `pwd`/DEF >& do.log &
    # Start: Wed Nov 30 17:07 Finish: Thur Dec  1 06:51
    # Crashed at downloads step as these exist from previous run so remove
    rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf 
# para time (blastz)
# Completed: 2425 of 2425 jobs
# CPU time in finished jobs:    4783120s   79718.66m  1328.64h   55.36d  0.152 y
# IO & Wait Time:                108014s    1800.24m    30.00h    1.25d  0.003 y
# Average job time:                2017s      33.62m     0.56h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            2762s      46.03m     0.77h    0.03d
# Submission to last job:         14993s     249.88m     4.16h    0.17d

# para time (axtChain)
# Completed: 26 of 26 jobs
# CPU time in finished jobs:      96405s    1606.74m    26.78h    1.12d  0.003 y
# IO & Wait Time:                   731s      12.19m     0.20h    0.01d  0.000 y
# Average job time:                3736s      62.27m     1.04h    0.04d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            7405s     123.42m     2.06h    0.09d
# Submission to last job:          7411s     123.52m     2.06h    0.09d

    # Carry on from downloads step.
    cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
    nice /cluster/bin/scripts/doBlastzChainNet.pl \
         -bigClusterHub=pk \
         -smallClusterHub=pk \
         -workhorse=pk \
         -fileServer=kolossus \
         -continue download \
         -chainMinScore=5000 \
         -chainLinearGap=medium \
         `pwd`/DEF >& doDownloads.log &
    # Took 2 minutes. 
# check trackDb entry exists. Put html at danRer3 level of trackDb and edit
# these and the downloads README to state that chrNA and chrUn were not 
# aligned for this track.
    # Remove extra downloads made by script:
    # Only chain track is pushed to the RR so remove the net and axtNet 
    # downloads, re-make md5sum.txt and edit README.txt accordingly.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf
    rm danRer3.danRer3.net.gz md5sum.txt
    rm -r axtNet
    md5sum *.gz > md5sum.txt
    
# Original run with loose linear gap matrix and scaffolds for chrNA and chrUn
# done 2005-10-26.
# filtering chains from above on minScore 10,000. done 2005-11-18
# Using the medium linear gap matrix for axtChain. minScore=5,000. 
# done 2005-11-30.
# chainSelf - loose linearGap matrix, filtered minScore=5000
# chainSelfFilt10k - loose linearGap matrix, filtered minScore=10000
# chainSelfMedGap - medium linearGap matrix, filtered minScore=5000
# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment
# refGene:cds 0.743%, chainSelfLink 65.056%, both 0.560%, cover 75.29%, 
# enrich 1.16x

# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfFilt10kLink -enrichment
# refGene:cds 0.743%, chainSelfFilt10kLink 64.019%, both 0.554%, cover 74.54%, 
# enrich 1.16x
# number of rows in tables for chr1:

# chainSelf          	941416
# chainSelfFilt10k	530292
# chainSelfMedGap	997525			
# chainSelfLink		9110071
# chainSelfFilt10kLink	7226815
# chainSelfMedGapLink	9149100

# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfMedGapLink -enrichment
# refGene:cds 0.743%, chainSelfMedGapLink 64.525%, both 0.549%, cover 73.80%, 
# enrich 1.14x

# so the medium linearGap matrix increases the number of chains by about 5% 
# but coverage is little different.
# for the chains filtered with  minScore=10000
# 12192577 chains out of 17592225 do not have chrNA or chrUn as query or 
# target which is about 69%. 
# 12192577 out of 12807964 do not have chrNA or chrUn as the query for just
# chr1-25 and chrM which is about 95%.
# so make the chains without chrNA and chrUn and using the medium linearGap
# matrix which is for species that are not so distant.
# 2005-12-02
# medium linearGap matrix for axtChain, minScore=5000 and no chrNA or chrUn.
# number of rows in tables for chr1:
# chainSelf 	943482
# chainSelfLink 8707208
# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment
# refGene:cds 0.743%, chainSelfLink 60.876%, both 0.503%, cover 67.65%, 
# enrich 1.1
# coverage dropped about 8% without chrNA and chrUn alignments so not a 
# huge difference.

# BLASTZ SWAP FOR HUMAN (hg18) (DONE, 2005-12-24, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
    ssh hgwdev
    # Blastz requires lineage-specific repeats
    # Treat all repeats as lineage-specific for all alignments except those
    # involving danRer3 chrUn and chrNA where the dynamic masking 
    # functionality of Blastz was used. hg18 random chroms were aligned
    # as contigs and danRer3 chrNA and chrUn were aligned as scaffolds -
    # see zebrafish (danRer3) chain and net track section in makeHg18.doc
    # for further details. 

    # do swap of hg18 vs. danRer3 chain and net alignments to 
    # create danRer3 vs. hg18 see makeHg18.doc for details.
    cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun
    # edit DEF file and add location of danRer3 and hg18 lineage-specific
    # repeats - move chrUn and chrNA lineage-specific repeats into a tmp
    # directory as they were not used.
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
        -bigClusterHub=pk -swap -chainMinScore=5000 \
        -chainLinearGap loose `pwd`/DEF >& doSwap.log &
    # Took about 27 minutes.
    # Blastz parameters are as for hg18 vs. danRer3 - see makeHg18.doc
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=1
  # make html files and trackDb.ra entry for chain and net tracks.
  # check README.txt for downloads.
# featureBits -chrom=chr2 danRer3 refGene:cds chainHg18Link -enrichment 
# refGene:cds 0.767%, chainHg18Link 4.370%, both 0.607%, cover 79.15%,
# enrich 18.11x
# featureBits -chrom=chr2 danRer2 refGene:cds chainHg17Link -enrichment 
# refGene:cds 0.769%, chainHg17Link 4.576%, both 0.605%, cover 78.69%,
# enrich 17.20x
# Similar coverage and enrichment as for danRer2 vs hg17 but there are less
# chains: 7057 for hg18 on danRer3, 1111 for hg17 on danRer2 (chr1).

# 5-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2006-02-06, hartera)
# MAF ANNOTATION ADDED (DONE, 2006-02-6, braney)
# FINISHED MAKING TREE IMAGE FOR TRACK DESCRIPTION PAGE 
# (DONE, 2006-02-07, hartera)
# Species: zebrafish(danRer3), human (hg18), mouse(mm7), 
# fugu(fr1) and tetraodon(tetNig1)
# Opossum (monDom2) was dropped since there were many more alignments
# for monDom2 than monDom1 and the chains were shorter on average. The
# reason for this is unknown so they will not be included in the 
# conservation track at this time.
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)


    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/multiz5way
    cd /cluster/data/danRer3/bed/multiz5way
    mkdir mafLinks
    # set up directories for links to mafs for each pairwise alignment
    mkdir mafLinks/hg18
    mkdir mafLinks/mm7
    mkdir mafLinks/fr1
    mkdir mafLinks/tetNig1
  
    set dir=/cluster/data/danRer3/bed
    # need to make links to all the mafNet files for pairwise blastz 
    # alignments for each species. Make sure files are all called chrN.maf.gz
    ln -s $dir/blastz.hg18.swap/mafNet/*.maf.gz ./mafLinks/hg18
    ln -s $dir/blastz.mm7.swap/mafNet/*.maf.gz ./mafLinks/mm7
    ln -s $dir/blastz.fr1/mafNet/*.maf.gz ./mafLinks/fr1
    ln -s $dir/blastz.tetNig1.2005-10-11/chromsAndScafsRun/mafNet/*.maf.gz \
          ./mafLinks/tetNig1
    # copy files over to the san for the pitakluster cluster run
    ssh pk
    mkdir /san/sanvol1/scratch/danRer3/multiz5way
    cd /san/sanvol1/scratch/danRer3/multiz5way
    rsync -a --copy-links --progress \
          /cluster/data/danRer3/bed/multiz5way/mafLinks/ .
    # 277 Mb of data - took less than 1 minute 
    mkdir penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn

#       Progressive alignment up the tree w/o stager,
#       using multiz.v10 (var_multiz)
#       Method: align internal subtrees (using 0 flag to var_multiz)
#               Then, align these to human (using 1 flag to var_multiz)
#       NOTE: must use maf_project after each multiz run, in order
#       to order output.  Single-cov guaranteed by use of net MAF's,
#       so it is not necessary to run single_cov2.

    # make output dir and run dir

    cd /cluster/data/danRer3/bed/multiz5way
    mkdir -p maf
    mkdir -p run
    cd run

    # create scripts to run var_multiz on cluster

cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
# oneMultiz.csh - one step of the progressive multiz alignment for one chrom.
#
# usage:
#   oneMultiz.csh <chrom>                     remove the per-chrom work dir
#   oneMultiz.csh <chrom> <subdir> <outFile>  copy <subdir>/<chrom>.maf out
#   oneMultiz.csh <chrom> <s1> <s2> <flag>    align s1 and s2 with multiz
#
# <s1> and <s2> each name either a directory under $pairs (gzipped maf
# input) or a directory under $multi written by an earlier step (plain maf
# input).  The result is written to $multi/<s1><s2>/<chrom>.maf.
    set c = $1
    set db = danRer3
    set multi = /scratch/tmp/$db/multiz5way.$c
    set pairs = /san/sanvol1/scratch/$db/multiz5way
    set penn = $pairs/penn

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
        echo "cleanup"
        echo "rm -fr $multi"
        rm -fr $multi
        echo "rmdir --ignore-fail-on-non-empty /scratch/tmp/$db"
        rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
        exit
    endif

    # special mode --
    # with 3 args, saves an alignment file
    if ($#argv == 3) then
        echo "cp $multi/$2/$c.maf $3"
        ls -og $multi/$2/$c.maf
        cp $multi/$2/$c.maf $3
        exit
    endif

    set s1 = $2
    set s2 = $3
    set flag = $4

    # locate input files -- in pairwise dir, or multiple dir
    # each input is staged as an uncompressed copy in /tmp
    set d1 = $multi
    set d2 = $multi
    if (-d $pairs/$s1) then
        set d1 = $pairs
        set f1 = $d1/$s1/$c.maf.gz
        set t1 = /tmp/$s1.$c.maf
        zcat $f1 > $t1
    else
        set f1 = $d1/$s1/$c.maf
        set t1 = /tmp/$s1.$c.maf
        cp -p $f1 $t1
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
        set f2 = $d2/$s2/$c.maf.gz
        set t2 = /tmp/$s2.$c.maf
        zcat $f2 > $t2
    else
        set f2 = $d2/$s2/$c.maf
        set t2 = /tmp/$s2.$c.maf
        cp -p $f2 $t2
    endif
    # write to output dir
    set out = $multi/${s1}${s2}
    mkdir -p $out

    # check for empty input file
    if (-s $t1 && -s $t2) then
        echo "Aligning $f1 $f2 $flag"
        $penn/multiz $t1 $t2 $flag $out/$c.unused1.maf \
                $out/$c.unused2.maf > $out/$c.full.maf
        cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                $out/$c.tmp.maf
        echo "Ordering $c.maf"
        $penn/maf_project $out/$c.tmp.maf $db.$c > $out/$c.maf
    else if (-s $t1) then
        # only the first input has data: it becomes the result unchanged
        cp -p $t1 $out/$c.maf
    else if (-s $t2) then
        # only the second input has data: it becomes the result unchanged
        cp -p $t2 $out/$c.maf
    endif
    # always remove both staged copies, including empty ones, so that
    # nothing is left behind in /tmp
    rm -f $t1 $t2
'EOF'
# << keep emacs coloring happy
    chmod +x oneMultiz.csh
    cp -p oneMultiz.csh \
         /san/sanvol1/scratch/danRer3/multiz5way/penn/oneMultiz.csh
    # Create 5way.nh file of tree. This was used in the distant past for 
    # early versions of phastCons.  Now, this is merely a convenient 
    # reference to the tree under construction.  This is also used to draw 
    # a graphic tree as species5.nh, see below.
    # The parentheses must balance: two subtrees, (hg18,mm7) and
    # ((tetNig1,fr1),danRer3), wrapped in one outer pair.

    cat << '_EOF_' > /cluster/data/danRer3/bed/multiz5way/5way.nh
((hg18,mm7),((tetNig1,fr1),danRer3))
'_EOF_'
    # << this line keeps emacs coloring happy
    #   using the tree diagram as above, arrange these alignments
    #   in order of the tree branches
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# Run every oneMultiz.csh step for one chromosome.
# $1 is the chromosome name; the job template passes one entry of chrom.lst.
    # multiple alignment steps:
set c = $1
set db = danRer3
set s = "/san/sanvol1/scratch/$db/multiz5way/penn/oneMultiz.csh"

# 4-argument form (chrom, input 1, input 2, multiz flag): oneMultiz.csh
# writes the result into a work-dir subdirectory named by concatenating the
# two input names, so the first two steps produce hg18mm7 and tetNig1fr1.
$s $c hg18 mm7 0
$s $c tetNig1 fr1 1
# combine the two intermediate results into tetNig1fr1hg18mm7
$s $c tetNig1fr1 hg18mm7 1
# get final alignment file
# (3-argument form: copies the named intermediate result to the given path)
$s $c tetNig1fr1hg18mm7 /cluster/data/$db/bed/multiz5way/maf/$c.maf
#cleanup
# (1-argument form: removes the per-chrom work directory)
$s $c
'EOF'
# happy emacs
    chmod +x allMultiz.csh

cat  << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer3/bed/multiz5way/maf/$(root1).maf}
#ENDLOOP
'EOF'

    awk '{print $1}' ../../../chrom.sizes > chrom.lst
    
    gensub2 chrom.lst single template jobList
    para create jobList
    para try, para check, para push, para check ... etc
    para time
# Completed: 28 of 28 jobs
#CPU time in finished jobs:       3546s      59.10m     0.98h    0.04d  0.000 y
# IO & Wait Time:                   115s       1.92m     0.03h    0.00d  0.000 y
# Average job time:                 131s       2.18m     0.04h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             553s       9.22m     0.15h    0.01d
# Submission to last job:           709s      11.82m     0.20h    0.01d

    # do not filter mafs as only removes a small fraction of alignments
    # better to keep them all. check for single column alignments (these
    # just have a single base for each species in the alignment). There
    # should be none of these now. Previously had to do a glueing step to 
    # deal with these. There are none here.

# Build maf annotation and load database (braney, 2006-02-06)
# NOTE: this subsection uses sh/bash syntax (for ... do ... done).
cd /cluster/data/danRer3/bed/multiz5way
mkdir anno
cd anno
# List the assemblies in the alignment: take column 2 of the "s" lines of
# chr1.maf and strip everything from the first "." onwards.
# The mafs are in multiz5way/maf, which is ../maf relative to anno.
awk '/^s/ {print $2}' ../maf/chr1.maf | sed 's/\..*$//' | sort -u > species.names
mkdir maf run
cd run
# start from empty list files; -f so a first run with no old files succeeds
rm -f sizes nBeds
# species.names was written one level up, in anno
for i in $(cat ../species.names)
do
    ln -s "/cluster/data/$i/chrom.sizes" "$i.len"
    ln -s "/cluster/data/$i/$i.N.bed" "$i.bed"
    echo "$i.bed" >> nBeds
    echo "$i.len" >> sizes
done

# write one mafAddIRows command per chrom maf; each output goes to anno/maf
for i in ../../maf/*.maf
do
    echo mafAddIRows -nBeds=nBeds -sizes=sizes "$i" /cluster/data/danRer3/danRer3.2bit "../maf/$(basename "$i")"
done > jobs
sh -x jobs

ssh hgwdev

# The annotated mafs were written to anno/maf (created with "mkdir maf run"
# above), so load the summary table from that directory.
cd /cluster/data/danRer3/bed/multiz5way/anno/maf
cat *.maf | hgLoadMafSummary danRer3 multiz5way stdin

# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_2"
hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_3"

mkdir /gbdb/danRer3/multiz5way
for i in *.maf
do
    ln -s `pwd`/$i /gbdb/danRer3/multiz5way
done
hgLoadMaf danRer3 multiz5way
rm *.tab

cd /cluster/data/danRer3/bed/multiz5way
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .

#edit Makefile to correct species names 

mkdir -p /san/sanvol1/scratch/danRer3/multiz5wayFrames/maf
# The current directory is multiz5way/frames, so the chrom mafs are in ../maf.
# Copy each one under its base name; using the relative path as the
# destination name would place the copy outside multiz5wayFrames/maf.
# NOTE(review): assumes the plain multiz5way/maf files are the intended
# input, matching the ../maf path used for the later frames rebuild - confirm.
for i in ../maf/*.maf; do echo "$i"; cp "$i" "/san/sanvol1/scratch/danRer3/multiz5wayFrames/maf/$(basename "$i")"; done

make getGenes
make getFrames
make loadDb

###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore02
cd /cluster/data/danRer3/bed/multiz5way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(zcat  ../maf/*.maf.gz | time genePredToMafFrames danRer3 stdin stdout danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz hg18 genes/hg18.gp.gz mm7 genes/mm7.gp.gz tetNig1 genes/tetNig1.gp.gz | gzip >multiz5way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way/frames

hgLoadMafFrames danRer3 multiz5wayFrames multiz5way.mafFrames.gz >&log&
#end of multiz5way annotation and load

    # create tree image - like tree.nh but with common names
    # (hartera, 2006-02-07)
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way
    cat << '_EOF_' > species5.nh
((human,mouse),((tetraodon,fugu),zebrafish))
'_EOF_'
    /cluster/bin/phast/$MACHTYPE/draw_tree -b -s species5.nh > species5.ps
    convert species5.ps 5way.jpg
    # using GIMP, edit tree and remove whitespace
    # Photoshop used to edit the image (kuhn, 2006-02-07)
    cp 5way.jpg /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg 
    # change permissions for display
    chmod +r /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg

# check for all.joiner entry for multiz5way - ok
# add trackDb.ra entry in ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3:
# track multiz5way
# shortLabel 5-Way Conservation
# longLabel 5-Way Vertebrate Multiz Alignment & Conservation
# group compGeno
# priority 104
# visibility pack
# color 0, 10, 100
# altColor 0,90,10
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# yLineOnOff Off
# autoScale Off
# summary multiz5waySummary
# speciesGroups vertebrate mammal
# sGroup_mammal hg18 mm7
# sGroup_vertebrate tetNig1 fr1

# add this line to trackDb entry as above for the tree image (2006-02-07):
# treeImage phylo/danRer3_5way.jpg

# PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 5-WAY ALIGNMENT 
# (DONE, 2006-02-06, hartera)
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/multiz5way/cons
    cd /cluster/data/danRer3/bed/multiz5way/cons
    # create a starting-tree.mod based on chr5 (73Mb - largest chrom)
    # chr5 is the largest chrom apart from NA and Un
    /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr5.maf \
        --refseq ../../../5/chr5.fa --in-format MAF \
        --windows 100000000,1000 --out-format SS \
        --between-blocks 5000 --out-root s1
    # takes about 30 seconds
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
        --tree "((danRer3,(tetNig1,fr1)),(mm7,hg18))" \
        --out-root starting-tree
    # took less than 1 minute
    rm s1.*ss
    # Get genome-wide average GC content (for all species together,
    # not just the reference genome).  If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file.  E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.307629 0.191708 0.192177 0.308486
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    # 0.384 is the GC content from the tree model. NOTE: the --gc argument
    # actually used below is 0.438, the msa_view figure described next.
    # If you do *not* have a global tree model and you do not know your
    # GC content, you can get it directly from the MAFs with a command
    # like:
    # /cluster/bin/phast/$MACHTYPE/msa_view \
    # --aggregate danRer3,tetNig1,fr1,mm7,hg18 -i MAF \ 
    # -S /cluster/data/danRer3/bed/multiz5way/maf/chr*.maf > maf_summary.txt
    # This gives a GC content of 0.438
    # break up the genome-wide MAFs into pieces on the san filesystem
    ssh kkstore02
    mkdir -p /san/sanvol1/scratch/danRer3/cons/ss
    cd /san/sanvol1/scratch/danRer3/cons/ss
    bash
    for C in `awk '{print $1}' /cluster/data/danRer3/chrom.sizes`
    do
      if [ -s /cluster/data/danRer3/bed/multiz5way/maf/${C}.maf ]; then
        mkdir ${C}
        echo msa_split $C
        chrN=${C/chr/}
        /cluster/bin/phast/$MACHTYPE/msa_split \
            /cluster/data/danRer3/bed/multiz5way/maf/${C}.maf \
            --refseq /cluster/data/danRer3/${chrN}/${C}.fa \
            --in-format MAF --windows 1000000,0 --between-blocks 5000 \
            --out-format SS -I 1000 --out-root ${C}/${C}
      fi
    done
    # took about 20 minutes to run
    # Create a random list of 50 1 mb regions (do not use chrNA and chrUn)

    ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
       awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
    
    # Set up parasol directory to calculate trees on these 50 regions
    ssh pk
    mkdir /san/sanvol1/scratch/danRer3/cons/treeRun1
    cd /san/sanvol1/scratch/danRer3/cons/treeRun1
    mkdir tree log
    
    # now set up cluster job to estimate model parameters.  Parameters
    # will be estimated separately for each alignment fragment then
    # will be combined across fragments. Tuning this loop should come 
    # back to here to recalculate. Tuning target-coverage and expected-length.
    # Create little script that calls phastCons with right arguments
# makeTree: run phastCons with --estimate-trees on one alignment fragment.
# Its single argument is one entry of ../randomSs.list, a path relative to
# ../ss.
# The --gc value 0.438 is the msa_view figure quoted in the notes above, not
# the 0.384 computed from starting-tree.mod.
cat > makeTree << '_EOF_'
#!/bin/csh -fe
# C is the directory part of the argument ($1:h); create matching
# subdirectories for the log and tree output, which are named after $1
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/x86_64/phastCons ../ss/$1 \
   /cluster/data/danRer3/bed/multiz5way/cons/starting-tree.mod \
   --gc 0.438 --nrates 1,1 --no-post-probs --ignore-missing \
   --expected-length 12 --target-coverage 0.17 \
   --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #   emacs happy
    chmod a+x makeTree

    # Make sure that the correct GC content is substituted in here. Notice 
    # the target coverage of 0.17. Here we are going to aim 
    # for 65% coverage of coding regions by conserved elements.
    # Create gensub file
    # gensub2 template: one job per entry of ../randomSs.list.  The command
    # must name the script exactly as it was created above (makeTree, with no
    # .csh suffix); ./ runs it from the batch directory.
    cat > template << '_EOF_'
#LOOP
./makeTree $(path1)
#ENDLOOP
'_EOF_'
    #   happy emacs
    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try,check,push,check etc.
# para time
# Completed: 50 of 50 jobs
# CPU time in finished jobs:        714s      11.90m     0.20h    0.01d  0.000 y
# IO & Wait Time:                   132s       2.20m     0.04h    0.00d  0.000 y
# Average job time:                  17s       0.28m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              26s       0.43m     0.01h    0.00d
# Submission to last job:           353s       5.88m     0.10h    0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    # Average the per-window model estimates from the tree run.  The
    # conserved and non-conserved .mod files are listed into separate text
    # files and each list is given to phyloBoot, which writes the averaged
    # model one directory up; the averages are then copied to the cons
    # directory under /cluster/data.
    # NOTE(review): the leading '*' in --read-mods presumably tells phyloBoot
    # to read the model file names from the named list file - confirm.
    ssh kkstore02
    cd /san/sanvol1/scratch/danRer3/cons/treeRun1
    ls tree/chr*/*.cons.mod > cons.txt
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
        --output-average ../ave.cons.mod > cons_summary.txt
    ls tree/chr*/*.noncons.mod > noncons.txt
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
        --output-average ../ave.noncons.mod > noncons_summary.txt
    cd ..
    cp -p ave.*.mod /cluster/data/danRer3/bed/multiz5way/cons
    #   measuring entropy
    #   consEntropy <target coverage> <expected lengths>
    #            ave.cons.mod ave.noncons.mod --NH 9.78
    #   never stops with the --NH argument
    # target entropy should be L_min*H=9.8 bits, (between 9.5 to 10.5 is ok)
    # the expected length that produces this entropy is the one 
    # to use for phastCons.
    /cluster/bin/phast/$MACHTYPE/consEntropy 0.17 12 \
                        ave.cons.mod ave.noncons.mod

# -target-coverage=0.17 -expected-lengths 12
#Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=0.618383 bits/site
# Expected min. length: L_min=17.978234 sites
# Expected max. length: L_max=10.983828 sites
# Phylogenetic information threshold: PIT=L_min*H=11.117434 bits

# then the above steps from creating the treeRun directory onwards were
# repeated with the target coverage and expected lengths parameters set as
# below:

# -target-coverage=0.25 -expected-lengths 12
#Transition parameters:gamma=0.250000, omega=12.000000, mu=0.083333,nu=0.027778
#Relative entropy: H=0.637721 bits/site
#Expected min. length: L_min=15.535855 sites
#Expected max. length: L_max=10.157133 sites
#Phylogenetic information threshold: PIT=L_min*H=9.907536 bits

#### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED ####

# Parameters used for danRer2 6-way conservation track:
# -target-coverage=0.35 -expected-lengths 18
#Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915
# Relative entropy: H=0.592725 bits/site
# Expected min. length: L_min=16.435656 sites
# Expected max. length: L_max=12.564154 sites
# Phylogenetic information threshold: PIT=L_min*H=9.741828 bits

# need to iterate and get the right coverage and parameters
# try running phastCons below with parameters used above and check the 
# coverage of coding regions by the most conserved elements
    # Create cluster dir to do main phastCons run
    ssh pk
    mkdir -p /san/sanvol1/scratch/danRer3/cons/consRun1
    cd /san/sanvol1/scratch/danRer3/cons/consRun1
    mkdir ppRaw bed
    cp -p /san/sanvol1/scratch/danRer3/cons/ave.*.mod .
    # Create script to run phastCons with right parameters
    #   This job is I/O intensive in its output files, thus it is all
    #   working over in /scratch/tmp/
    # doPhast.csh <chrom> <window>: copy ../ss/<chrom>/<window>.ss and the
    # averaged models to /scratch/tmp/<window>, run phastCons there, then move
    # the scores (<window>.pp) to ppRaw/<chrom> and the Viterbi elements
    # (<window>.bed) to bed/<chrom> and remove the scratch copies.
    # mkdir -p is used for the scratch directory so that a job can be re-run
    # on a node where a failed earlier attempt left the directory behind;
    # a plain mkdir would abort the script because of csh -e.
    cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir -p /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
   --expected-length 18 --target-coverage 0.35 --quiet \
        --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # emacs happy
    chmod a+x doPhast.csh

    #   root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
# gensub2 template: one doPhast.csh job per line of in.list, passing the
# chrom name ($(root1)) and the window name ($(file1)).
cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
    #   happy emacs

    # Create parasol batch and run it
    # Strip only a literal ".ss" suffix; the dot is escaped so that it does
    # not match an arbitrary character.
    ls -1 ../ss/chr*/chr*.ss | sed 's/\.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.

# combine predictions and transform scores to be in 0-1000 interval
    ssh kkstore02
    cd /san/sanvol1/scratch/danRer3/cons/consRun1

    #   The sed's and the sort get the file names in chrom,start order
    # The first sed rewrites "/", "." and "-" as the space-delimited tokens
    # x, y and z, so that for a path shaped like
    # ./bed/<chrom>/<chrom>.<start>-<end>.bed the chrom becomes field 7 and
    # the window start field 9.  sort orders on those two fields (the start
    # numerically) and the second sed turns the tokens back into the path.
    # awk then prints columns 1-3, "lod=" plus column 5 as an integer, and
    # column 5 again, which is what lodToBedScore reads from stdin.
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
        | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
        | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #   ~ 1 minute
    cp -p mostConserved.bed /cluster/data/danRer3/bed/multiz5way
# Figure out how much is actually covered by the mostConserved data as so:
    cd /cluster/data/danRer3
    faSize */chr*.fa  
    # 1644032962 bases (48201758 N's 1595831204 real 816464533 upper 
    # 779366671 lower) in 28 sequences in 28 files
    # The non-N size is 1595831204 bases
    cd /cluster/data/danRer3/bed/multiz5way
    awk '{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/1595831204\n",100.0*sum/1595831204,sum}' \
        mostConserved.bed
    # -target-coverage 0.35: % 3.06 = 100.0*48883581/1595831204 length=18
    # -target-coverage 0.   (this entry was left unfinished)
    
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way
    # get an or of refGene and mgcGenes CDS regions 
    featureBits danRer3 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
    # 11338034 bases of 1630323462 (0.695%) in intersection
    featureBits danRer3 refSeqOrMgcCds.bed mostConserved.bed -enrichment
    # refSeqOrMgcCds.bed 0.695%, mostConserved.bed 2.998%, both 0.464%, 
    # cover 66.71%, enrich 22.25x 
    # so use this result for -target-coverage=0.35 -expected-lengths=18
    # with entropy (PIT) value of 9.74 (aiming for around 9.8) and 
    # 66.7% coverage of coding regions with most conserved elements 
    # (aiming for about 65%)

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way
    hgLoadBed danRer3 phastConsElements mostConserved.bed
    # Loaded 552331 elements of size 5
    featureBits danRer3 mgcGenes:cds phastConsElements -enrichment
    # mgcGenes:cds 0.531%, phastConsElements 2.998%, both 0.363%, 
    # cover 68.39%, enrich 22.81x
    featureBits danRer3 refGene:cds phastConsElements -enrichment
    # refGene:cds 0.658%, phastConsElements 2.998%, both 0.440%, cover 66.82%,
    # enrich 22.28x
    # Create merged posterior probability file and wiggle track data files
    # the sed business gets the names sorted by chromName, chromStart
    # so that everything goes in numerical order into wigEncode
    ssh kkstore02
    cd /san/sanvol1/scratch/danRer3/cons/consRun1
    # List the per-window score files under ppRaw, order the names by
    # field 7 and then numerically by field 9 of the tokenised path, restore
    # the paths, concatenate the files in that order and stream them into
    # wigEncode, which is given phastCons5way.wig and phastCons5way.wib as
    # its output names.
    find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
        | wigEncode stdin phastCons5way.wig phastCons5way.wib
    # takes a few minutes
    ls -l phastCons*
    # -rw-rw-r--  1 hartera protein 198399845 Feb  6 16:05 phastCons5way.wib
    # -rw-rw-r--  1 hartera protein  45304940 Feb  6 16:05 phastCons5way.wig
    cp -p phastCons5way.wi? /cluster/data/danRer3/bed/multiz5way/cons

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way/cons
    mkdir -p /gbdb/danRer3/wib
    ln -s `pwd`/phastCons5way.wib /gbdb/danRer3/wib/phastCons5way.wib
    # use this if need to reload table
    hgsql -e 'drop table phastCons5way;' danRer3
    # load table
    hgLoadWiggle danRer3 phastCons5way phastCons5way.wig

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way/cons
    bash
    time hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
            -db=danRer3 phastCons5way > histogram.data 2>&1
# real    2m33.069s
# user    1m58.310s
# sys     0m16.170s

        #   create plot of histogram:
    cat << '_EOF_' > histo.gp
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer3 Histogram phastCons5 track"
set xlabel " phastCons5 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
     "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'

    #   happy emacs
    gnuplot histo.gp > histo.png
    display histo.png &

# add line: wiggle phastCons5way to trackDb.ra for multiz5way to display the 
# wiggle for the conservation track.
# check all.joiner for entries for phastCons5way and phastConsElements5way -ok
# copy over html for multiz and edit.

# PHASTCONS SCORES DOWNLOADABLES (DONE, 2006-02-07, hartera)
    #   prepare compressed copy of ascii data values for downloads
    ssh kkstore02
    cd /san/sanvol1/scratch/danRer3/cons/consRun1
# gzipAscii.sh: for every ppRaw/chr* directory, concatenate its score files
# in window-start order and gzip the result to
# phastCons5Scores/<chrom>.data.gz.
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons5Scores

for D in ppRaw/chr*
do
    # strip the leading "ppRaw/" with a POSIX prefix removal; the
    # ${D/pattern} form is a bash extension and is not valid in plain sh
    C=${D#ppRaw/}
    out=phastCons5Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    # tokenise the path so that sort can order the files by chrom (field 7)
    # and numeric window start (field 9), then restore the path
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
            gzip > ${out}
done
'_EOF_'
    chmod +x gzipAscii.sh
    time ./gzipAscii.sh
    # 192.852u 8.835s 4:04.05 82.6%   0+0k 0+0io 1pf+0w
    # creates 331 Mb of data.
    # copy data for downloads
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
    cd /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
    rsync -a --progress \
        pk:/san/sanvol1/scratch/danRer3/cons/consRun1/phastCons5Scores/ .

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
    cd /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
    ln -s /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores/*.gz .
    md5sum *.gz > md5sum.txt
    # copy over and edit README.txt from the hg17 phastCons.

# MULTIZ 5-WAY DOWNLOADABLES (DONE, 2006-02-22, hartera)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer3
    mkdir -p multiz5way
    cd multiz5way
    # tcsh loop over the per-chromosome MAF files: $f:r:t is the file name
    # without its directory and .maf extension (echoed as a progress marker).
    # Each MAF is gzipped in place and the resulting .gz is symlinked into
    # the current downloads directory; md5sum.txt then covers all the links.
    foreach f (/cluster/data/danRer3/bed/multiz5way/maf/*.maf)
        set c = $f:r:t
        echo $c
        nice gzip $f
        ln -s $f.gz .
    end
    md5sum *.gz > md5sum.txt
    # copy over README and edit for this 5-way multiple alignment

##################################################################
# HGNEAR TABLES (also used by the Known Genes details page links)
# GET LATEST PROTEIN SEQUENCE FOR ALL HGNEAR SPECIES (DONE, 2006-02-10, hartera)
    # For species with knownGene, use that; otherwise, download the latest
    # version of the main model organism database for this species.
    # Human: use knownGene proteins.
# need to get hg18 peptide sequence:
     mkdir -p /cluster/data/hg18/bed/blastp
     cd /cluster/data/hg18/bed/blastp
     pepPredToFa hg18 knownGenePep known.faa
    # Mouse: use knownGene proteins.
# already done:
#    mkdir -p  /cluster/data/mm7/bed/geneSorter/blastp
#    cd /cluster/data/mm7/bed/geneSorter/blastp
#    pepPredToFa mm7 knownGenePep known.faa
    # Rat: use knownGene proteins.
# already done:
#    mkdir /cluster/data/rn3/bed/blastp
#    cd /cluster/data/rn3/bed/blastp
#    pepPredToFa rn3 knownGenePep known.faa
    # Fly: use FlyBase proteins - already done 
    # /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
    # Worm: use WormBase proteins.
    mkdir -p /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version. It is WormPep 154 so use that.
    # No --timestamping here: wget does not support it together with -O
    # (the output file is always newly written, so the timestamp check is
    # meaningless and wget only prints a warning).
    wget -O wormPep154.faa \
       ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep154/wormpep154
    # Yeast: use SGD proteins.
    mkdir -p /cluster/data/sacCer1/bed/blastp
    cd /cluster/data/sacCer1/bed/blastp
    # get latest version - from Jan 26, 2006
    wget -O orf_trans.fasta.jan26.gz \
         ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
    # rename old version of peptide sequences
    mv sgdPep.faa sgdPep.jan9.faa
    zcat orf_trans.fasta.jan26.gz > sgdPep.faa

# HGNEAR PROTEIN BLAST TABLES (DONE, 2006-02-10, hartera)
# RENAME SELF BLASTP TABLE AND CHANGE CONFIG.RA FILE (DONE, 2006-04-19, hartera)
# NOTE: mmBlastTab was updated to mm8 as a result of running doHgNearBlastp.pl
# for mm8 on 2006-03-13 (see makeMm8.doc).
# RECREATE THE HGNEAR TABLES FOR RAT AND MOUSE TO UPDATE THEM 
# (DONE, 2006-05-31, hartera)
# RE-MADE THE ZEBRAFISH BLASTP TABLES USING THE TRANSCRIPT ID INSTEAD OF THE
# PEPTIDE ID FOR EACH SEQUENCE - FOR ALL OTHER SPECIES THE PEPTIDE SEQUENCES
# ARE REPRESENTED BY THEIR KNOWN GENES TRANSCRIPT ID
# (DONE, 2006-07-03, hartera)
# CHANGED INDEX ON ensZfishBlastTab (DONE, 2006-11-03, hartera)
    ssh hgwdev
    mkdir -p /cluster/data/danRer3/bed/hgNearBlastp
    cd /cluster/data/danRer3/bed/hgNearBlastp
    
    # zebrafish vs fly table has already been created as a result of 
    # creating the blastp table for dm2 (see makeDm2.doc)

cat << _EOF_ > config.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, worm, yeast
# zebrafish vs fly already done (dm2)

targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg18 mm7 rn3 ce2 sacCer1

danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa

buildDir /cluster/data/danRer3/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp
_EOF_
     # << this line makes emacs coloring happy
    nice doHgNearBlastp.pl config.ra >& do.log &
    tail -f do.log
    # Took about 2 hours to finish.
    # The target geneset (self Blastp) should be prefixed with ensZfish
    # so change the config.ra and rename the table (2006-04-19, hartera)
    hgsql -e 'alter table flyBaseBlastTab rename ensZfishBlastTab;' danRer3
    # Update mouse to mm8 and rat to rn4
    mkdir updates
    cd updates
    hgsql -e 'drop table mmBlastTab;' danRer3
    hgsql -e 'drop table rnBlastTab;' danRer3

cat << _EOF_ > config.ra
# Update of zebrafish vs. other Gene Sorter orgs:
# mouse mm8 and rat rn4
targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs mm8 rn4

danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates
_EOF_
     # << this line makes emacs coloring happy
    nice doHgNearBlastp.pl config.ra >& do.log &
    tail -f do.log
    # Took about 25 minutes.

    # Need to remake all the BlastTab tables using the transcript Id instead
    # of the protein ID for zebrafish Ensembl Genes.
    # create ensZfishBlastTab and drBlastTab tables using the Ensembl 
    # transcript Ids for the tables instead of the peptide Ids
    # (2006-07-03, hartera)
    ssh hgwdev
    # create the FASTA file of Ensembl peptide sequences with transcript IDs
    # there is a one to one relationship between these IDs.
    cd /cluster/data/danRer3/bed/blastp
    # then create a fasta file of the sequences:
    pepPredToFa danRer3 ensPep ensPep.faa
    mkdir /cluster/data/danRer3/bed/hgNearBlastp/updates2 
    cd /cluster/data/danRer3/bed/hgNearBlastp/updates2 
cat << _EOF_ > config.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, fly, worm, yeast

targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg18 mm8 rn4 dm2 ce2 sacCer1

danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa

buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
_EOF_
     # << this line makes emacs coloring happy
    nice doHgNearBlastp.pl config.ra >& do.log &
    tail -f do.log
    # Took about 45 minutes
    # update sacCer1 otherOrgs.ra to use danRer3 instead of danRer1 
    # for drBlastTab.
    
# also need to update:
# dm1, hg{15,16,17}, mm{5,6,7}, rn{2,3}
# Human (hg15 and hg16), Drosophila, mouse mm5 and rat all use danRer1.
# Human hg17 and mouse mm6 and mm7 uses danRer2.
  # Update these all to use the Zv5 (danRer3) Ensembl proteins.
  # Ensembl 38 (April 2006)
  ssh hgwdev
  cd /cluster/data/danRer3/bed/hgNearBlastp/updates2
# config2.ra: doHgNearBlastp.pl configuration for the older human, mouse, rat
# and fly databases versus the danRer3 Ensembl peptides.  Each <db>Fa entry
# must name the peptide FASTA of that same database; mm7Fa uses the mm7
# known.faa (the same path as in the first config.ra), not the mm8 file.
cat << _EOF_ > config2.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, fly - older databases

targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg17 hg16 hg15 mm7 mm6 mm5 rn3 rn2 dm1

danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
hg17Fa /cluster/data/hg17/bed/blastp/known.faa
hg16Fa /cluster/data/hg16/bed/blastp/known.faa
hg15Fa /cluster/data/hg15/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
mm6Fa /cluster/data/mm6/bed/geneSorter/blastp/known.faa
mm5Fa /cluster/data/mm5/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
rn2Fa /cluster/data/rn2/bed/blastp/known.faa
dm1Fa /cluster/data/dm1/bed/blastp/bdgp.faa

buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
_EOF_
     # << this line makes emacs coloring happy
    # create BlastTab tables for all queries vs target and no self blastp
    nice doHgNearBlastp.pl config2.ra -noSelf -queryOnly >& do2.log &
    tail -f do2.log
    # Took about 30 minutes
    # Update and commit hgGeneData and hgNearData files to make sure that 
    # all queries and links now work for the transcript ID instead of 
    # peptide ID for ensZfishBlastTab and drBlastTab tables.

    # Gene Sorter is very slow for danRer3. ensZfishBlastTab has an index
    # on both the query and target. All the other BlastTab tables have only
    # an index on the query so try dropping the index on the target.
    hgsql -e 'alter table ensZfishBlastTab drop index target;' danRer3
    # Gene Sorter still loads slowly.
    # Index is too short. hgLoadBlastTab used to load table and index on
    # query is query(12). The first 12 characters are not unique for
    # the Ensembl IDs so extend to query(20).
    hgsql -e 'alter table ensZfishBlastTab drop index query;' danRer3
    hgsql -e 'create index query on ensZfishBlastTab (query(20));' danRer3
    # Much faster now.

# END OF HGNEAR STUFF
####################################################
# GENE SET BASED ON ENSEMBL GENES (PROTEIN CODING GENES) 
# (in progress, 2005-11-23, hartera)
    # see ENSEMBL GENES section for documentation of creation of
    # the ensGene, ensGtp and ensPep tables and the track.
    # compare the Ensembl and Human Proteins tracks
    featureBits danRer3 refGene:cds ensGene:cds -enrichment
# refGene:cds 0.658%, ensGene:cds 1.994%, both 0.589%, cover 89.60%, 
# enrich 44.94x
    featureBits danRer3 refGene:cds blastHg17KG -enrichment
# refGene:cds 0.658%, blastHg17KG 1.292%, both 0.385%, cover 58.52%, 
# enrich 45.30x
    # little difference in enrichment and less coverage for Human Proteins so
    # it seems like Ensembl is the best choice in terms of genome coverage
    # and intersection with RefSeq CDS regions.
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/ensGenes
    cd /cluster/data/danRer3/bed/ensGenes
    # use Ensembl's BioMart to download the Ensembl Genes UniProt IDs and
    # descriptions. For genes with no description, use the InterPro domain.
    # Go to http://www.ensembl.org/Multi/martview
    # Follow this sequence through the pages: 
    # Page 1) Select the Ensembl dataset (now v38 here, v36 and v37 is the 
    # same for Zv5 Danio rerio protein coding genes) and the Danio_rerio 
    # choice (ZFISH5 here). 
    # Hit next. 25541 entries total.
    # Ensembl 37 from Feb 2006 - this dataset is the same as for the 
    # version 32 downloaded as above for the Ensembl Genes track.
    # (Checked on 2006-03-09, hartera)
    # Ensembl 38 from April 2006 - this dataset is the same as for the 
    # version 32 downloaded as above for the Ensembl Genes track.
    # (Checked on 2006-05-31, hartera)
    # Page 2) In the GENE section, select Gene type as protein_coding. 
    # Then hit next. There are now 22877 entries in this filtered version.
    # Page 3) Choose the "Features" Attribute Page from the pulldown menu
    # at the top. Make sure that under the GENE section, the Ensembl 
    # Attributes checked are the Ensembl Transcript ID, External Gene ID and the
    # Description. Under External References, select Unified UniProt 
    # accession, and ZFIN Primary ID. Under the Protein section, select 
    # InterPro Description and InterPro ID under InterPro 
    # Attributes. Select text, tab-separated for output. Choose gzip 
    # compression. Hit export. Save as ensGeneInfo37Coding.tsv.gz. Same as for
    # Ensembl v36 so update to Ensembl v37. Ensembl v38 is the same too
    # so update to this version (2006-05-31, hartera). Also add External Gene
    # ID for the Ensembl Attributes.
    gunzip ensGeneInfo38Coding.txt.gz
    # this file has some errors in it - there is a newline character in the
    # middle of the descriptions for the genes with the following UniProt 
    # IDs: Q5TYV0, Q5SPG7, Q5SPG5, Q5RIJ2, Q5RID3. This causes the table
    # to be loaded incorrectly. Edit the ensGeneInfo38Coding.txt file manually
    # to remove these extra newlines.

    # Repeat above steps and get the Ensembl transcript ID from Ensembl 
    # Attributes and then get EntrezGene ID, RefSeq DNA ID, and RefSeq 
    # Peptide ID and from the External References section. Select text, 
    # tab-separated for output. Choose gzip compression. Hit export. Again 
    # Ensembl v36 gives the same result for Danio rerio. 
    # Save as ensGeneInfo38Coding2.txt.gz
    cd /cluster/data/danRer3/bed/ensGenes
    gunzip ensGeneInfo38Coding2.txt.gz  
    wc -l ensGeneInfo38*
    # 85607 ensGeneInfo38Coding.txt
    # 32457 ensGeneInfo38Coding2.txt

    # 85607 ensGeneInfo37Coding.tsv
    # 33233 ensGeneInfo37Coding2.tsv

    # find how many Transcripts have multiple SWISS-PROT IDs
    # Skip the header line and pull columns out of the tab-separated BioMart
    # export: transcript ID ($1), external gene ID ($2) and UniProt
    # accession ($4).
    # FS/OFS are set in BEGIN so that the first data line is also split on
    # tabs; assigning FS inside the main action only takes effect from the
    # second record, and the description column contains spaces.
    # "tail -n +2" replaces the obsolete "tail +2" spelling.
    tail -n +2 ensGeneInfo38Coding.txt \
        | awk 'BEGIN {FS="\t"; OFS="\t"} {print $1, $2, $4}' \
        > ensGene38UniProtandExtId.txt
    # NOTE(review): the filter tests $2 (external gene ID) although the
    # output is named for, and prints, the UniProt column ($4) - confirm
    # which column was meant before relying on the counts below.
    tail -n +2 ensGeneInfo38Coding.txt \
        | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($2 != "") print $1, $4}' \
        > ensGene38UniProt.txt
    sort ensGene38UniProt.txt | uniq > ensGene38UniProt.txt.uniq
    awk '{print $1}' ensGene38UniProt.txt.uniq | sort | uniq -c | sort -nr \
        > ens38UniProt.count
    awk '{if ($1 > 1) print $2}' ens38UniProt.count \
        > ens38UniProtMorethanOne.txt    
    wc -l ens38UniProtMorethanOne.txt
    # 2257 ens38UniProtMorethanOne.txt
    awk '{if ($1 == 1) print $2}' ens38UniProt.count \
        > ens38UniProtOnlyOne.txt    
    wc -l ens38UniProtOnlyOne.txt
    # 8172
    # get list of Ensembl transcripts with more than 1 UniProt ID and
    # the list of UniProt IDs.
    grep -f ens38UniProtMorethanOne.txt ensGene38UniProt.txt.uniq \
            > ens38UniProtMorethanOne.uniProtIds
    # get list of Ensembl transcripts with more than 1 UniProt ID and
    # the list of UniProt IDs and external database IDs.
    sort ensGene38UniProtandExtId.txt | uniq \
         > ensGene38UniProtandExtId.txt.uniq
    grep -f ens38UniProtMorethanOne.txt ensGene38UniProtandExtId.txt.uniq \
         > ens38UniProtMorethanOne.uniProtandExtIds
     
    # to do blastp of Ensembl Proteins vs UniProt 
    # (last uniProt update 2006-01-23):
    ssh hgwdev
    mkdir -p /cluster/data/danRer3/bed/ensGenes/blastDb
    cd /cluster/data/danRer3/bed/ensGenes/blastDb
    # create a table of Danio Rerio (Brachydanio rerio in UniProt)
    # SWISS-PROT sequences (2006-05-31)
    hgsql uniProt -e ' \
      create table test.danioProt select protein.* from protein,accToTaxon \
      where accToTaxon.taxon = 7955 and accToTaxon.acc = protein.acc;'
    # then create a fasta file of the sequences:
    pepPredToFa test danioProt danioUniProt.fa
    grep '>' danioUniProt.fa | wc -l
    # 14297
    # then select just those UniProt IDs for the Ensembl Transcript IDs that
    # have multiple UniProt IDs associated with them.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/ensGenes/blastDb
    # get list of UniProt IDs
    awk '{print $2}' ../ens38UniProtMorethanOne.uniProtIds \
        > ens38MultiUniProtIds.idsOnly
    sort ens38MultiUniProtIds.idsOnly | uniq \
         > ens38MultiUniProtIds.idsOnly.uniq
    faSomeRecords danioUniProt.fa ens38MultiUniProtIds.idsOnly.uniq \
          ens38DanioUniProt.fa 
    # 4410 UniProt IDs but 4293 in the FASTA file so 117 are missing.
    grep '>' ens38DanioUniProt.fa | sort > uniProtSeq.ids
    perl -pi.bak -e 's/>//' uniProtSeq.ids
    comm -13 uniProtSeq.ids ens38MultiUniProtIds.idsOnly.uniq > uniProtMissing
    # these missing sequences are missing because the uniProt IDs are
    # secondary IDs. Find the primary ID.
    hgsql -N -e 'select o.acc, o.val from otherAcc as o, accToTaxon as a \
      where o.acc = a.acc and a.taxon = 7955;' uniProt > otherAccs.zfish.txt
    wc -l otherAccs.zfish.txt
    # 321 otherAccs.zfish.txt
    grep -f uniProtMissing otherAccs.zfish.txt > uniProtMissing.otherAccs.txt  
    # found 83 of them
    awk '{print $2}' uniProtMissing.otherAccs.txt | sort | uniq > otherAccsFound
    comm -13 otherAccsFound uniProtMissing > stillMissing
    # check list of deleted TrEMBL IDs - delac_tr.txt from Expasy site.
    sort delac_tr.txt > delac_tr.sort
    sort stillMissing > stillMissing.sort
    comm -12 delac_tr.sort stillMissing.sort | wc
    # 34. There are 34 in the stillMissing file and these are all in the
    # delac_tr.txt file.
#This file lists the accession numbers of TrEMBL entries which have
#been deleted from the database. Most deletions are due to the deletion of
#the corresponding CDS in the source nucleotide sequence databases EMBL-
#Bank/DDBJ/GenBank. In addition, some entries are recognised to be Open
#Reading frames (ORFs) that have been wrongly predicted to code for
#proteins. When there is enough evidence that these hypothetical proteins
#are not real, we take the decision to remove them from TrEMBL.

    # Get the sequences for otherAccsFound from danioUniProt.fa
    awk '{print $1}' uniProtMissing.otherAccs.txt | sort | uniq \
        > otherAccsFound.altAccs
    faSomeRecords danioUniProt.fa otherAccsFound.altAccs ens38DanioOtherAccs.fa
    grep '>' ens38DanioOtherAccs.fa | wc
    # 73
    wc -l otherAccsFound.altAccs
    # 73 otherAccsFound.altAccs
    cat ens38DanioUniProt.fa ens38DanioOtherAccs.fa > ens38DanioAllUniProt.fa
    # create blastDb database
    ssh pk
    cd /cluster/data/danRer3/bed/ensGenes/blastDb
    mkdir format
    cd format
    mv ../ens38DanioAllUniProt.fa .
    /scratch/blast/formatdb -i ens38DanioAllUniProt.fa \
            -t ensUniProt -n ensUniProt
    # Copy database over to the san
    mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb
    cp ensUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/
    ssh hgwdev
    mkdir /cluster/data/danRer3/bed/ensGenes/blastp
    cd /cluster/data/danRer3/bed/ensGenes/blastp
    # get FASTA file of Ensembl sequences
    
    pepPredToFa danRer3 ensPep ensPep.fa
    # get list of Ensembl transcripts to use in Blastp
    cp ../blastDb/stillMissing .
    # need to remove the missing ones (those no longer in TrEMBL) from list
    grep -v -f stillMissing ../ens38UniProtMorethanOne.uniProtIds \
            > ens38UniProt.uniProtIdsforBlastp
    # get final list of Ensembl Transcript Ids
    awk '{print $1}' ens38UniProt.uniProtIdsforBlastp | sort | uniq \
            > ens38IdsOnlyForBlastp.txt
    wc -l ens38IdsOnlyForBlastp.txt
    # 2252 ens38IdsOnlyForBlastp.txt
    # grab the protein sequences just for these Ensembl Transcripts:
    faSomeRecords ensPep.fa ens38IdsOnlyForBlastp.txt ens38ForBlastp.fa
    # check that there are 2252 records

    # set up the Blastp run
    ssh pk
    cd /cluster/data/danRer3/bed/ensGenes/blastp
    # split Ensembl peptide sequences FASTA file into chunks for cluster
    mkdir split
    faSplit sequence ens38ForBlastp.fa 200 split/ens38
    # make parasol run directory
    mkdir run
    cd run 
    mkdir out
    # Make blast script
# blastSome <query.fa> <out.tab>: run blastall blastp for one query chunk
# ($1) against the ensUniProt database on the san, writing the result to $2
# with an E-value cutoff of 0.01.  BLASTMAT points blastall at the data
# directory of the same blast-2.2.11 installation.
cat  << '_EOF_' > blastSome
#!/bin/csh -ef
setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data 
/san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \
    -p blastp -d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/ensUniProt \
    -i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
    chmod +x blastSome
    # Make gensub2 file
    # One job per split FASTA file; the {check ...} clauses are parasol
    # input/output checks on the query file and on out/<root>.tab.
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    # Create parasol batch
    echo ../split/*fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub jobList
    para create jobList
    para try, check, push, check ... etc.
# Completed: 190 of 190 jobs
# CPU time in finished jobs:        279s       4.65m     0.08h    0.00d  0.000 y
# IO & Wait Time:                  2293s      38.22m     0.64h    0.03d  0.000 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              30s       0.50m     0.01h    0.00d
# Submission to last job:            37s       0.62m     0.01h    0.00d
    # Load these into a temporary database table. hgLoadBlastTab
    # picks the best hit for each of the queries (Ensembl peptide).
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ensGenes/blastp/run/out
    time hgLoadBlastTab -maxPer=1 test ensUniProtBlastTab *.tab
    # 0.154u 0.008s 0:00.66 22.7%     0+0k 0+0io 0pf+0w
    # there were 2252 queries
# BLASTP OF ALL ENS PEP VS ALL DANIO UNIPROT SEQS
    # Try doing Blastp again but this time using all the zebrafish UniProt
    # sequences as the database and all the Ensembl peptides as queries.
    # Outline: format the BLAST database, copy it to the san, split the
    # query peptides into chunks, then run one blastall job per chunk file
    # as a parasol batch.
    # create blastDb database
    ssh pk
    cd /cluster/data/danRer3/bed/ensGenes/blastDb
    mkdir zfishUniProt
    cd zfishUniProt
    cp ../danioUniProt.fa .
    # formatdb: -i is the input FASTA, -t the title and -n the base name of
    # the database files (hence the danioUniProt* glob in the copy below).
    /san/sanvol1/scratch/blast64/blast-2.2.11/bin/formatdb \
        -i danioUniProt.fa -t danioUniProt -n danioUniProt
    # Copy database over to the san
    mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt
    cp danioUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt
    # split Ensembl peptide sequences FASTA file into chunks for cluster
    cd /cluster/data/danRer3/bed/ensGenes/blastp
    mkdir splitAll
    # count the query sequences (one '>' header line per peptide)
    grep '>' ensPep.fa | wc -l
    # 32143
    # ask faSplit for 8000 pieces written with the prefix splitAll/ens38All
    # (the parasol batch below reports 7609 jobs)
    faSplit sequence ensPep.fa 8000 splitAll/ens38All
    # make parasol run directory
    mkdir runAll
    cd runAll
    mkdir out
    # Make blast script. The quoted heredoc word keeps $1 and $2 literal so
    # that they are the script's own arguments (query chunk, output file).
    # blastall is run with an E-value cutoff of 0.01 (-e) and tabular
    # output (-m 8), the format loaded later with hgLoadBlastTab.
cat  << '_EOF_' > blastSome
#!/bin/csh -ef
setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data 
/san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \
    -p blastp \
    -d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt/danioUniProt \
    -i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
    chmod +x blastSome
    # Make gensub2 file: one blastSome job per line of split.lst, writing
    # out/<chunk root name>.tab
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    # Create parasol batch
    # split.lst gets one chunk file path per line
    echo ../splitAll/*fa | wordLine stdin > split.lst
    gensub2 split.lst single gsub jobList
    para create jobList
    # the next line is shorthand for the usual interactive sequence of para
    # commands, not a literal command line
    para try, check, push, check ... etc.
    para time
#Completed: 7609 of 7609 jobs
#CPU time in finished jobs:      11414s     190.23m     3.17h    0.13d  0.000 y
#IO & Wait Time:                401489s    6691.48m   111.52h    4.65d  0.013 y
#Average job time:                  54s       0.90m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              77s       1.28m     0.02h    0.00d
#Submission to last job:          1096s      18.27m     0.30h    0.01d
    # Load these into a temporary database table. hgLoadBlastTab
    # picks the best hit for each of the queries (Ensembl peptide).
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll/out
    # cat files together as argument list too long for hgLoadBlastTab
    # NOTE: ensAll.tab is appended to (>>) and itself matches *.tab, so
    # remove any existing ensAll.tab before re-running this loop.
    foreach t (*.tab)
       cat $t >> ensAll.tab
    end
    # -maxPer=1 is what restricts the table to the best hit per query
    time hgLoadBlastTab -maxPer=1 test ensUniProtAllBlastTab ensAll.tab
    # 4.168u 0.737s 0:06.03 81.0%     0+0k 0+0io 5pf+0w
    # filter these and select just those with identity >= 95%
    # and eValue <= 0.00001; "out" is the sorted list of distinct UniProt
    # IDs (targets) passing the filter.
    hgsql -N -e 'select distinct(target) from ensUniProtAllBlastTab where \
         identity >= 95 and eValue <= 0.00001;' test | sort > out
    # 11910 Ensembl transcripts pass this filter (see ensBlastp.tId.sort
    # further down); they map to the 9208 distinct UniProt IDs in "out".
    # there are 11343 unique UniProt IDs in ensGeneInfo38Coding.txt
    # Load the ensGeneInfo38Coding.txt file into the ens38Zfish table in the
    # test database, then compare its UniProt IDs with those found by
    # Blastp ("out", created above).

cat << 'EOF' > ens38Zfish.sql
CREATE TABLE ens38Zfish (
    transcriptId varchar(255) not null,     
    extDbId varchar(255) not null,     
    description longblob not null, 
    uniProt varchar(255) not null,
    zfinId varchar(255) not null,
    interProDesc longblob not null,
    interProId varchar(255) not null
);
'EOF'
    # << emacs
    chmod a+r ensGeneInfo38Coding*
    # Strip the header line. "tail -n +2" is the POSIX spelling; the
    # obsolete "tail +2" form is rejected by current coreutils.
    tail -n +2 ensGeneInfo38Coding.txt > ens38Coding.tab
    hgLoadSqlTab test ens38Zfish ens38Zfish.sql ens38Coding.tab
    hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \
         | sort > ens38Zfish.uniProt.uniq
    wc -l ens38Zfish.uniProt.uniq out 
    # 11344 ens38Zfish.uniProt.uniq
    #   (one more than the 11343 quoted above; presumably the extra line is
    #   the empty uniProt value - TODO confirm)
    # 9208 out
    # comm on two sorted lists: -12 prints lines common to both,
    # -13 lines only in the second file, -23 lines only in the first.
    comm -12 ens38Zfish.uniProt.uniq out | wc
    # 8526 in common
    comm -13 ens38Zfish.uniProt.uniq out > fromBlastPOnly
    comm -23 ens38Zfish.uniProt.uniq out > fromEns38Only
    wc -l from*
    # 682 fromBlastPOnly
    # 2817 fromEns38Only
    # find out how many from fromEns38Only are on the list of deleted from
    # TrEMBL IDs
    # (comm -12 = lines common to both sorted files; -13 = lines only in
    # the second file, so fromEns38Only2 is fromEns38Only minus the
    # deleted TrEMBL IDs)
    comm -12 fromEns38Only ./blastDb/delac_tr.sort > deletedFromTrEMBL
    comm -13 deletedFromTrEMBL fromEns38Only > fromEns38Only2
    # get list of transcripts matched to a UniProt by blastP that
    # have no UniProt ID in ens38Zfish (uniProt = "")
    hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38Zfish.noUniProt
    hgsql -N -e 'select distinct(query) from ensUniProtAllBlastTab where \
         identity >= 95 and eValue <= 0.00001;' test | sort > queryBlast.sort
    # the next command prints the common transcript IDs to stdout
    comm -12 queryBlast.sort ens38Zfish.noUniProt
    # 1967 transcripts have a Blastp hit but no UniProt ID in ens38Zfish.
    # 9943 transcripts (presumably the remaining Blastp queries, i.e.
    # 11910 - 1967, which do have a UniProt ID in ens38Zfish).
    # delac_sp.txt in ./blastDb - list of deleted SWISS-PROT IDs
    # as of May 30, 2006. 331 IDs.
    sort blastDb/delac_sp.txt > blastDb/delac_sp.sort 
    # compare to list of SP IDs that are not in Blastp hits
    comm -12 blastDb/delac_sp.sort fromEns38Only2 
    # there are none in common
    # get list of Danio rerio UniProt IDs
    hgsql -N -e 'select distinct(acc) from danioProt;' test | sort \
          > danioProt.accs.uniq
    # IDs in fromEns38Only2 that are not in danioProt (printed to stdout)
    comm -13 danioProt.accs.uniq fromEns38Only2
    # IDs that are in both danioProt and fromEns38Only2
    comm -12 danioProt.accs.uniq fromEns38Only2 > inuniProtAndfromEns38Only
    # keep the filtered hits (identity >= 95%, eValue <= 0.00001) as their
    # own table, test.ensBlastp, for the comparisons that follow
    hgsql -e 'create table test.ensBlastp select * from ensUniProtAllBlastTab where identity >= 95 and eValue <= 0.00001;' test 
   ## wc -l in*Only
    # 1967 inBlastpOnly
    # 278 inEns38Only
    # these are transcript IDs
    # (these two counts do not match the wc -l output recorded after the
    # commands below; presumably they come from an earlier comparison)
    # find the UniProt IDs for the 278 inEns38Only
    cd test6/tmp
    hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
          > ensBlastp.tId.sort
    # NOTE(review): despite the "withUniProt" file name, this query selects
    # transcripts whose uniProt field is EMPTY. The counts recorded below
    # (9943 + 1967 = 11910) are consistent with the query as written.
    hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38ZfishwithUniProt.tId.sort
    # comm -13 = only in the second list, comm -23 = only in the first list
    comm -13 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inEns38Only
    comm -23 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inBlastpOnly
    wc -l in*Only
    # 9943 inBlastpOnly
    # 19955 inEns38Only
    wc -l *.sort
    # 32143 ens38Zfish.tId.sort
    # 11910 ensBlastp.tId.sort
    # So there are 9943 that have Blastp hits assigned and 19955 in
    # Ensembl 38 that do not have Blastp hits
    # find those with no description and also have no UniProt ID.
    # there are 21236 and this is the same number without a description
    hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where description = "" and uniProt = "";' test | sort > ens38ZfishNoDesc.tid.sort
    
    # 21236 ens38ZfishNoDesc.tid.sort
    # compare this to the set of transcript IDs in Ensembl 38 Only 
    # and for Blastp Only
    comm -12 inEns38Only ens38ZfishNoDesc.tid.sort > noBlastHitNoDesc
    comm -12 inBlastpOnly ens38ZfishNoDesc.tid.sort > blastHitNoDesc
    wc -l *NoDesc
    # 0 blastHitNoDesc
    # 19712 noBlastHitNoDesc
    # then get list of transcript IDs with no description in Ensembl 38 but
    # do have a Blastp hit (those in the no-description list that are not
    # in inEns38Only: 21236 - 19712 = 1524)
    comm -13 inEns38Only ens38ZfishNoDesc.tid.sort > blastpHitNoDesc.tid 
    wc -l blastpHitNoDesc.tid
    # 1524 blastpHitNoDesc.tid
    # These are sequences with a Blastp hit but no description
    # Compare the distinct UniProt IDs assigned by Blastp (ensBlastp
    # targets) with those given by Ensembl 38 (ens38Zfish.uniProt).
    hgsql -N -e 'select distinct(target) from ensBlastp;' test \
          | sort > blastp.uniProt.sort
    hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \
          | sort > ens38.uniProt.sort
    wc -l *uniProt.sort
    # 9208 blastp.uniProt.sort
    # 11344 ens38.uniProt.sort
    # there are 8526 in common
    # comm -13 = only in the second list (Ensembl 38),
    # comm -23 = only in the first list (Blastp)
    comm -13 blastp.uniProt.sort ens38.uniProt.sort > ens38Only.uniProt
    comm -23 blastp.uniProt.sort ens38.uniProt.sort > blastpOnly.uniProt
    wc -l *.uniProt
    # 682 blastpOnly.uniProt
    # 2817 ens38Only.uniProt
    # there are 80 in the ens38Only.uniProt list that are deleted from TrEMBL
    # there are 3 in the blastpOnly.uniProt list that are deleted from TrEMBL
    # Q503U2
    # Q7SY13
    # Q8AW80
    # Remove these from each list (comm -23 keeps the lines that are not in
    # the sorted list of deleted TrEMBL IDs):
  comm -23 ens38Only.uniProt ../../blastDb/delac_tr.sort > ens38Only.uniProt2
  comm -23 blastpOnly.uniProt ../../blastDb/delac_tr.sort > blastpOnly.uniProt2
    # some of these will be ones where there were several SWISS-PROT IDs for
    # each transcript ID and only one is chosen so the others are dropped.
    # find how many of these ens38Only.uniProt2 are not in danioProt.accs.uniq
    # (comm -13 = lines only in the second file). The backslash must be the
    # last character on the line or the redirect below is not attached.
    comm -13 ../../danioProt.accs.uniq ens38Only.uniProt2 \
         > ens38Only.uniProt.notinDanioProt
    # there are 88 of these.
    # find list of zebrafish accs with alternative accs in uniProt
    # (taxon 7955 is Danio rerio)
    hgsql -N -e 'select val from otherAcc as a, accToTaxon as t where a.acc = t.acc and taxon = 7955;' uniProt | sort | uniq > zfishVals.otherAccs.uniq
    # count how many of the 88 appear as alternative accessions
    comm -12 ens38Only.uniProt.notinDanioProt zfishVals.otherAccs.uniq | wc -l
    # 88 so all of these have alternate accessions.
    # remove these from list so:
    comm -13 ens38Only.uniProt.notinDanioProt ens38Only.uniProt2 \
             > ens38Only.uniProt3
    wc -l ens38Only.uniProt3
    # 2649 ens38Only.uniProt3
    # find number of uniProt IDs belonging to transcript IDs that have multiple 
    # uniProt IDs: ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq is list of 
    # uniProt IDs for such transcripts.
  comm -12 ens38Only.uniProt3 ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq \
    > ens38Only.multiUniProtIds
    # there are 2310 of these.
    # drop them (comm -13 = lines only in the second file): 2649 - 2310
    comm -13 ens38Only.multiUniProtIds ens38Only.uniProt3 > ens38Only.uniProt4
    # 339 of these left 
    # pull the transcript ID / UniProt ID lines for the remaining IDs.
    # NOTE(review): grep -f treats each ID as an unanchored regular
    # expression, so an ID also matches inside longer strings; consider
    # grep -F -w -f if exact ID matches are wanted.
    grep -f ens38Only.uniProt4 ../../ensGene38UniProt.txt \
         > ens38Only.uniProt4.tIdAndUpId
    # first column is taken as the transcript ID
    awk '{print $1}' ens38Only.uniProt4.tIdAndUpId | sort | uniq \
        > ens38Only.uniProt4.tId.uniq
    wc -l ens38Only.uniProt4.tId.uniq
    # 368 ens38Only.uniProt4.tId.uniq
    # Do these all have SWISS-PROT IDs by Blastp?
    hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
          > ensBlastp.query.sort
    # transcripts that do have a Blastp hit (printed to stdout)
    comm -12 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort 
    # 183 so remove these:
    comm -23 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort \
         > ens38Only.uniProt4.tId.noBlastp
    wc -l ens38Only.uniProt4.tId.noBlastp
    # 185 ens38Only.uniProt4.tId.noBlastp


    #e.g. ENSDART00000002826, this has only 91% ID to Q6DBUS (Q6NYR4 in BioMart
    # download). It is 91.7% ID to Q6DBUS in Blastp table.
    # So relax the identity cutoff from 95% to 90% (same eValue cutoff) and
    # keep those hits in a second table, test.ensBlastp90.
    hgsql -e 'create table test.ensBlastp90 select * from ensUniProtAllBlastTab where identity >= 90 and eValue <= 0.00001;' test 
    # sorted lists of query transcript IDs at each cutoff
    hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
          > ensBlastp.tId.sort
    hgsql -N -e 'select distinct(query) from ensBlastp90;' test | sort \
          > ensBlastp90.tId.sort
    # transcript IDs in ensBlastp90 and not in ensBlastp
    comm -23 ensBlastp90.tId.sort ensBlastp.tId.sort > ensBlastp90Only.tId
    wc -l ensBlastp90Only.tId
    # 704 ensBlastp90Only.tId
    # check these against list of ens38 with no description
    # (ens38ZfishNoDesc.tid.sort holds transcripts with neither a
    # description nor a UniProt ID in ens38Zfish)
    comm -12 ens38ZfishNoDesc.tid.sort ensBlastp90Only.tId \
         > ensBlastp90Only.noUniProtInEns38
    # 416
    # also check against list of ens38Only.uniProt4.tId.noBlastp
    comm -12 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId
    # 140
    comm -23 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId \
         > ens38Only.uniProt4.tId.noBlastp90
    # 45 of these left
    # ENSDART00000009971 has only 48% Identity to Q5DTD0. maps to Q58EF8 on
    # Ensembl web page.
    # Check 10 alignments with >= 95% and 10 that have >= 90% and < 95%
    cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll2/out
    # ens38Blastp.out has the alignments in NCBI format
    # Number of hits in each identity range:
    # 95-96% 226
    # 96-97% 322
    # 97-98% 526
    # 98-99% 1333
    # 99-100% 9503 (both inclusive)
    # lower score can be due to shorter query and target
# for >= 95% identity (ensBlastp table in test db). Get BlastP results 
# and check Ensembl. All Ensembl records show the UniProt ID given below 
# except where noted. 
# Query  Target  Identity qLen qAli tLen tAli  E-value  Score  misMatch Comment
# ENSDART00000012253 Q9W6E8 99.51 609 609 609 609 0   978  3      
# ENSDART00000013114 Q6NYT1 99.63 267 267 267 267 4e-143 502 1  
# ENSDART00000067816 Q6NZZ8 95.78  433 433  471 460 0 838 2 query doesn't 
# begin with Met, no associated UniProt ID in Ensembl
# ENSDART00000018931 Q9DG41 99.42 346 346 552 346 0 709 2 query is partial, 
# doesn't begin with Met
# ENSDART00000023846 Q7ZUQ4 98.33 300 300 625 300 1e-179 624 5 query doesn't
# begin with Met
# ENSDART00000006095 Q6P2V4 99.32 443 443 443 443 0 941 3
# ENSDART00000039597 Q5G9L7 100 146 146 146 146 3e-81 295 0 100% coverage 
# ENSDART00000028930 Q90442 97.53 84 81 85 81 5e-42 164 2
# ENSDART00000028255 Q8JHY2 100 63 63 63 63 2e-32 132 0
# ENSDART00000042947 Q4QRH1 95.22 1849 456 479 452 0 808 10 alignment length =
# 460 bp, Ensembl doesn't show a UniProt protein ID for this.
# Maybe there is a coverage criterion.
# >= 90% and < 95% identity from ensBlastP90 table in test db:
#  There are 705 of these. 11911 have identity >= 95%
# Query  Target  Identity qLen qAli tLen tAli  E-value  Score  misMatch Comment
# ENSDART00000031211 Q6R5A4 94.21 779 779 846 789 0 1266 38  (gapOpen 6) 
# bases 66-846 of target is aligning. Ensembl does not have a UniProt ID 
# for this transcript.
# ENSDART00000028390 Q5TKR3 90.87 241 240 243 241 1e-125 444 21 (gapOpen 1) 
# ENSDART00000053312 Q5SYD9 92.64 325 325 322 322 8e-175 608 19 (gapOpen 2)
# ENSDART00000056703 Q5CZR2 91.02 323 323 323 323 7e-124 605 29 (gapOpen 0)
# Ensembl has no UniProt ID for this transcript. 91 % ID to NP_001013324.1,
# also 323 bp.
# ENSDART00000044490 Q3ZMH2 90.74 992 985 1082 994   0  1682 64 (gapOpen 7)
# Ensembl has no UniProt ID, just InterPro domains.
# ENSDART00000031487 Q5RHD6 92.81 320 320 319 319 7e-172 598 22 (gapOpen 1)
# Ensembl has no UniProt ID, just InterPro domain.
# ENSDART00000020233 Q6DHI1 91.72 298 298 299 299 6e-145 508 18 (gapOpen 2) 
# ENSDART00000061435 Q6PBV8 93.72 76 76 76 76 2e-33 135 5 (gapOpen 0)
# ENSDART00000056959 Q4V9F6 94.21 433 426 440 431 0 728 18 (gapOpen 2)
# only InterPro domain given for Ensembl, no UniProt ID. 
# ENSDART00000040220 Q504G5 90.12 172 172 174 172 3e-100 358 17 (gapOpen 0)
# only InterPro domain given for Ensembl, no UniProt ID.
# ENSDART00000066247 Q58EK5 90.08 767 231 485 251 3e-124 441 3 (gapOpen 3)
# only InterPro domain given for Ensembl, no UniProt ID.
    # for 95% identity and above, there are only 18 proteins that have
    # mismatch > 40.
    # for between 90-95% then there are 62 with mismatch > 40. 
    # use grep -A 100 -w 
    # look at examples with high mismatch but identity < 95%.
    # ---+------+--------+------+--------+----------+
    # | query  | target | identity | aliLength | mismatch | gapOpen     
    # |qStart | qEnd | tStart | tEnd | eValue | bitScore |

    # ENSDART00000012435 | Q6IQX1 |     91.2 |      1932 |      163 |       5
    # |  2 | 1931 |      3 | 1931 |      0 |     3093 |
    # this has a high number of mismatches but distributed throughout
    # the protein and the UniProt sequence aligns to the genome with the 
    # same exon structure as for ENSDART00000012435.
    # ENSDART00000050066 | Q7M558 |    91.69 |      3008 |      249 |       1
    # |  0 | 3008 |      0 | 3007 |      0 |     5543 |
    # this is a very large protein so the mismatch is small compared to
    # the protein size. has same exon structure as Ensembl protein at
    # chr17:18,247,969-18,259,468. Blats to several regions - could be a
    # processed pseudogene or assembly artifact. 
    # If identity < 95% and mismatch > 40 then size is at least around 450bp.
    # ENSDART00000028708 | Q7T296 |    90.12 |       486 |       45 |       1
    # |  0 |  486 |     18 |  501 |      0 |      907 |
    # The most gaps in a sequence is 9 - only 1 sequence < 95% identity and 
    # most have 0-2 gaps. Same for those >= 95% identity.
    #  ENSDART00000039735 | Q7T1C9 |    98.15 |      1406 |       12 |       9
    #  |  0 | 1394 |      0 | 1404 |      0 |     2175 |
    # Gaps are spread throughout the sequence and are short. Blat of this
    # UniProt sequence gives the same exon structure as for the Ensembl seq.
    # | ENSDART00000053813 | Q7M560 |    90.07 |      2275 |      104 |      9 
    # |  0 | 2178 |     99 | 2349 |      0 |     3966 |
    # There are several large gaps in the first third of the sequence. The
    # rest of the gaps are short. Ensembl does not have a UniProt ID for this
    # transcript. Blat aligns this sequence to several places on the genome
    # all in close proximity to each other. One alignment corresponds to
    # an Ensembl ID but not the one above. It does align to the region of 
    # ENSDART00000053813 but with a different exon structure.
    # ENSDART00000044490 | Q3ZMH2 |    90.74 |      1004 |       64 |       7
    # |  0 |  985 |     88 | 1082 |      0 |     1682 |
    # This has a couple of larger gaps. The UniProt sequence aligns to the 
    # same region as ENSDART00000044490 which has 3 extra exons. There is 
    # another transcript with the same exon structure.
    # | ENSDART00000041503 | Q3ZMH2 |    91.42 |       991 |       63 |
    # 5 |  0 |  974 |     82 | 1068 |      0 |     1684 |
    # This has only slightly higher identity.
    # ENSDART00000025635 | Q4FE55 |    99.33 |      2545 |        6 |       7
    # |  0 | 2542 |      0 | 2537 |      0 |     4859 |
    # just short gaps. This Blats to the same region of ENSDART00000025635
    # and gives the same exon structure.
    # could filter more using pslReps but should not filter on minAli since
    # either the query or target could be partial.
    # Use identity >= 90% as the cutoff and then associate the RefSeqs with
    # ZFIN IDs and update the official ZFIN Gene symbols. 
    # 
    ssh hgwdev  # kkstore02
    cd /cluster/data/danRer3/bed/ensGenes
    mkdir alignments
    cd alignments
# Add a proteinID column to the ensGene table and fill it with the best
# Blastp UniProt hit (>= 90% identity) for each Ensembl transcript:
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ensGenes
    # Add protein ID column:
    hgsql -e 'alter table ensGene add proteinID varchar(40) NOT NULL;' danRer3
    # Add index to this column: 
    hgsql -e 'alter table ensGene add index(proteinID);' danRer3
    # (The step after this one is to download the ZFIN IDs and UniProt IDs.)
    hgsql -e 'select count(*) from ensGene;' danRer3
    # 32143
    # start from an empty proteinID in every row
    hgsql -e 'update ensGene set proteinID = "";' danRer3
    # ensBlastp90 is the table in the test database where proteins have 
    # >=90% identity to the Ensembl proteins (ensBlastp is the >=95% table).
    hgsql -e 'select count(*) from ensGene as g, test.ensBlastp90 as p \
          where g.name = p.query;' danRer3
    # for >= 90% there are
    # 12614
    # for >=95%, there are 
    # 11910
   
    # Use these UniProt IDs to fill in proteinID column.
    hgsql -e 'update ensGene as g, test.ensBlastp90 as p \
          set g.proteinID = p.target where g.name = p.query;' danRer3
    # check that there are 12614 rows with proteinID filled.
    hgsql -e 'select count(*) from ensGene where proteinID != "";' danRer3
    # 12614
    # once this is done, can create ensCanonical and ensIsoforms table -
    # see section on "BUILD GENE SORTER TABLES".
    
    # Add table for Ensembl 38 Ensembl Transcript IDs and RefSeq IDs
    # and Entrez Gene ID (ens38Zfish2 in the test database, loaded from
    # ensGeneInfo38Coding2.txt).
    ssh hgwdev 
    cd /cluster/data/danRer3/bed/ensGenes 
cat << 'EOF' > ens38Zfish2.sql
CREATE TABLE ens38Zfish2 (
    transcriptId varchar(255) not null,     
    entrezGeneId varchar(255) not null,     
    refSeqId varchar(255) not null,
    refSeqProtId varchar(255) not null
);
'EOF'
    # << emacs
    # Strip the header line. "tail -n +2" is the POSIX spelling; the
    # obsolete "tail +2" form is rejected by current coreutils.
    tail -n +2 ensGeneInfo38Coding2.txt > ens38Coding2.tab
    hgLoadSqlTab test ens38Zfish2 ens38Zfish2.sql ens38Coding2.tab
    # 24523 lines where there is no Entrez Gene Id so these are set to 0.
    # count the distinct Entrez Gene IDs
    hgsql -N -e 'select distinct(entrezGeneId) from ens38Zfish2;' test \
         | sort > ens38Zfish2.geneId.uniq
    wc -l ens38Zfish2.geneId.uniq
    # 6764 ens38Zfish2.geneId.uniq 
    # Look at the external database IDs (extDbId) in ens38Zfish and see how
    # many of them can be accounted for via ZFIN/RefSeq.
    hgsql -e 'select count(distinct extDbId) from ens38Zfish;' test
    # 9028
    hgsql -N -e 'select distinct(extDbId) from ens38Zfish;' test \
          | sort > ens38Zfish.extDbId.sort
    # Successively drop IDs containing "NM", "BRARE" and "NP". These are
    # unanchored substring matches (presumably meant to remove RefSeq mRNA
    # accessions, SWISS-PROT display IDs and RefSeq protein accessions -
    # TODO confirm).
    grep -v NM ens38Zfish.extDbId.sort > ens38Zfish.extDbIdNoNM.sort
    # 8982 left
    grep -v BRARE ens38Zfish.extDbIdNoNM.sort \
            > ens38Zfish.extDbIdNoNMandNoSP.sort
    grep -v NP ens38Zfish.extDbIdNoNMandNoSP.sort \
            > ens38Zfish.extDbIdNoNMNoSPNoNP.sort
    wc -l ens38Zfish.extDbIdNoNMNoSPNoNP.sort
    # 5284 ens38Zfish.extDbIdNoNMNoSPNoNP.sort
    # unique values of the second column of ens38/ensToRefSeqvsZFIN.txt
    awk '{print $2}' ens38/ensToRefSeqvsZFIN.txt | sort | uniq \
        > ensToRefSeqvsZFIN.names.uniq
    # how many in common
    comm -12 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \
        > common
    wc -l common
    # 4176 common
    # the rest: remaining extDbIds that are not in the ZFIN-derived list
    comm -23 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \
        > extDbIdNotfromZFINviaRefSeq 
    # RefSeq mRNA accessions in danRer3.refLink with a non-empty locusLinkId
    hgsql -N -e 'select mrnaAcc from refLink where locusLinkId != "";' danRer3 | sort | uniq > mrnaAcc.refLink.dr3.uniq
    wc -l mrnaAcc.refLink.dr3.uniq
    # 8811 mrnaAcc.refLink.dr3.uniq
    # ensToRefSeq.refseq is not created by the commands recorded here; it
    # must already exist and be sorted for comm to work.
    comm -12 mrnaAcc.refLink.dr3.uniq ensToRefSeq.refseq | wc
    # 7738 
    wc -l ensToRefSeq.refseq
    # 7738
    # i.e. every accession in ensToRefSeq.refseq is also in refLink
    # merge the ens38Zfish2 table with ens38ZfishNew.  
    # for the Known Genes details pages. Changed table name from 
    # ensGeneXRef to ensXRefZfish as there are a number of tables already
    # with similar names to ensGeneXRef so this would be confusing.
    # create a table definition for ensXRefZfish:
    # (updated 2006-11-08, hartera)
    cd ~/kent/src/hg/lib
    # autoSql definition: one string field per ID type plus a description
cat << 'EOF' > ensXRefZfish.as
table ensXRefZfish
"Link from an Ensembl Transcript ID to other database IDs and description."
    (
    string ensGeneId;   "Ensembl Transcript ID"
    string zfinId;   "ZFIN ID"
    string uniProtId;   "Unified UniProt protein accession"
    string spDisplayId; "UniProt Display ID"
    string geneId;	"ZFIN Gene Symbol (formerly LocusLink) ID"
    string geneSymbol;  "Official ZFIN Gene Symbol"
    string refSeq;      "RefSeq DNA Accession"
    string protAcc;     "RefSeq Protein Accession"
    string description; "Description"
    )
'EOF'

    # generate the ensXRefZfish.* files from the definition; the header
    # goes to ../inc
    autoSql ensXRefZfish.as ensXRefZfish
    mv ensXRefZfish.h ../inc
    # commit ensXRefZfish* files to CVS.
    # In ensXRefZfish.sql (manual edit): add zfinId, uniProtId, spDisplayId,
    # geneId, geneSymbol, refSeq and protAcc as keys. ensGeneId is already
    # the primary key.
    # description field is not long enough so it must be changed to a
    # longblob. perl edits the file in place, keeping ensXRefZfish.sql.bak.
    perl -pi.bak -e 's/description varchar\(255\)/description longblob/' \
         ensXRefZfish.sql

    # get the gene2refseq file from NCBI to give the Entrez Gene ID
    # and symbol for refSeq accessions. Taxonomy ID is 7955 for Danio rerio.
    # columns in file are tax_id, GeneID, status, 
    # RNA nucleotide accession.version, RNA nucleotide gi, 
    # protein accession.version, protein gi, genomic nucleotide 
    # accession.version, genomic nucleotide gi, start position on the genomic 
    # accession, end position on the genomic accession, orientation.
    # for the gene_info file, column headings are:
    # tax_id, GeneID, Symbol, LocusTag, Synonyms, dbXrefs, chromosome,
    # map location, description, type of gene, Symbol from nomenclature
    # authority, Full name from nomenclature authority, Nomenclature status.
    # Fetch the current NCBI Gene files (versions from Nov. 8, 2006) into a
    # downloads directory and unpack them.
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/ensGenes/downloads
    cd /cluster/data/danRer3/bed/ensGenes/downloads
    wget --timestamping \
         ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz \
         ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
    gunzip gene2refseq.gz gene_info.gz
    # keep only the zebrafish rows: first column (tax_id) equal to 7955
    awk '$1 == 7955' gene2refseq > zfish.gene2refseq
    wc -l zfish.gene2refseq
    # 14659 zfish.gene2refseq
    # 50465 zfish.gene2refseq - in March 
    # Most of the ones no longer in the gene2refseq file are 
    # PREDICTED, PROVISIONAL AND MODEL.
# 37206 MODEL
# 6278 PREDICTED
# 6174 PROVISIONAL
# 43 NA
# 13 Reviewed
# 5 REVIEWED
# 1 VALIDATED
# New sequences added:
# 7021 PROVISIONAL
# 6801 PREDICTED
# 52 NA
# 13 Reviewed
# 12 VALIDATED
# 10 INFERRED
# 5 REVIEWED

    # same tax_id filter (7955) applied to the gene_info file
    awk '$1 == 7955' gene_info > zfish.gene_info
    wc -l zfish.gene_info
    # 38915 zfish.gene_info
    # 38126 zfish.gene_info - in March
    # The zebrafish genes in Ensembl 38 were checked and are the same as in
    # Ensembl 35, the release for which these files were first downloaded
    # (see above - file names were updated to reflect v38).
    # Also fetch the ZFIN file giving gene symbols, ZFIN IDs and RefSeq
    # accessions. ZFIN associates more than one ZFIN ID with UniProt IDs,
    # but ZFIN IDs and RefSeq accessions are one to one, so a RefSeq
    # accession identifies a ZFIN ID and gene name and vice versa.
    wget -N http://zfin.org/data_transfer/Downloads/refseq.txt
    # The ensGeneInfo38Coding.txt and ensGeneInfo38Coding2.txt files have
    # already been loaded into tables so that the information can be put
    # together: these are ens38Zfish and ens38Zfish2 in the test database.
    # first copy the ens38Zfish table and then replace the uniProt column
    # with the best hits from the ensBlastp90 table. 
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ensGenes
    # reuse the ens38Zfish table definition under the new table name
    sed -e 's/ens38Zfish/ens38ZfishNew/' ens38Zfish.sql > ens38ZfishNew.sql
    # create table
    hgsql test < ens38ZfishNew.sql
    hgsql -e 'insert into ens38ZfishNew select * from ens38Zfish;' test
    # Add spDisplayId column:
    hgsql -e \
      'alter table ens38ZfishNew add spDisplayId varchar(255) NOT NULL;' test
    # add some indices (the index named "query" is on the first 20
    # characters of transcriptId)
    hgsql -e 'create index uniProt on ens38ZfishNew (uniProt);' test
    hgsql -e 'create index query on ens38ZfishNew (transcriptId(20));' test
    # first remove uniProt IDs and add those found by Blastp:
    hgsql -e 'update ens38ZfishNew set uniProt = "";' test
    # count the rows that will receive a Blastp UniProt ID
    hgsql -e 'select count(*) from ens38ZfishNew as g, ensBlastp90 as p \
          where g.transcriptId = p.query;' test
    # 37362
    hgsql -e 'update ens38ZfishNew as g, ensBlastp90 as p \
          set g.uniProt = p.target where g.transcriptId = p.query;' test
    # check that 37362 rows have an entry for uniProt - ok
    # add displayIds from uniProt to this table
    hgsql -e 'select count(*) from ens38ZfishNew as g, uniProt.displayId as p \
          where g.uniProt = p.acc;' test
    # 36647
    # 36647 have display IDs in UniProt
    hgsql -e 'update ens38ZfishNew as g, uniProt.displayId as p \
          set g.spDisplayId = p.val where g.uniProt = p.acc;' test
    # check that 36647 of the rows have spDisplayId - ok.
    # Extend ens38ZfishNew with the three ID columns held in ens38Zfish2;
    # one ALTER appends them in the same order as three separate ones.
    hgsql -e 'alter table ens38ZfishNew \
          add entrezGeneId varchar(255) NOT NULL, \
          add refSeqId varchar(255) NOT NULL, \
          add refSeqProtId varchar(255) NOT NULL;' test

    # Copy each column across from ens38Zfish2, matching rows on the
    # transcript ID (one UPDATE per column).
    hgsql -e 'update ens38ZfishNew as g \
          inner join ens38Zfish2 as e on g.transcriptId = e.transcriptId \
          set g.entrezGeneId = e.entrezGeneId;' test

    hgsql -e 'update ens38ZfishNew as g \
          inner join ens38Zfish2 as e on g.transcriptId = e.transcriptId \
          set g.refSeqId = e.refSeqId;' test

    hgsql -e 'update ens38ZfishNew as g \
          inner join ens38Zfish2 as e on g.transcriptId = e.transcriptId \
          set g.refSeqProtId = e.refSeqProtId;' test
    
    # Dump the merged ens38ZfishNew table as a tab-separated file.
    cd /cluster/data/danRer3/bed/ensGenes/downloads/
    hgsql -N -e 'select * from ens38ZfishNew;' test > ens38ZfishNew.txt
    ssh kkstore04
    cd /cluster/data/danRer3/bed/ensGenes/downloads/
    
    # There are 308 cases where there is a RefSeq ID but no Entrez Gene ID.
    # There are 1046 cases where there is an Entrez Gene ID but no RefSeq ID.
    # Use the NCBI files to fill in the gaps where needed.
    # get ZFIN file of ZFIN IDs, gene name and GenBank accession 
    # refseq.txt has ZFIN IDs, gene name and RefSeq ID.
    wget --timestamping http://zfin.org/data_transfer/Downloads/gene_seq.txt
    # compare the (ZFIN ID, gene name) pairs found in the two ZFIN files
    awk '{print $1, $2}' gene_seq.txt | sort | uniq > geneSeq.genes
    awk '{print $1, $2}' refseq.txt | sort | uniq > refSeq.genes
    comm -23 refSeq.genes geneSeq.genes > refSeqOnly
    comm -13 refSeq.genes geneSeq.genes > geneSeqOnly
    wc -l *SeqOnly
    # 9542 geneSeqOnly
    # 827 refSeqOnly
    # get certain fields from each file and merge
    # (FS and OFS are both set once, in BEGIN, so input and output are
    # tab-separated)
    awk 'BEGIN {FS="\t"; OFS="\t"} {print $1, $2, $3, $4, $6;}' \
        zfish.gene2refseq > zfish.gene2refseqSubset.txt
    awk 'BEGIN {FS="\t"; OFS="\t"} \
        {print $2, $3, $5, $6, $9, $10, $11, $12;}' \
        zfish.gene_info > zfish.gene_infoSubset.txt
    # Need to set the $tab variable in .tcshrc file:
    # set tab = "	"
    # need to sort on the GeneID field (second field in refseq file and 
    # first field in gene_info file). join requires both inputs to be in
    # the plain (non-numeric) sort order of the join field alone, with the
    # same field separator, so sort on exactly that tab-delimited key.
    # A numeric sort (sort -n) makes join silently skip lines whenever the
    # two files do not hold exactly the same set of IDs.
    # NOTE: the counts recorded in this doc came from the original
    # "sort -n -k2" / "sort -n -k1" run and may differ slightly on a re-run.
    sort -t "$tab" -k2,2 zfish.gene2refseqSubset.txt | uniq \
         > zfish.gene2refseqSubset.sort
    sort -t "$tab" -k1,1 zfish.gene_infoSubset.txt | uniq \
         > zfish.gene_infoSubset.sort
    # join the two files based on the GeneID (Entrez Gene ID) which is 
    # the second field in refseq file and first field in gene_info file.
    join -t "$tab" -1 2 -2 1 zfish.gene2refseqSubset.sort \
         zfish.gene_infoSubset.sort \
         > zfish.gene2refSeqPlusInfo.txt 
    # The program needs to be written to fill in these gaps for RefSeq ID,
    # Entrez Gene ID and RefSeq Peptide ID. It should then check for the
    # gene symbol using the ZFIN ID using RefSeq ID.  
    # write program taking ensGene38Coding.tsv and ensGene38Coding2.tsv as 
    # input and also the RefSeq files to find Entrez Gene IDs and Gene Symbols.
    # and give the tabbed output for loading into the ensXRefZfish table.
    # hgEnsGeneXRef.c in ~/kent/src/hg/near/hgZfishEnsXRef
    # The output is named ens38XRefZfish.tab, the name the hgLoadSqlTab step
    # expects (it was written here as ens37XRefZfish.tab).
    # NOTE(review): that load step runs from /cluster/data/danRer3/bed/ensGenes,
    # not from this downloads directory - confirm where the file should live.
    /cluster/home/hartera/bin/x86_64/hgZfishEnsXRef \
       ensGeneInfo38.txt zfish.gene2refSeqPlusInfo.txt refseq.txt \
       ens38XRefZfish.tab >& ens38XRefZfish.log
    # load this tabbed file into ensXRefZfish table 
    ssh hgwdev 
    cd /cluster/data/danRer3/bed/ensGenes
    # NOTE(review): the hgZfishEnsXRef step was run in the downloads/
    # subdirectory; confirm that its .tab output is present in this
    # directory under the name used below.
    # remove old table:
    hgsql -e 'drop table ensXRefZfish;' danRer3
    hgLoadSqlTab danRer3 ensXRefZfish ~/kent/src/hg/lib/ensXRefZfish.sql \
          ens38XRefZfish.tab 
    # loaded with no problems.
    # Now need to check its contents:
    mkdir testing
    cd testing
    # NOTE(review): the table was loaded into danRer3 above but this check
    # queries the "test" database - confirm a copy of ensXRefZfish exists
    # there.
    hgsql -N -e 'select zfinId, geneSymbol, refSeq from ensXRefZfish where \
           zfinId != "" AND refSeq != "";' test > zfinIdsymbAndrefseq.txt
    sort zfinIdsymbAndrefseq.txt | uniq > zfinIdsymbAndrefseq.sort
    sort ../refseq.txt | uniq > refseq.sort
    # strip a trailing tab from each line of the ZFIN file (edited in place,
    # original kept as refseq.sort.bak) so that lines can match the table dump
    perl -pi.bak -e 's/\t\n/\n/' refseq.sort
    # rows from the table that are not in the ZFIN file, then rows that are
    comm -23 zfinIdsymbAndrefseq.sort refseq.sort | wc 
    comm -12 zfinIdsymbAndrefseq.sort refseq.sort | wc 
    cd /cluster/data/danRer3/bed/ensGenes/testProgram/tmp3
    # unique ZFIN IDs (5th tab-separated column) in the input file; the
    # output must be redirected, otherwise uniq takes the file name as its
    # input operand
    awk 'BEGIN {FS="\t"} {print $5}' ens38ZfishNew.sort | sort | uniq \
        > ensFile.zfinIds.sort
    # There are 7321 zfin IDs
    # 7284 ZFIN IDs in table and 6499 with a RefSeq.
    hgsql -N -e 'select distinct(zfinId) from ensXRefZfish where refseq = "" \
         and zfinId != "" and geneSymbol = "";' test \
         | sort > zfinIdwithNoRefSeqNoSymb.sort
    # There are 853 with no refseq but a zfinId and no gene symbol and 690
    # are unique ZFIN IDs.
    # compare these to ZFIN IDs in the zfish.gene2refSeqPlusInfo.txt from
    # NCBI files (8th tab-separated column):
    awk 'BEGIN {FS="\t"} {print $8;}' zfish.gene2refSeqPlusInfo.txt \
        | sort | uniq > zfinIds.fromNcbiFile.sort
    # remove first line and "ZFIN:" prefix ("tail -n +2" is the POSIX
    # spelling; the obsolete "tail +2" is rejected by current coreutils)
    tail -n +2 zfinIds.fromNcbiFile.sort | sed -e 's/ZFIN://' \
         > zfinIds.fromNcbiFile.sort2
    comm -13 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort | wc
    # 251 of these with no symbols are not found in the NCBI file
    comm -12 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort \
         > zfinIds.inNcibFile.noRefSeqOrSymbinXRef
    # same comparison against the ZFIN IDs (first column) of ZFIN refseq.txt
    awk '{print $1}' refseq.txt | sort | uniq > refseq.zfId.sort
    comm -13 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort | wc
    # 176 of these with no symbols are not found in the ZFIN RefSeq file
    comm -12 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort \
         > zfinIds.inZfinFile.noRefSeqOrSymbinXRef
    # 435 are in both of these lists
    wc -l *.noRefSeqOrSymbinXRef
    # 439 zfinIds.inNcibFile.noRefSeqOrSymbinXRef
    # 514 zfinIds.inZfinFile.noRefSeqOrSymbinXRef

    
    # edit ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra to give 
    # mySQL queries to ensGtp and ensXRefZfish to retrieve name, protein and
    # description. Changed XRef table name to new name. 
    # The file names ensGene / ensPep as the gene and peptide tables; the
    # name comes from ensGtp and the description and protein from
    # ensXRefZfish, each looked up by the '%s' placeholder.
    # The heredoc word is unquoted here, so the body must not contain
    # anything the shell would substitute (it has no $ or backquotes).
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra
name global
knownGene ensGene
knownGenePep ensPep
nameSql select gene from ensGtp where transcript = '%s'
descriptionSql select description from ensXRefZfish where ensGeneId = '%s'
proteinSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
_EOF_
# << happy emacs

    # created blastp hgNear tables by alignment of Zebrafish Ensembl peptide
    # sequences to the equivalent "Known Genes" peptide sets for other species
    # - see hgNear sections above. Then create an otherOrgs.ra file for 
    # zebrafish specifying the species and databases for these organisms 
    # with blastp homolog tables. 
    # Each stanza is an organism name and its database, separated by a
    # blank line.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/otherOrgs.ra
name human
db hg18

name mouse
db mm8

name rat
db rn4

name drosophila
db dm1

name cElegans
db ce2

name yeast
db sacCer1
_EOF_
     # << this line makes emacs coloring happy
     # add Zebrafish-specific section.ra file
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/section.ra
name method
shortLabel Methods
longLabel Ensembl Genes Methods, Credits, and Data Use Restrictions
priority 140
_EOF_
     # << this line makes emacs coloring happy
     # added links to the Zebrafish links.ra file
     # update links.ra so that link for Ensembl Genes is to the correct
     # stable archive link for Ensembl37 (feb 2006) and change XRef
     # table name to new name.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/links.ra
# Zebrafish-specific link info.
# This contains info to construct the quick links. 

name genome
tables ensGene 
idSql select chrom,txStart+1,txEnd from ensGene where name = '%s'

name family
tables ensGene
idSql select name from ensGene where name = '%s'

name ensemblGenes
shortLabel Ensembl Genes
tables ensGene
idSql select name from ensGene where name = '%s'
url http://feb2006.archive.ensembl.org/Danio_rerio/transview?transcript=%s
priority 25

name zfin
shortLabel ZFIN
tables ensXRefZfish
idSql select zfinId from ensXRefZfish where ensGeneId = '%s'
url http://zfin.org/cgi-bin/webdriver?MIval=aa-markerview.apg&OID=%s
priority 28

name tbSchema
shortLabel Table Schema
tables ensGene

name uniProt
shortLabel UniProt
tables ensXRefZfish
idSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
priority 30

name refSeq
shortLabel RefSeq
tables ensXRefZfish
idSql select refSeq from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=Nucleotide&term=%s&doptcmdl=GenBank&tool=genome.ucsc.edu
priority 40

name refSeqPep
shortLabel RefSeq Peptide
tables ensXRefZfish
idSql select protAcc from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=protein&term=%s&doptcmdl=GenPept&tool=genome.ucsc.edu
priority 42

name entrezGene
shortLabel Entrez Gene
tables ensXRefZfish
idSql select geneId from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrieve&dopt=Graphics&list_uids=%s&tool=genome.ucsc.edu
priority 45

name genBank
hide

name pubMed
hide

name geneCards
hide

name stanfordSource
hide

name cgap
hide

name ensembl
hide 

name aceView
hide
_EOF_
     # << this line makes emacs coloring happy
     # then make my to visualize in own sandbox
     cd ~/kent/src/hg/hgGene
     make my
     # commit *.ra files for Zebrafish to CVS.
     # edited hgGene.c so that the Gene Symbol (if available) is displayed 
     # in the description section of the details page.
     # added ensXRefZfish to ensemblTranscriptId rules in all.joiner.
     # add entry to danRer3/trackDb.ra:
# track ensGene
# shortLabel Ensembl Genes
# longLabel Ensembl v37 Gene Predictions (Protein Coding Genes)
# group genes
# priority 32.8
# visibility pack
# color 150,0,0
# type genePred ensPep
# hgGene on 
 
# STS MARKERS (in progress, 2005-10-13, hartera)
    # DOWNLOADED RECENTLY FROM NCBI
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/stsMarkers
    cd /cluster/data/danRer3/bed/stsMarkers
    # UniSTS is a unique subset of markers that are STS markers from the
    # six zebrafish mapping panels: GAT, HS, LN54, MGH, MOP, T51, and also
    # ZMAP which contains markers from the other panels. Among the markers in
    # these maps, the subset that are STSs with available primer sequences
    # was imported to UniSTS. These include submitted maps and those from
    # the Zebrafish Information Network (ZFIN).

############################################################################
##  BLASTZ swap from mm8 alignments (DONE - 2006-02-28 - Hiram)
    ssh pk
    cd /cluster/data/mm8/bed/blastzDanRer3.2006-02-28
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits danRer3 chainMm8Link
    #   54831876 bases of 1630323462 (3.363%) in intersection


# SWAP CHAINS/NET RN4 (DONE 4/2/06 angie)
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/blastz.rn4.swap
    cd /cluster/data/danRer3/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.danRer3/DEF \
      -workhorse kkr7u00 >& do.log & tail -f do.log
    ln -s blastz.rn4.swap /cluster/data/danRer3/bed/blastz.rn4


############################################################################
##  BLASTZ swap from hg17 alignments (DONE 2006-04-09 markd)
    ssh pk  
    mkdir /cluster/data/danRer3/bed/blastz.hg17.swap
    ln -s blastz.hg17.swap /cluster/data/danRer3/bed/blastz.hg17
    cd /cluster/data/danRer3/bed/blastz.hg17.swap
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
	-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	/cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out&
   # failed due to netChains: looks like previous stage was not 
   # successful (can't find [danRer3.hg17.]all.chain[.gz]).
   #
    mv swap.out swap.out.1
   # rerun with -continue=net
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=net -stop=net \
	-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	/cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out&

   # create the net file (DONE 2006-04-09 markd)
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.hg17.swap/axtChain
    nice netClass -verbose=0 -noAr noClass.net danRer3 hg17 danRer3.hg17.net
    nice gzip danRer3.hg17.net

###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER ASSEMBLIES
# (DONE, 2006-04-17, hartera)
# ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera)

    # followed instructions used in makePanTro2.doc
    ssh kkr1u00
    cd /cluster/data/danRer3/bed
    mkdir -p liftOver
    cd liftOver
    makeLoChain-split danRer3 /cluster/data/danRer3/nib >&! split.log &
    # Took about 30 minutes.
    # add split10k to san for pk runs (2006-05-30, hartera)
    ssh kk
    rsync -a --progress /iscratch/i/danRer3/split10k \
         /san/sanvol1/scratch/danRer3/

###########################################################################
# LIFTOVER CHAINS TO DANRER2 (DONE, 2006-04-25 - 2006-05-03, hartera)
# CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
    # Split (using makeLoChain-split) of danRer2 is doc'ed in makeDanRer2.doc
    # Do what makeLoChain-split says to do next (start blat alignment)
    ssh kk
    mkdir -p /cluster/data/danRer3/bed/liftOver
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-align danRer3 /iscratch/i/danRer3/nib danRer2 \
        /iscratch/i/danRer2/split10k \
        /iscratch/i/danRer2/11.ooc >&! align.log &
    # Took about 5 minutes.
    # Do what its output says to do next (start cluster job)
    cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run
    para try, check, push, check, ...
    para time >&! run.time
# Completed: 782 of 784 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:    4324484s   72074.73m  1201.25h   50.05d  0.137 y
# IO & Wait Time:                 35200s     586.67m     9.78h    0.41d  0.001 y
# Average job time:                5575s      92.92m     1.55h    0.06d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           62741s    1045.68m    17.43h    0.73d
# Submission to last job:        355469s    5924.48m    98.74h    4.11d

    # 2 jobs keep crashing so try them on the pk: chrUn_chrUn and chrUn_chr20
    # need to copy the danRer2 split10k over to the pk
    ssh kkr1u00
    mkdir -p /san/sanvol1/scratch/danRer2/split10k
    rsync -a --progress /iscratch/i/danRer2/split10k/* \
          /san/sanvol1/scratch/danRer2/split10k/
    # copy over 11.ooc file for danRer2
    cp /iscratch/i/danRer2/11.ooc /san/sanvol1/scratch/danRer2
    ssh pk
    cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run
    mkdir extraRun raw
    cd extraRun
    grep chrUn_chrUn ../spec > spec
    grep chrUn_chr20 ../spec >> spec
    # change directories for spec file
    perl -pi.bak -e 's#/iscratch/i#/san/sanvol1/scratch#g' spec
    rm spec.bak 
    para create spec
    para push, check etc.
    para time >& run.time
# Completed: 2 of 2 jobs
# CPU time in finished jobs:     263163s    4386.05m    73.10h    3.05d  0.008 y
# IO & Wait Time:                    62s       1.04m     0.02h    0.00d  0.000 y
# Average job time:              131613s    2193.54m    36.56h    1.52d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:          147104s    2451.73m    40.86h    1.70d
# Submission to last job:        147104s    2451.73m    40.86h    1.70d

    ssh kkr1u00
    # merge all raw output:
    cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25
    mv ./run/raw/*.psl ./raw/
    # lift alignments
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-lift danRer3 danRer2 >&! lift.log &
    # Took about 8 minutes to run.

    # chain alignments
    ssh kki
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-chain danRer3 /iscratch/i/danRer3/nib \
                danRer2 /iscratch/i/danRer2/nib >&! chain.log &
    # Do what its output says to do next (start cluster job)
    cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/chainRun
    para try, check, push, check etc. ...
    para time >&! run.time
# Completed: 28 of 28 jobs
# CPU time in finished jobs:       2751s      45.86m     0.76h    0.03d  0.000 y
# IO & Wait Time:                   879s      14.64m     0.24h    0.01d  0.000 y
# Average job time:                 130s       2.16m     0.04h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             598s       9.97m     0.17h    0.01d
# Submission to last job:          1520s      25.33m     0.42h    0.02d

    # net alignment chains
    ssh kkstore02
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-net danRer3 danRer2 >&! net.log &
    # Took about 24 minutes to run.
    # load reference to over.chain into database table,
    # and create symlinks  /gbdb  and download area
    ssh hgwdev
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-load danRer3 danRer2 >&! load.log &
    # clean up
    rm *.log
    # test by converting a region using the "convert" link on
    # the browser, and comparing to blat of the same region

    # CLEANUP for LiftOver blat directory (2006-12-14, hartera)
    ssh kkstore02
    # the blat run directory was created under danRer3/bed, so cd there
    # first; otherwise rm runs in the login directory and finds nothing
    cd /cluster/data/danRer3/bed
    rm -r blat.danRer2.2006-04-25

# REDO BACENDS - bacEndPairs, bacEndSingles, bacEndBadPairs and all_bacends
# (split as chrN_allBacends) ONLY (DONE, 2006-05-01 - 2006-05-08, hartera) 
# RELOADED chrN_allBacends TABLES (DONE, 2006-06-08, hartera)
# RECREATED all_bacends table WITH ONLY RELEVANT PSLS FOR THE LFS BED 
# TABLES FOR PAIRS, PAIRSBAD AND SINGLES (DONE, 2006-08-04, hartera)
     # NOTE: there are overlapping BAC clone ends for danRer3. Some of these
     # are only a few kb apart (from beginning of one to end of the other)
     # so use stricter pslPairs parameters as for human and mouse.
     # These BAC Ends should be about 150-200 kb. Typically, they are
     # 50 - 300 kb apart.
     # NOTE: IN FUTURE, IF SPLITTING all_bacends TABLE BY CHROM AND
     # RENAMING AS chrN_allBacends THEN USE allBacends INSTEAD OF
     # all_bacends AS ARGUMENT TO pslPairs. THIS WILL THEN AUTOMATICALLY
     # ADD THE CORRECT PSL TABLE NAME TO THE BED (LFS) TABLES
     ssh kkstore02
     mkdir /cluster/data/danRer3/bed/bacends/pairsNew
     cd /cluster/data/danRer3/bed/bacends/pairsNew
     set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
     /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl \
        $bacDir/bacEndPairs.txt all_bacends bacEnds
     wc -l bacEnds.*
     # 1725 bacEnds.long
     # 12081 bacEnds.mismatch
     # 242235 bacEnds.orphan
     # 156444 bacEnds.pairs
     # 616 bacEnds.short
     # 1017 bacEnds.slop

     echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
          > ../header
     echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
     # make pairs bed file
     cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
               | headchg -del > bacEndPairs.bed
     # also need to process bacEndSingles.txt into a database table
     # for singles in bacEndSingles.txt, create a dummy file where they
     # are given zJA11B12T7 as dummy sequence pair. If the single is a forward
     # sequence, put the dummy sequence in the second column, if the single is
     # a reverse sequence put in first column. use a perl script to do this.
     cd /cluster/data/danRer3/bed/bacends
     set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
     mkdir singlesNew
     cd singlesNew
     cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
     perl formatSingles.pl $bacDir/bacEndSingles.txt > \
                           $bacDir/bacEndSingles.format
     # then run pslPairs on this formatted file
     /cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
     -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
     -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
     all_bacends bacEnds
     wc -l bacEnds.*
     # 0 bacEnds.long
     # 0 bacEnds.mismatch
     # 11439 bacEnds.orphan
     # 0 bacEnds.pairs
     # 0 bacEnds.short
     # 0 bacEnds.slop
     # there are 11439 orphans here and 242235 from pair analysis so 
     # a total of 253674 orphans
     cat bacEnds.orphan ../pairsNew/bacEnds.orphan > bacEnds.singles
     wc -l bacEnds.singles
     # 253674 bacEnds.singles
     # make singles bed file
     cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
                  | headchg -del > bacEndSingles.bed
     cp bacEndSingles.bed ../pairsNew
     cd ../pairsNew
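     # all slop, short, long, mismatch and orphan pairs go into
     # bacEndPairsBad.bed. NOTE: the orphans are already in bacEndSingles, so
     # this file is not the one processed further; the NoOrphans version made
     # below is what is de-duplicated and loaded as the bacEndPairsBad table.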
     cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
        bacEnds.orphan | row score ge 300 | sorttbl chr start \
        | headchg -del > bacEndPairsBad.bed
     # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans 
     # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans

     cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
        | row score ge 300 | sorttbl chr start \
        | headchg -del > bacEndPairsBadNoOrphans.bed
     # use extractPslLoad later to get all_bacends.psl for database

     # There are rows where the alignments were the same but the lfNames are
     # different. This is due to the presence of multiple reads for the 
     # same BAC end sequence. Sometimes they are slightly different lengths 
     # so the alignments are a little different. It would be good to 
     # consolidate all of these. Firstly, the identical rows were merged into 
     # one with a list of all the lfNames corresponding to that alignment.
     
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends/pairsNew
     mkdir -p /cluster/data/danRer3/bed/bacends/duplicatesNew
     cd /cluster/data/danRer3/bed/bacends/duplicatesNew
     mkdir -p /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
     cd /cluster/data/danRer3/bed/bacends/duplicatesNew
     ln -s /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
     # write program to do this for linked feature series (lfs) which
     # is the type of data structure used for BAC ends.
     # Need a bed file sorted by chrom and chromStart 
     cd overlapRun
     # Write a sorted copy of each pairsNew bed file here as ${f}.lfs.
     # NOTE: -k1,2 defines a single text key spanning fields 1-2 (there is
     # no "n" modifier), so chromStart is ordered as a string, not
     # numerically.
     # NOTE(review): confirm lfsOverlap does not need numeric chromStart order.
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
        sort -k1,2 /cluster/data/danRer3/bed/bacends/pairsNew/${f}.bed \
              > ${f}.lfs
     end
     wc -l *.lfs
     # 155242 bacEndPairs.lfs
     # 15311  bacEndPairsBadNoOrphans.lfs
     # 221821 bacEndSingles.lfs

     # remove replicate rows where names match and the overlapping region
     # (chromEnd - chromStart) is greater than or equal to 0.999.
     ssh kolossus
     cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         echo "Processing $f"
         nohup nice /cluster/bin/x86_64/lfsOverlap ${f}.lfs \
               ${f}.bed -name -minOverlap=0.999 -notBlocks
     end
     # Started: May 3 23:30 PID: 9199
     # pairs started: May 5 18:10, PID: 13232
     # Segmentation fault with bacEndSingles. This is a very large file so
     # run again using the file split into two
     # chr24 starts at line 109407
     # split at the chr24 boundary: lines 1-109406 and 109407-end.
     # The "-n" forms are used because the obsolete "head -N" / "tail +N"
     # spellings are not accepted by all versions of these tools.
     head -n 109406 bacEndSingles.lfs > bacEndSinglesPart1.lfs
     tail -n +109407 bacEndSingles.lfs > bacEndSinglesPart2.lfs
     # then try again:
     foreach f (bacEndSinglesPart1 bacEndSinglesPart2)
         echo "Processing $f"
         nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \
               ${f}.bed -name -minOverlap=0.999 -notBlocks
     end
     # merge results
     cat bacEndSinglesPart*.bed > bacEndSingles.bed

     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun
     # check the numbers of lines are correct
    
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
             | sort | uniq -c | sort -nr > ${f}.uniqCount
     end
     wc -l *
     # 155164 bacEndPairs.bed
     # 155242 bacEndPairs.lfs
     # 155189 bacEndPairs.uniqCount
     # 15293 bacEndPairsBadNoOrphans.bed
     # 15311 bacEndPairsBadNoOrphans.lfs
     # 15303 bacEndPairsBadNoOrphans.uniqCount
     # 221771 bacEndSingles.bed
     # 221821 bacEndSingles.lfs
     # 221799 bacEndSingles.uniqCount
     # 109390 bacEndSinglesPart1.bed
     # 109406 bacEndSinglesPart1.lfs
     # 112381 bacEndSinglesPart2.bed
     # 112415 bacEndSinglesPart2.lfs
     # different numbers for unique count since some of these alignments 
     # were not identical but very close to identical (>0.999 overlap) 
     cd /cluster/data/danRer3/bed/bacends/duplicatesNew
     mv ./overlapRun/* .
     rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
     # Use perl script to choose 2 BAC ends to represent each BAC clone.
     # since there are often more than one read for each BAC end in this set,
     # 2 were chosen for each BAC pair or 1 for the singles. This was based on
     # the ones that had the largest region aligned (using lfSizes).
     # copy perl script over that was used for danRer2
     cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
        pickLfNamesv2.pl 
     # edit so that regular expression for matching BAC end names is the 
     # same as that used in ../bacends.1/getBacEndInfov2.pl
     # need to sort by chrom, chromStart

     # Write a sorted copy of each de-duplicated bed file as ${f}Sort.bed.
     # NOTE: -k1 has no end field, so the first key runs from field 1 to the
     # end of the line; this is a text sort on the whole line, and the -k2
     # and -k3 keys can only break ties between lines that already compare
     # equal. chromStart is therefore ordered as a string, not numerically.
     foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
     end
     # run perl script: input bed file, pairs or singles, name of output file
     perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
     mv error.log log.pairs
     # log.pairs is empty
     perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
     mv error.log log.singles
     sort log.singles | uniq > log.singles.uniq
     cp bacEndSinglesSort.bed bacEndSingles2Sort.bed
     # log.singles has 15 cases where alignments for a BAC clone use 
     # different sequence reads for either the T7 or SP6 BAC end.
     # singles may include both BAC ends for a clone in the case
     # where they aligned to different chromosomes or a long way apart on 
     # the same chromosome (orphans). mostly those that have a different read
     # align to an almost identical or largely overlapping region.
     # CH211-189J23: zC189J23.ya and zC189J23.yb align to overlapping regions.
     # Use zC189J23.yb as aligns to a longer region and remove the other one.
     # CH211-42D5
     # some sequences appear to be different: CH211-98J20 - zC98J20.yb and
     # zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
     # like it has low complexity sequence, this is discarded and zKp107B4.yb 
     # is kept. zKp107B4.za and zKp107B4.zb only align in the first ~ 59bp.
     # zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp 
     # on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
     # In these cases, the 2 sequences align to different regions.
     # Some sequences have overlapping alignments as one sequence is a bit
     # longer than the other.
     perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
          badPairs2lfNames.bed
     mv error.log log.badPairs
     # no alignments have a different pair of ends to other alignments
    
     # for each of these new bed files, checks were made that there are
     # only 2 BAC ends per alignments for pairs and 1 for singles.
     # For each pair, there should only be 2 ends which can appear either
     # way round depending on the orientation and there should be 1 end for
     # the beginning (suffix T7, t7 or z) and one end for the end
     # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
     # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
     # orientation. For singles, there should be a single BAC end for each
     # alignment and for each BAC clone, a sequence for either or both types
     # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
     # alignments.
     # e.g. the checks for the pairs file:
     wc -l pairs2lfNames.bed
     # count the lines containing a comma (-c prints the count rather than
     # the matching lines)
     grep -c ',' pairs2lfNames.bed
     # should be the same number, every line should have a comma
     # get the unique entries of field 11 (the BAC end names, "end1,end2")
     awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends
     wc -l pairs.ends
     # put each end name on its own line and count them
     sed -e 's/,/\n/g' pairs.ends > pairs.ends2
     wc -l pairs.ends2
     # should be twice the number of lines in pairs.ends, just 2 end names
     # per line
     # reduce each line of pairs.ends to its two end-type suffixes, in place
     # (the unedited file is kept as pairs.ends.bak)
     perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends
     sort pairs.ends | uniq > pairs.ends.uniq
     # check that each of these have the correct pair type

     # Finally overlaps in BAC clone names were checked. All BAC clones
     # represented in each of the pairs, badPairs and singles bed files are
     # unique to that file. Between all three bed files, 300323 BAC clones
     # have alignments. 512886 clone ends are aligned in these three bed files. 
     foreach f (*.bed)
        awk '{print $4}' $f | sort | uniq > ${f}.names
     end
     comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names
     comm -12 pairs2lfNames.bed.names singles1lfName.bed.names
     comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names
     # None of these files should have any BAC clone names in common and
     # they do not so they are ok.
     # clean up:
     # (the split files are named bacEndSinglesPart1.lfs and
     # bacEndSinglesPart2.lfs, with no "." before "Part", so the pattern
     # must be *Part1.lfs for the first one to be removed)
     rm *Part1.bed *Part2.bed *.names *.ends *.ends2 *Part1.lfs *Part2.lfs
     rm *.uniqCount
     # NOTE: using sort and uniq on hgwdev produces tab delimited output
     # after merging rows with the same BAC name, the scoring is now
     # wrong in the bed files.
     # Scores should be 1000 if there is 1 row for that name, else
     # 1500/number of rows for that sequence name - calculated by pslPairs.
     # Correct the scores. The co-ordinates for the singles also need to be
     # corrected.
                                                                                
     mkdir -p /cluster/data/danRer3/bed/bacends/scoresAndCoords
     cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
     # copy over correctScores2.pl and checkScores.pl scripts from danRer2 and
     # edit both scripts so that the hits file is split on spaces, not on tabs
     cp \
   /cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/correctScores2.pl .
     cp \
     /cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/checkScores.pl .
     awk '{print $4}' ../duplicatesNew/pairs2lfNames.bed \
                 | sort | uniq -c > pairs.hits
     perl correctScores2.pl ../duplicatesNew/pairs2lfNames.bed pairs.hits \
          noBin > bacEndPairsGoodScores.bed
     # same for singles
     awk '{print $4}' ../duplicatesNew/singles1lfName.bed \
                 | sort | uniq -c > singles.hits
                                                                                
     perl correctScores2.pl ../duplicatesNew/singles1lfName.bed singles.hits \
                 noBin > bacEndSinglesGoodScores.bed
                                                                                
     # and for badPairs
     awk '{print $4}' ../duplicatesNew/badPairs2lfNames.bed \
                 | sort | uniq -c > badPairs.hits
     perl correctScores2.pl ../duplicatesNew/badPairs2lfNames.bed \
          badPairs.hits noBin > bacEndPairsBadGoodScores.bed
     # check that the scores are now correct  
     awk '{print $4, $5}' bacEndPairsGoodScores.bed \
         | sort | uniq -c > pairs.count
     perl checkScores.pl < pairs.count
     # all the BAC clones should be in good.txt and none in bad.txt
     # wc -l should give same number of lines in good.txt as in pairs.hits
     # repeat for other bed files
     awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
         | sort | uniq -c > badPairs.count
     perl checkScores.pl < badPairs.count
     awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
         | sort | uniq -c > singles.count
     perl checkScores.pl < singles.count
     # for the singles, 7 ended up in bad.txt because their scores 
     # were 214.285714285714 which is correct for 7 alignments. rounding the
     # score caused the discrepancy.
     # For singles, the co-ordinates in the lfs table are wrong. The
     # chromStart should be the same as the lfsStart and chromEnd - chromStart
     # should be the same as lfSizes. Need to correct these:
     # pslPairs has added min/2 to the end or subtracted min/2 from the start
     # depending on whether it is a left or a right BAC end and the 
     # alignment orientation. min used here was 25000.
     awk 'BEGIN {FS="\t"} {OFS="\t"} \
      {if ($2 != $9) print $1,$9,$3,$4,$5,$6,$7,$8,$9,$10,$11; \
      else print $1,$2,$3 - 12500,$4,$5,$6,$7,$8,$9,$10,$11;}' \
      bacEndSinglesGoodScores.bed \
      > bacEndSinglesGoodScoresAndCoords.bed
     # clean up; the singles file with uncorrected co-ordinates
     # (bacEndSinglesGoodScores.bed) is superseded by
     # bacEndSinglesGoodScoresAndCoords.bed
     rm error.log *.txt *.count *.hits bacEndSinglesGoodScores.bed

     ssh hgwdev
     cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
     # copy over table definition from danRer2
     cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
        ../singlesNew/
     # Now load database tables:
     hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
               -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
     # Loaded 155164 elements of size 11
     hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScoresAndCoords.bed \
               -sqlTable=../singlesNew/bacEndSingles.sql -notItemRgb
     # Loaded 221754 elements of size 11
     # 221754 record(s), 0 row(s) skipped, 57 warning(s) loading bed.tab
     # warnings are unknown but all of bed file loaded and the number
     # of warnings is small so ignore
     hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
               -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
     # Loaded 15293 elements of size 11
     # load BAC end sequences into seq table so alignments may be viewed
     mkdir -p /gbdb/danRer3/bacends
     ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
                                /gbdb/danRer3/bacends/Zv5BACends.fa
     hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa

     # create file for loading all_bacends table
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
     # for all_bacends table, just load the alignments for those sequences
     # represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
     # bacEnds.load.psl is the file of alignments
     # get all the names of sequences 
     foreach f (*.bed)
       echo $f
       awk '{print $11;}' $f >> allBacEnds.names
     end
     wc -l allBacEnds.names
     # 392211 allBacEnds.names
     # this is the total number of lines in the *.bed files
     perl -pi.bak -e 's/,/\n/g' allBacEnds.names
     sort allBacEnds.names | uniq > allBacEnds.names.uniq
     wc -l allBacEnds.names.uniq
     # 512321 allBacEnds.names.uniq
     # get alignments for just the BAC ends that are in the database tables
     # make bacEnds.load.psl
     cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
     extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
         bacEndPairsBadGoodScores.bed bacEndSinglesGoodScoresAndCoords.bed | \
         sorttbl tname tstart | headchg -del > bacEnds.load.psl
    # check that alignments are present for all BAC ends in 
    # allBacEnds.names.uniq
    awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
    comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
    # 512321
    wc -l *
    # 512321 allBacEnds.names.uniq
    # 512321 bacEnds.names
   
    # Reloaded split tables. Old bacEnds.load.psl was used 
    # last time. (2006-06-08, hartera)
    ssh hgwdev
    cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
    # remove old all_bacends table. This was moved over from hgwbeta after
    # the recent crash of hgwdevold after the power failure.
    hgsql -e 'drop table all_bacends;' danRer3 
    # Display is very slow for BAC ends on large regions. Try splitting
    # bacEnds.load.psl and load tables as chrN_allBacends. The parsing
    # code is confused if there are two underscores in the table name.
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        echo "Processing $c ..."
        awk '{if ($14 == "'chr${c}'") print;}' \
           /cluster/data/danRer3/bed/bacends/scoresAndCoords/bacEnds.load.psl \
           > chr${c}.bacEnds.load.psl
    end
    # drop old tables
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
       echo $c
       hgsql -e "drop table chr${c}_allBacends;" danRer3
    end
    # load new tables
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
     nice hgLoadPsl danRer3 -table=chr${c}_allBacends chr${c}.bacEnds.load.psl
    end
    # load of chr5_allBacends did not go as planned: 326147 record(s), 
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr8_allBacends did not go as planned: 212665 record(s), 
    # 0 row(s) skipped, 5 warning(s) loading psl.tab
    # load of chr12_allBacends did not go as planned: 156947 record(s), 
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr15_allBacends did not go as planned: 181721 record(s), 
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr19_allBacends did not go as planned: 282423 record(s), 
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr20_allBacends did not go as planned: 315248 record(s), 
    # 0 row(s) skipped, 7 warning(s) loading psl.tab
    # load of chrUn_allBacends did not go as planned: 1524765 record(s), 
    # 0 row(s) skipped, 487 warning(s) loading psl.tab    

    # There are still warnings on loading, most (487) are for chrUn.
    # alter lfs (BED) tables so that pslTable field is "allBacends"
    # instead of all_bacends (this was set by the pslPairs program).
    foreach t (bacEndPairs bacEndSingles bacEndPairsBad)
       hgsql -e "update $t set pslTable = 'allBacends';" danRer3
    end
    # This improves the performance a lot.
    # corrected termRegex for some bacCloneXRef searches in trackDb.ra so 
    # that they work correctly (bacPairsIntName, bacSinglesIntName, 
    # bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)

    # Remake the all_bacends table. extractPslLoad extracts psl alignments
    # by name so even those that are filtered out end up in the all_bacends
    # table. Wrote a program that matches BAC end psl alignments from the
    # bacEnd{Pairs, PairsBad, Singles} tables by name, chrom, chromStart and
    # chromEnd.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    mkdir extractPsl
    cd extractPsl
    # Some scores in bacEndSinglesGoodScoresAndCoords.bed are not integers
    # so fix these and also for the other bacEnd files just in case.
    # roundPslScore.pl <bed file>: prints each line of the tab-separated
    # input to stdout with the score (5th field) rounded to an integer.
cat << '_EOF_' > roundPslScore.pl
#!/usr/bin/perl -w
use strict;

my $file = $ARGV[0];

open(my $fh, '<', $file) || die "Can not open $file: $!\n";
while (my $line = <$fh>)
{
# set the line ending aside so that it is written back unchanged
chomp(my $text = $line);
my $eol = substr($line, length($text));
# a limit of -1 keeps any trailing empty fields
my @f = split(/\t/, $text, -1);
# Round the score field itself. Substituting the score text into the whole
# line would change the first place that text matches, which can be an
# earlier field, and the "." of a float score acts as a regex wildcard.
if (@f > 4)
    {
    $f[4] = round($f[4]);
    }
print join("\t", @f), $eol;
}
close($fh);

# round a non-negative number to the nearest integer
sub round {
    my($number) = shift;
    return int($number + .5);
}
'_EOF_'
    chmod +x roundPslScore.pl
    set bacDir=/cluster/data/danRer3/bed/bacends
    perl roundPslScore.pl $bacDir/scoresAndCoords/bacEndPairsGoodScores.bed \
         > bacEndPairsRoundScore.bed
    perl roundPslScore.pl $bacDir/scoresAndCoords/bacEndPairsBadGoodScores.bed \
         > bacEndPairsBadRoundScore.bed
    perl roundPslScore.pl \
         $bacDir/scoresAndCoords/bacEndSinglesGoodScoresAndCoords.bed \
         > bacEndSinglesRoundScore.bed
    
    nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
      $bacDir/bacEnds.psl bacEndPairsRoundScore.bed bacPairs.psl   
    nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
      $bacDir/bacEnds.psl bacEndPairsBadRoundScore.bed bacPairsBad.psl   
    nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
     $bacDir/bacEnds.psl \
     bacEndSinglesRoundScore.bed bacSingles.psl   
    cat bac*.psl > allBacends.load.psl
    
    # Now load database tables:
    # Do not need to reload singles table as it is still the same, the 
    # scores were rounded to 214 on loading. These are the only scores that
    # are floats rather than integers. 
    # Drop old split bacends tables and reload new one with only those psls 
    # relevant to alignments in the lfs tables. 
    ssh hgwdev
    cd /cluster/data/danRer3/bed/bacends/extractPsl
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
       hgsql -e "drop table chr${c}_allBacends;" danRer3
    end
    # change the bacEnd{Pairs, PairsBad, Singles} tables so that the 
    # pslTable is all_bacends again.
    foreach b (Pairs PairsBad Singles)
       hgsql -e "update bacEnd${b} set pslTable = 'all_bacends';" \
                danRer3
    end
    # Then load all_bacends table. Now there are far fewer alignments than
    # before, so they can all go in one table. Previously the large table
    # size was slowing down the Browser at zoomed out display levels
    # due to slow access of the very large all_bacends table.
    wc -l allBacends.load.psl
    # 549408 allBacends.load.psl
    hgLoadPsl danRer3 -table=all_bacends allBacends.load.psl
    hgsql -e 'select count(*) from all_bacends;' danRer3
    # 549408
    # Table contains the correct number of rows.
    # Get all the lfNames from the bed files and check that these are all
    # represented in allBacends.load.psl
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends/extractPsl
    foreach p (*RoundScore.bed)
        awk '{print $11}' $p >> bedFiles.names
    end
    perl -pi.bak -e 's/,/\n/' bedFiles.names
    sort bedFiles.names | uniq > bedFiles.names.uniq
    # get psl file names
    awk '{print $10}' allBacends.load.psl | sort | uniq > pslFile.names.uniq
    wc -l *.uniq
    # 512321 bedFiles.names.uniq
    # 512321 pslFile.names.uniq
    comm -12 bedFiles.names.uniq pslFile.names.uniq | wc -l
    # 512321
    # Therefore all names from BED files are in PSL file.
    rm bedFiles* pslFile*
    cd /cluster/data/danRer3/bed/bacends
    rm -r all_bacends
    rm ./scoresAndCoords/*.bacEnds.load.psl
    # Duplicate rows in bacCloneXRef and bacCloneAlias tables so remove
    # these, reload tables and test - see sections on  
    # CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES and 
    # BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES

#######################################################################
    # RE-DO RH MAP:
    # isPcr of sequences. 
    # 1) Make a list from FASTA file of sequences.
    # 2) get one record per file. - need to just split on '>' 
    # 3) use rhFix to adapt to get primers, one set per file and name
    # after sequence
    # run isPcr as cluster job - one per sequence and primers set
    # get RHmap info again. need to remove spaces in primers seqs

cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f 

# Convert each FASTA header line (starting with ">") into one tab-separated
# row; every other input line is ignored. Example header:
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
# Output columns, all upper-cased:
#   field1.field9, "LG"field2, field3, field4, field5, field9, field10,
#   field11, field12
/^>/ {
    hdr = $0;
    sub(/>/, "", hdr);      # drop the FASTA marker
    sub(/\//, "_", hdr);    # first "/" in the header becomes "_"
    gsub(/ /, "", hdr);     # remove all spaces (e.g. inside primer sequences)
    split(toupper(hdr), fld, "\\|");
    printf "%s.%s\tLG%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \
           fld[1], fld[9], fld[2], fld[3], fld[4], fld[5], \
           fld[9], fld[10], fld[11], fld[12];
    next;
}
'_EOF_'
# << keep emacs coloring happy
    chmod +x getRhInfo
    getRhInfo ../../rhMap.headers2 > rhMapInfo.tab
     
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
    mkdir -p isPcr/primers
    cd isPcr/primers
    # create primers files
    ssh kkstore02
    cd \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers
    # For every row of rhMapInfo.tab with a non-empty column 8, write
    # columns 1, 8 and 9 (tab-separated) to <column 1>.primers.fa.
    # The output file name is parenthesized because awk implementations
    # disagree on how to parse "print ... > a b". The line break sits
    # between two awk rules, where a newline is always legal.
    awk 'BEGIN {FS="\t"; OFS="\t"} \
         $8 != "" {print $1,$8,$9 > ($1 ".primers.fa")}' rhMapInfo.tab
    # there are 7519 primer sets which is correct.
    # get list of sequences
    cd ..
    mkdir markerSeqs
    cd markerSeqs
    grep '>' ../../rhMap.fa | wc
    # 11514
    # get all sequences. There are 11514 total.
    # use faSplit sequence 11514 
    # rhMap.fa is file. Need to fix that one name:
    perl -pi.bak -e 's/\//_/' ../../rhMap.fa
    # splits sequences up with one file per name named with sequence name
    faSplit byname ../../rhMap.fa rhMap
    ls | wc -l
    # 11514
    ssh pk
    # make run dir on the san and link to isPcr dir
    mkdir -p /san/sanvol1/scratch/danRer3/bacends/isPcrRun
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
    ln -s /san/sanvol1/scratch/danRer3/bacends/isPcrRun .
    # get list of sequences with primers
    cd \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun
    # List column 1 of every rhMapInfo.tab row with a non-empty column 8
    # in primerSeqs.lst. The line break sits between two awk rules, where
    # a newline is always legal.
    awk 'BEGIN {FS="\t"; OFS="\t"} \
         $8 != "" {print $1 > "primerSeqs.lst"}' \
       /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/rhMapInfo.tab
   
    foreach m (`cat primerSeqs.lst`)
       echo /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc -stepSize=5 /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/markerSeqs/${m}.fa /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/${m}.primers.fa '{'check out line+ /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun/out/${m}.psl'}' >> jobList
    end
    para create jobList
    para try, check, push, check etc. ...
    # there are 654 that do not have isPcr results. Checked Z4664.MGH and 
    # found that the primers would not align with Blat either.
    # these are in unmatchedPrimers. They crashed even if maxSize=50000 and
    # if -flipReverse used.
    mkdir notMatchedPrimers notMatchedSeqs 
    perl -pi.bak -e 's/\.fa//' unmatchedPrimers
    foreach f (`cat unmatchedPrimers`)
     set d=/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
     cp ${d}/primers/${f}.primers.fa ./notMatchedPrimers/
     cp ${d}/markerSeqs/${f}.fa ./notMatchedSeqs
    end
    
    tar cvzf primers.tar.gz notMatchedPrimers/*primers.fa
    tar cvzf markers.tar.gz notMatchedSeqs/*.fa
    # sent these to Yi Zhou by e-mail and see if they can look at them.
    # include the isPcr parameters.
    # from PSL extract sequence. need tName, tStart and tEnd, fields 14, 16 and
    # 17. Then used faFrag to get sequence from FASTA file.

############################################################################
##  BLASTZ swap from panTro2 alignments (DONE 2006-05-07 markd)
    ssh hgwdev64
    mkdir /cluster/data/danRer3/bed/blastz.panTro2.swap
    ln -s blastz.panTro2.swap /cluster/data/danRer3/bed/blastz.panTro2
    cd /cluster/data/danRer3/bed/blastz.panTro2.swap
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
	-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	/cluster/data/panTro2/bed/blastz.danRer3/DEF >& swap.out&

   # create the net files
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.panTro2.swap/axtChain
    nice netClass -verbose=0 -noAr noClass.net danRer3 panTro2 danRer3.panTro2.net

###########################################################################
# LIFTOVER CHAINS TO DANRER4 (DONE, 2006-05-31 - 2006-06-06, hartera)
# CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
   # Split (using makeLoChain-split) of danRer4 is doc'ed in makeDanRer4.doc
   # Do what makeLoChain-split says to do next (start blat alignment)
   # Use pk as runs faster than on kk. Scripts only run on kk so run manually.
   ssh pk
   mkdir -p /cluster/data/danRer3/bed/liftOver
   cd /cluster/data/danRer3/bed/liftOver
# align.csh <oldAssembly> <oldNibDir> <newAssembly> <newSplitDir> [ooc]
# Sets up (but does not start) a parasol blat run:
#   - <oldNibDir>/*.{nib,2bit} are the blat targets and
#     <newSplitDir>/*.{nib,fa} the queries, each listed largest file first.
#   - If an ooc file is given it is passed to blat as -ooc=<file>.
#   - The run directory /cluster/data/<oldAssembly>/bed/blat.<newAssembly>.<date>
#     is removed if it already exists and recreated with raw, psl and run
#     subdirectories.
#   - A gensub2 template (run/gsub) is written with the blat options
#     -tileSize=11 -minScore=100 -minIdentity=98 -fastMap, one job per
#     target/query pair writing ../raw/<target>_<query>.psl.
#   - gensub2 and "para create" are run, then the next steps are printed.
cat << '_EOF_' > align.csh
#!/bin/csh -fe
set oldAssembly = $1
set oldNibDir = $2
set newAssembly = $3
set newSplitDir = $4
set ooc = $5
if ("$ooc" != "") then
    set ooc = '-ooc='$ooc
endif

set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d`
echo "Setting up blat in $blatDir"
rm -fr $blatDir
mkdir $blatDir
cd $blatDir
mkdir raw psl run
cd run

echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \
       '-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \
  >> gsub
echo '#ENDLOOP' >> gsub

# target
ls -1S $oldNibDir/*.{nib,2bit} > old.lst
# query
ls -1S $newSplitDir/*.{nib,fa} > new.lst

gensub2 old.lst new.lst gsub spec
/parasol/bin/para create spec

echo ""
echo "First two lines of para spec:"
head -2 spec
echo ""
echo "DO THIS NEXT:"
echo "    cd $blatDir/run"
echo "    para try, check, push, check, ..."
echo ""
exit 0
'_EOF_'
   # << emacs
   chmod +x align.csh
   align.csh danRer3 /san/sanvol1/scratch/danRer3/nib danRer4 \
       /san/sanvol1/scratch/danRer4/split10k \
       /san/sanvol1/scratch/danRer4/danRer4_11.ooc >&! align.log &
   # Took a few seconds.
   # Do what its output says to do next (start cluster job)
   cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/run
   para try, check, push, check, ...
   para time
# Completed: 784 of 784 jobs
# CPU time in finished jobs:    2011355s   33522.59m   558.71h   23.28d  0.064 y
# IO & Wait Time:                  3926s      65.43m     1.09h    0.05d  0.000 y
# Average job time:                2571s      42.84m     0.71h    0.03d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:          205412s    3423.53m    57.06h    2.38d
# Submission to last job:        219860s    3664.33m    61.07h    2.54d
   
   ssh pk
   cd /cluster/data/danRer3/bed/liftOver

cat << '_EOF_' > lift.csh
#!/bin/csh -ef
# lift.csh <oldAssembly> <newAssembly>
# Lifts the raw blat output of the newest
# /cluster/data/<oldAssembly>/bed/blat.<newAssembly>.20* run directory from
# split-chunk query coordinates to chromosome coordinates, using the .lft
# files in /san/sanvol1/scratch/<newAssembly>/split10k. One
# ../psl/<chrom>.psl is written per chromosome in <newAssembly>'s
# chrom.sizes. Exits 1 if the raw directory or the .lft files are missing.
set oldAssembly = $1
set newAssembly = $2
set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k

set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
# ls -t lists newest first, so this picks the most recently modified run dir.
set blatDir = `ls -td $prefix.20* | head -1`
echo "using dir $blatDir"

if ( ! -e $blatDir/raw ) then
    echo "Can't find $blatDir/raw"
    # Stop here, as the .lft check below does, instead of carrying on
    # without the raw blat output.
    exit 1
endif

if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
    echo "Can't find any .lft files in $newLiftDir"
    exit 1
endif
cd $blatDir/raw

# -pslQ lifts the query side of each alignment.
foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
    echo $chr
    liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
end

# $0:h is the directory part of the path this script was invoked with.
set execDir = $0:h
echo ""
echo "DO THIS NEXT:"
echo "    ssh pk"
echo "    $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
echo ""
exit 0
'_EOF_'
   # << emacs
   chmod +x lift.csh
   lift.csh danRer3 danRer4 >&! lift.log &
   # makeLoChain-chain can be run on pk. chain alignments

   makeLoChain-chain danRer3 /san/sanvol1/scratch/danRer3/nib \
                     danRer4 /san/sanvol1/scratch/danRer4/nib >&! chain.log &
   cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/chainRun
   para try, check, push, check, ...
   para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs:       3414s      56.91m     0.95h    0.04d  0.000 y
# IO & Wait Time:                  3256s      54.26m     0.90h    0.04d  0.000 y
# Average job time:                 238s       3.97m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             280s       4.67m     0.08h    0.00d
# Submission to last job:           280s       4.67m     0.08h    0.00d

   # net alignment chains
   ssh kkstore02
   cd /cluster/data/danRer3/bed/liftOver
   makeLoChain-net danRer3 danRer4 >&! net.log &
   # load reference to over.chain into database table,
   # and create symlinks  /gbdb  and download area
   ssh hgwdev
   cd /cluster/data/danRer3/bed/liftOver
   makeLoChain-load danRer3 danRer4 >&! load.log &
   # clean up
   rm *.log
   # add md5sum.txt to include this new liftOver file
   cd /usr/local/apache/htdocs/goldenPath/danRer3/liftOver
   rm md5sum.txt
   md5sum *.gz > md5sum.txt
   # copy README.txt from another liftOver directory if it is not there already.
   # test by converting a region using the "convert" link on
   # the browser, and comparing to blat of the same region
   
   # CLEANUP blat directory (2006-12-14, hartera)
   ssh kkstore02
   rm -r /cluster/data/danRer3/bed/blat.danRer4.2006-05-31

###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO 
# AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera)
# UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2006-10-20, hartera)
# UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE LOG2
# VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2007-01-08, hartera)
# RE-ORDERED DISPLAY IN TRACK (DONE, hartera, 2007-04-09)
# Array data is for whole embryos of five wild type zebrafish strains. 
# Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's 
# Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
    ssh hgwdev
    mkdir /cluster/data/danRer3/bed/ZonLab/wtArray
    cd /cluster/data/danRer3/bed/ZonLab/wtArray
   
    # use AllRatio table for mapping. There are not many arrays in this
    # dataset so using AllRatio will allow the selection of All Arrays
    # from the track controls on the track description page. Also set up the
    # Zebrafish microarrayGroups.ra so that the Medians of replicates or
    # Means of replicates can also be selected for display.
    # Create mapped data in zebrafishZonWT.bed.
    rm zebrafishZonWT.bed
    hgsql -e 'drop table affyZonWildType;' danRer3
    hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \
         /cluster/data/danRer3/bed/affyZebrafish/affyZebrafish.psl
    # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTMedian
    # Mapped 14494,  multiply-mapped 4102, missed 0, unmapped 1123

    # Load mapped data into database:
    hgLoadBed danRer3 affyZonWildType zebrafishZonWT.bed
    # Loaded 18596 elements of size 15
    # add trackDb.ra entry at trackDb/zebrafish level

    # look at range of scores:
    hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \
          > ratioExps.out
    perl -pi.bak -e 's/,/\n/g' ratioExps.out
    sort ratioExps.out | uniq -c > ratioExps.uniq.count
    textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \
        ratioExps.out > expRatios.hist
    # Most values are between -3 and +2.
    # Therefore use the following trackDb entry:

# track affyZonWildType
# shortLabel Wild Type Array
# longLabel Zon Lab Expression data for Wild Type Zebrafish strains
# group regulation
# priority 80
# visibility hide
# type expRatio
# expScale 2.0
# expStep 0.2
# groupings affyZonWildTypeGroups
    # The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish
    # (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree)
    # which is microarrayGroups.ra defines how the array data is
    # displayed and also grouped for the Medians and Means of Replicates.
    # It also defines the labels for the track controls for showing
    # All Arrays, Arrays Grouped By Replicate Means or
    # Arrays Grouped By Replicate Medians. This is in the description field.

    # RE-ORDER DISPLAY IN TRACK - (hartera, 2007-04-09)
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/hgCgiData/Zebrafish
    # 14 somites and 15 somites should come before 36 hpf
    # 14-19 somites stage is 16-19h.
    # from hgFixed.zebrafishZonWTAllExps
    # for AB, 0-8 should go after 14, 
    # for TL, 16-22 should go after 24
    # for TU, 25-27 should go after 32
    # re-order accordingly in the config file:
    cd /cluster/data/danRer4/bed/ZonLab/wtArray
cat << '_EOF_' > formatArray
#!/usr/bin/awk -f
# Re-order the comma-separated value lists of a microarrayGroups.ra file.
# Lines matching /expIds/ and the 34-entry "names" line are permuted with
# allOrder; the 9-entry "names" line and the "groupSizes 9..." line are
# permuted with groupOrder. All other lines are printed unchanged.

# Return the current record's fields, picked in the order given by the
# space-separated list of field numbers, joined with OFS.
function pick(order,    idx, n, i, row) {
    n = split(order, idx, " ");
    row = $(idx[1]);
    for (i = 2; i <= n; i++)
        row = row OFS $(idx[i]);
    return row;
}

BEGIN {
    FS = ",";
    OFS = ",";
    allOrder = "10 11 12 13 14 15 1 2 3 4 5 6 7 8 9 16 24 25 17 18 19 20 21 22 23 29 30 31 32 33 26 27 28 34";
    groupOrder = "2 1 3 5 4 7 8 6 9";
}

/expIds/ {
    # Strip the keyword so that field 1 is the first list entry.
    sub(/expIds /, "", $0);
    print "expIds " pick(allOrder);
    next;
}
/names AB-36-hpf,AB-36-hpf 2/ {
    sub(/names /, "", $0);
    print "names " pick(allOrder);
    next;
}
/names AB-36-hpf,AB-14-somites/ {
    sub(/names /, "", $0);
    print "names " pick(groupOrder);
    next;
}
/groupSizes 9/ {
    sub(/groupSizes /, "", $0);
    print "groupSizes " pick(groupOrder);
    next;
}
{
    print $0;
}
'_EOF_'
    chmod +x formatArray
    formatArray ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra \
                > microarrayGroups2.ra
    cp microarrayGroups2.ra \
       ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra
    cd ~/kent/src/hg/makeDb/hgCgiData/
    make my 
    # after doing make, check this in hgwdev-hartera
    # then commit to CVS as it works fine. 

###########################################################################
# BUILD GENE SORTER TABLES (AKA FAMILY BROWSER) 
# (DONE, 2006-06-08 - 2006-06-12, hartera)
# Zon Lab WT Affy data tables in hgFixed renamed to reflect that the data 
# is log2 transformed (DONE, 2006-07-30, hartera)
# Recreate the ensToAffyZebrafish and ensToAffyZonWildType tables after 
# updating the Affy Zebrafish track with different filtering used for the 
# Blat alignments - see UPDATE AFFY ZEBRAFISH TRACK section. Also the 
# Affy Zon Lab Wild Type Array data was updated with a different method of 
# processing - see hgFixed.txt (DONE, 2006-10-25, hartera)
# Recreated the ensCanonical and ensIsoforms table after updating proteinID
# in ensGene table (DONE, 2006-11-06, hartera) 
#  This should be done after creating ensGene, ensGtp and ensPep tables
#  for the Ensembl Genes track.
#  The BlastTab tables are already built - see HGNEAR PROTEIN BLAST TABLES
#  Blastp of self is ensZfishBlastTab table.
#  Other blastp ortholog tables are: hgBlastTab (hg18), mmBlastTab(mm8), 
#  rnBlastTab (rn4), dmBlastTab (dm2), ceBlastTab (ce2), 
#  sacCerBlastTab (sacCer1).
   ssh hgwdev
   mkdir /cluster/data/danRer3/bed/geneSorter.2006-06-08
   ln -s /cluster/data/danRer3/bed/geneSorter.2006-06-08 \
         /cluster/data/danRer3/bed/geneSorter
   cd /cluster/data/danRer3/bed/geneSorter
   # Create table that maps between Ensembl genes and RefSeq
   # Index is only on first 16 characters, too short for Ensembl names
   # manually changed hgMapToGene to create index with 20 characters 
   # on name and use local copy of program.
   $HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene ensToRefSeq
   # hgsql -e 'select count(*) from ensToRefSeq;' danRer3
   # 9707

   # Create table that maps between Ensembl genes and LocusLink 
   # LocusLink is now called Entrez Gene.
   hgsql -N -e "select mrnaAcc,locusLinkId from refLink" danRer3 > refToLl.txt
   $HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene \
         ensToLocusLink -lookup=refToLl.txt

   # Update the following three tables after update of Affy Zebrafish and
   # Affy Zon Lab Wild Type data (2006-10-25):
   hgsql -e 'drop table ensToAffyZebrafish;' danRer3
   hgsql -e 'drop table ensToAffyZonWildType;' danRer3
   hgsql -e 'drop table zebrafishZonWTDistance;' danRer3

   # Create table that maps between Ensembl genes and the Affy Zebrafish
   # probeset consensus sequences.
   $HOME/bin/x86_64/hgMapToGene danRer3 affyZebrafish ensGene \
           ensToAffyZebrafish

   # Create a table that maps between Ensembl genes and 
   # the Zon lab microarray expression data.
   $HOME/bin/x86_64/hgMapToGene "-type=bed 12" danRer3 affyZonWildType \
               ensGene ensToAffyZonWildType 

   # Create expression distance table.
   nice hgExpDistance danRer3 hgFixed.zebrafishZonWTMedianRatio \
        hgFixed.zebrafishZonWTMedianExps zebrafishZonWTDistance  \
        -lookup=ensToAffyZebrafish &
   # Have 15617 elements in hgFixed.zebrafishZonWTMedian
   # Got 8911 unique elements in hgFixed.zebrafishZonWTMedian
   # Made zebrafishZonWTDistance.tab
   # Loaded zebrafishZonWTDistance
   # Made query index
   # Took 2 minutes.
   # To allow data to be viewed in Gene Sorter, add the hgNearOk=1 
   # to the dbDb table entry for danRer3 on hgcentraltest -
   # see section on MAKE HGCENTRALTEST ENTRY FOR DANRER3.

   # Added a protein ID field to ensGene before running hgClusterGenes.
   # Cluster together various alt-splicing isoforms.
   # Creates the ensIsoforms and ensCanonical tables.
   # Rebuilt these after updating the ensGene table with
   # protein IDs from UniProt with >= 90% identity to Ensembl proteins.
   # (2006-11-06, hartera)
   hgsql -e 'drop table ensIsoforms;' danRer3
   hgsql -e 'drop table ensCanonical;' danRer3
   hgClusterGenes danRer3 ensGene ensIsoforms ensCanonical
   # Got 22877 clusters, from 32143 genes in 28 chromosomes
   # There are also 22877 genes in the ensGtp table so this is in agreement.

#######################################################################
# UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT -mask OPTION AND 
# USING -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT 
# ALIGNMENTS (DONE, 2006-09-27, hartera)
# With the previous version of this track, QA found a number of short 
# alignments of <= 30 bp and there are a number in the <= 50bp range.
# These do not seem to be meaningful so filtering was changed to try to 
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# Genbank EST alignments for zebrafish. 
# Also use -minIdentity=90 for Blat instead of -minIdentity=95 since the
# higher minIdentity is causing alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or 
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations. 
# Remove old Affy zebrafish directories (DONE, 2006-12-13, hartera)    
    # Array chip sequences already downloaded for danRer1
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyZebrafish
    mkdir -p /san/sanvol1/scratch/affy
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /san/sanvol1/scratch/affy/
    # Set up cluster job to align Zebrafish consensus sequences to danRer3
    # remove old link and create new one
    rm /cluster/data/danRer3/bed/affyZebrafish
    mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2006-09-27
    ln -s /cluster/data/danRer3/bed/affyZebrafish.2006-09-27 \
          /cluster/data/danRer3/bed/affyZebrafish

    # Align sequences on the pitakluster. Scaffolds were aligned for NA
    # and Un and lifted to chrom level afterwards. Chroms 1-25 and M
    # were aligned as ~5 Mb chunks.

    ssh pk
    cd /cluster/data/danRer3/bed/affyZebrafish
    mv /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/scaffold*.fa \
       /san/sanvol1/scratch/danRer3/
    ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
    foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
       ls -1 $f >> genome.lst
    end
    wc -l genome.lst
    # 15149 genome.lst
    # for output:
    mkdir -p /san/sanvol1/scratch/danRer3/affy/psl
    # use -repeats option to report matches to repeat bases separately 
    # to other matches in the PSL output.
    echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer3/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub

    gensub2 genome.lst affy.lst template.sub para.spec
    para create para.spec
    para try, check, push ... etc.
    para time
# Completed: 15149 of 15149 jobs
#CPU time in finished jobs:      34672s     577.87m     9.63h    0.40d  0.001y
#IO & Wait Time:                 41580s     692.99m    11.55h    0.48d  0.001 y
#Average job time:                   5s       0.08m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             145s       2.42m     0.04h    0.00d
#Submission to last job:          1400s      23.33m     0.39h    0.02d

    # need to do pslSort and lift up 
    ssh pk
    cd /san/sanvol1/scratch/danRer3/affy
    # Do sort, liftUp and then best in genome filter. 
    # only use alignments that have at least
    # 95% identity in aligned region.
    # Previously did not use minCover since a lot of sequence is in 
    # Un and NA so genes may be split up so good to see all alignments.
    # However, found a number of short alignments of <= 50 bp. These are
    # not meaningful so maybe need to use minCover. If increased too much,
    # then hits on poor parts of the assembly will be missed. 
    # use pslCDnaFilter with the same parameters as used for zebrafish
    # Genbank EST alignments. 
    pslSort dirs raw.psl tmp psl
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
       -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
#                         seqs    aligns
#             total:     14886   830753
# drop minNonRepSize:     2753    745330
#     drop minIdent:     2645    38916
#     drop minCover:     2472    10516
#        weird over:     384     1529
#        kept weird:     308     403
#    drop localBest:     2559    17395
#              kept:     14494   18596
# 97.3% were kept. 
# There are 15502 Affy sequences originally aligned so there are now
# 93.5% remaining.
    
    # lift up the coordinates to chrom level
    #pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    # lift up chrom contigs to chrom level
    cat /cluster/data/danRer3/jkStuff/liftAll.lft \
     /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
     > allLift.lft
    liftUp affyZebrafish.psl allLift.lft warn contig.psl
    # Got 30168 lifts in allLift.lft
    # Lifting contig.psl

    # rsync these psl files 
    rsync -a --progress /san/sanvol1/scratch/danRer3/affy/*.psl \
         /cluster/data/danRer3/bed/affyZebrafish/
    ssh kkstore02
    cd /cluster/data/danRer3/bed/affyZebrafish
    # shorten names in psl file
    sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
    mv affyZebrafish.psl.tmp affyZebrafish.psl
    pslCheck affyZebrafish.psl
    # psl is good
    # load track into database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/affyZebrafish
    hgsql -e 'drop table affyZebrafish;' danRer3
    hgLoadPsl danRer3 affyZebrafish.psl
    # Add consensus sequences for Zebrafish chip
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
       /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /gbdb/hgFixed/affyProbes
    # these sequences were loaded previously so no need to reload. 
    hgLoadSeq -abbr=Zebrafish: danRer3 \
              /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
    # Clean up
    rm batch.bak contig.psl raw.psl
    # check number of short alignments:
    hgsql -e \
     'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer3
    # 6 
    # for previous filtered set, there were 1195 alignments of <= 50 bp so
    # this has improved. 
    hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer3
    # 14494
    # Previously 14335 distinct affy sequences were aligned. Many of the 
    # short alignments may also have longer alignments to different regions 
    # of the genome that are good.

    # CLEANUP:
    # remove old Affy Zebrafish alignment directories (hartera, 2006-12-13)
    ssh kkstore02
    cd /cluster/data/danRer3/bed
    rm -r affyZebrafish.2005-08-19
    rm -r affyZebrafish.2005-09-25

#########################################################################
# NEW RH MAP SEQUENCES FOR TRACK (in progress, 2006-10-12, hartera)
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/rhMap-2006-10-03
    cd /cluster/data/danRer3/bed
    ln -s rhMap-2006-10-03 rhMap
    # download data files from e-mail:
    # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
    unzip rhSequenceSubmit100306.zip
    unzip rhSequenceSubmitSeq100306.zip
    dos2unix rhSequenceSubmit100306.txt
    dos2unix rhSequenceSubmitSeq100306.txt
    # need to convert format of FASTA file to remove the line numbers

###########################################################################
# BACENDS CLEANUP (DONE, 2006-12-13, hartera)
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    mv ./seqs/getCloneEnds.csh .
    rm CHORI73.* DH.* DHBacs.fullnames DHmorethan2.*
    rm bacEndsChroms.psl bacNAandUnScafs.psl
    rm bacends.lst genome.lst names.psl namesPls.uniq header pslCheck.log \
       raw*
    rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
    rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnPsl
    rm -r newPairs2
    rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
    rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun 
    rm -r singles pairs scores
    rm -r ./cloneandStsAliases/tmp
    rm ./cloneandStsAliases/*.bak ./cloneandStsAliases/*.tab \
       ./cloneandStsAliases/*.sort ./cloneandStsAliases/*.uniq 
    rm DH_bacends.fa
    rm -r liftedPsl 
    # the psl directory is large, gzip the contents
    cd psl
    gzip *.psl

#########################################################################
##  Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
	-e "update dbDb set orderKey = 451 where name = 'danRer3';"

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer3

###########################################################################
# REMAKE RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-02-14, hartera)
# Use update of Radiation Hybrid map data from October 2006 and use method
# as documented in danRer4.txt to map these sequences to danRer3.
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
# Latest RH map sequences and primers received on 2006-10-03 from
# Anhua (Peter) Song - asong@enders.tch.harvard.edu
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/ZonLab/rhMap-2006-10-03
    cd /cluster/data/danRer3/bed/ZonLab
    ln -s rhMap-2006-10-03 rhMap
    cd rhMap
    # download data files from e-mail:
    # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
    unzip rhSequenceSubmit100306.zip
    unzip rhSequenceSubmitSeq100306.zip
    dos2unix rhSequenceSubmit100306.txt
    dos2unix rhSequenceSubmitSeq100306.txt
    # Sequences are in rhSequenceSubmitSeq100306.txt and primers and other
    # information are in rhSequenceSubmit100306.txt
    grep '>' rhSequenceSubmitSeq100306.txt | wc -l
    # 11514
    wc -l  rhSequenceSubmit100306.txt
    # 13438 rhSequenceSubmit100306.txt
   
    grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names

    # remove '>' from names and grab first field
    perl -pi.bak -e 's/>//' rhMap.names
    awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
        > rhMap.namesOnly.sort
    awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt | sort | uniq \
        > rhMapPrimers.namesOnly.sort
    wc -l *.sort
    # 11514 rhMap.namesOnly.sort
    # 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
    
    # get a list of headers from the FASTA file
    grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers
    awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
# BAC_END
# EST
# GENE
# SSLP
# STS
    # There are 5 types of sequence here.
    awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
#BACends
#Custom
#Insertion_Mutant
#Insertion_Mutants
#MGH
#NCBI
#Sanger SG
#Sequencing_Project
#ThisseClone
#Thisse_Clone
#other_zfEst
#wu_zfEst
#wz
    awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
    # CHBG
    # MPIEB

# Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
# So there are 11 different sources.
    # There are 2 sequences with problem primers. E-mailed Peter Song about
    # these and he suggested deleting those primers:
    # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
    # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A| 
    # edit rhMap022306.fa and rhMapPrimers022306.txt and delete these primers.
    # need to reformat FASTA headers so they are in the format: 
    # NAME.SOURCE.TYPE.ORIGIN
    # Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
    # so change these to have the same name. Also shorten Sanger SG to
    # Shotgun.
    sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \
       | sed -e 's/Insertion_Mutant/InsertMut/' \
       | sed -e 's/Sanger SG/Shotgun/' \
       | sed -e 's/ThisseClone/Thisse/' \
       | sed -e 's/Thisse_Clone/Thisse/' \
       | sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa
    # Do the same for the primers and information file:
    sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \
       | sed -e 's/Insertion_Mutant/InsertMut/' \
       | sed -e 's/Sanger SG/Shotgun/' \
       | sed -e 's/ThisseClone/Thisse/' \
       | sed -e 's/Thisse_Clone/Thisse/' \
       | sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt
    # edit these files to remove the extra newline char after the first primer
    # for 1942c and then change "/" in FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to 
    # an underscore (2007-02-14, hartera)
    perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
         rhMap100306.fa
    perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
         rhMapPrimers100306.txt

    # use a script to reformat the names for the FASTA headers to the format 
    # >NAME.SOURCE where name is the first field separated by "|" and source
    # is the 9th field. The source is used to make the name unique. Some
    # of these names are BAC ends that occur in the BAC ends track so there
    # are name clashes in the seq table if the names are not made unique.
    # Also make the name upper case as for those for the danRer1 and danRer2
    # RH map and remove base numbering on each sequence line of FASTA file.
# rhFix: awk filter that rewrites the RH map FASTA (header renaming and
# removal of per-line base numbering). It has three rules:
#  - header lines (starting with ">"): the whole line is upper-cased and
#    split on "|"; only field 1 (which still carries the leading ">") and
#    field 9 (the source) are printed, joined by ".". The example header
#    shown inside the script becomes ">Z1396.MGH".
#  - lines starting with digits followed by a space: the line is replaced
#    by its second whitespace-separated field, which drops the leading base
#    number. NOTE(review): any fields after the second are dropped too; this
#    presumably relies on one sequence chunk per line - confirm against the
#    input file.
#  - every line other than a header is then printed as it stands.
cat << '_EOF_' > rhFix
#!/usr/bin/awk -f 

#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    split(toupper($0), a, "\\|");
    print a[1]"."a[9];
    next;
}

/^[0-9]+ / {
    $0 = $2;
}

{
    print $0;
}

'_EOF_'
# << keep emacs coloring happy
    chmod +x rhFix
    rhFix rhMap100306.fa > rhMap.fa
    # Blat sequences vs danRer3 genome
    ssh pk
    mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    # put the rhMap sequences on the san 
    mkdir -p /san/sanvol1/scratch/danRer3/rhMap
    cp rhMap.fa /san/sanvol1/scratch/danRer3/rhMap/
    # do blat run to align RH map sequences to danRer3 and use
    # chrNA_random and chrUn_random separated into scaffolds.
    cd blatRun
    ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst
    foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
       ls -1S $f >> genome.lst
    end
    wc -l genome.lst 
    # 15149 genome.lst
    # for output:
    mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl
    # use -repeats option to report matches to repeat bases separately
    # from other matches in the PSL output.
    echo '#LOOP\n/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80
-ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl}\n#ENDLOOP'
> template.sub

    gensub2 genome.lst rhMap.lst template.sub para.spec
    para create para.spec
    para try, check, push ... etc.
    para time
# Completed: 15149 of 15149 jobs
#CPU time in finished jobs:      13684s     228.07m     3.80h    0.16d  0.000y
#IO & Wait Time:                 38258s     637.63m    10.63h    0.44d  0.001 y
#Average job time:                   3s       0.06m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              24s       0.40m     0.01h    0.00d
#Submission to last job:           901s      15.02m     0.25h    0.01d

    # need to do pslSort and lift up
    ssh pk
    cd /san/sanvol1/scratch/danRer3/rhMap
    # Do sort, liftUp and then best in genome filter.
    pslSort dirs raw.psl tmp psl 
    # only use alignments that have at least
    # 95% identity in aligned region.
    # Previously did not use minCover since a lot of sequence is in
    # Un and NA so genes may be split up so good to see all alignments.
    # However, found a number of short alignments of <= 50 bp. These are
    # not meaningful so maybe need to use minCover. If increased too much,
    # then hits on poor parts of the assembly will be missed.
    # use pslCDnaFilter with the same parameters as used for zebrafish
    # Genbank EST alignments.
        
    # Use parameters as determined for danRer4
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
       -ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
#                         seqs    aligns
#             total:     11060   1767931
#      drop invalid:     1       1
#drop minNonRepSize:     3047    1297013
#     drop minIdent:     763     3913
#     drop minCover:     4065    420022
#        weird over:     288     4267
#        kept weird:     130     189
#    drop localBest:     2188    34092
#              kept:     10447   12890
    # Percent sequences aligned: 10447/11514 = 90.7%
    # This is a compromise between reducing the number of sequences piling
    # up but not losing all alignments for too many sequences.
    awk '{print $10}' contig.psl | sort | uniq -c | sort -nr > contig.count
    head contig.count
#      33 ZKP106G9.YA.BACENDS
#     21 BZ83M20.Z.BACENDS
#     12 ZK4I5.T7.BACENDS
#     10 ZC27I3.ZA.BACENDS
#     10 ZC261G9.ZAF.BACENDS
#     10 ZC261G9.ZA.BACENDS
#      8 ZK8O7.T7.BACENDS
#      8 ZC77P2.ZB.BACENDS
#      8 FJ89A05.X1.WU_ZFEST
#      8 FJ07G09.X1.WU_ZFEST

    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    # lift up to genome level coordinates
    liftUp rhMap.psl \
           /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft warn \
           /san/sanvol1/scratch/danRer3/rhMap/contig.psl
    # Got 30168 lifts in
    # /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
    pslCheck rhMap.psl
    # psl looks ok
    # cleanup 
    rm *.bak *.sort
    # Load sequence alignments into the database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap   
    # drop old table and reload final psl file
    hgsql -e 'drop table rhMap;' danRer3 
    hgLoadPsl danRer3 rhMap.psl
     
    # Copy sequences to gbdb if they are not already there.
    mkdir -p /gbdb/danRer3/rhMap
    # remove old sequences
    rm /gbdb/danRer3/rhMap/rhMap022306.fa
    ln -s \
      /cluster/data/danRer3/bed/ZonLab/rhMap/rhMap.fa \
      /gbdb/danRer3/rhMap/rhMap20061003.fa

    # then add sequences to database:
    # remove old ones first
    hgsql -e 'select * from extFile where path like "%rhMap%";' danRer3
#| id     | name           | path                               | size    |
#+--------+----------------+------------------------------------+---------+
#| 747628 | rhMap022306.fa | /gbdb/danRer3/rhMap/rhMap022306.fa | 7456861 |
#+--------+----------------+------------------------------------+---------+
    hgsql -e 'select count(*) from seq where extFile = 747628;' danRer3
    hgsql -e 'delete from seq where extFile = 747628;' danRer3
    # delete from extFile:
    hgsql -e 'delete from extFile where id = 747628;' danRer3
    hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap20061003.fa
    # loaded successfully
    # Check in the Browser and see if there are many pileups
    # Much reduced now on chr24. Took 10 random sequences in the pileup from
    # minCover=0.20 and found that 7 of them still align to danRer4 
    # with minCover=0.33 and 2 of those that don't also have primers that
    # do not map using the hgPcr tool.
    # Add trackDb entry and also an rhMap.html for trackDb/zebrafish/danRer4
    # also add the search specs for hgFindSpec to trackDb.ra

    # Add table of related information for the RH map details pages:
    
    # Check that all the headers from rhMap.headers are also in the primers
    # file which seems to contain the same headers from the FASTA file
    # as well as additional markers.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/
    # The same RH map version was used as for danRer4 so the data for the 
    # info table is the same as for danRer4 so copy the file over. See 
    # kent/src/hg/makeDb/doc/danRer4.txt for details on how this file is 
    # produced.
    cp /cluster/data/danRer4/bed/ZonLab/rhMap/rhMapInfoWithZfinIds.tab . 
    # load the info table
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    hgLoadSqlTab danRer3 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
          rhMapInfoWithZfinIds.tab
    
    # add code to hgc.c to print ZFIN ID, if available, on the details page
    # together with the other marker-related information.
    # added track to trackDb.ra in trackDb/zebrafish/danRer4 with a URL for 
    # the ZFIN IDs to link to the relevant page at http://www.zfin.org 
    # and added an html page for the track.

#########################################################################
# BACENDS CLEANUP (DONE, 2007-03-27, hartera)
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    # 23G in bacends directory
    # remove sequence file as already in bacSeqs dir
    rm Zv5Bacends.fa
    # du -sh psl   
    # 12G psl
    nice rm -r psl 
    cd bacends.1
    rm bacEndAccs.aliases bacEnds.log bacEnds.names.sort bacPrs.names bacs.log
    rm ch211 intNames intNames.count out test test.pl bacEndSingles.txt
    rm -r test2 bacEndAccs
    rm BACClones* BACEndAccs.txt *.accs allBacEnds* bacEndSingles.names
    cd ../scoresAndCoords
    rm allBacEnds.names.* bacEndSinglesGoodScores.bed error.log *.tab \
       singles.hits bacEnds.load.psl bacEnds.names 
    rm -r tmp
    cd ../pairsNew
    # bacEndSingles.bed is already in singlesNew
    rm bacEnds.* bed.tab bacEndSingles.bed
    cd ../singlesNew
    rm singles.coordcheck bed.tab bacEnds.*
    cd ../duplicates
    # duplicatesNew is the latest directory so remove everything else from
    # the duplicates directory
    rm *
    cd duplicatesNew
    rm log* *.lfs
    cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
    rm -r tmp
    rm bacClones* bacs.names log
    cd /cluster/data/danRer3/bed/
    du -sh bacends
    # 5.0G    bacends
    # BAC ENDS track was remade in May 2006 (see REDO BACENDS section)
    # so can remove newBacends which is an old version from 2005
    du -sh newBacends
    # 37G     newBacends
    nice rm -r newBacends
#########################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
echo danRer3 fr1 tetNig1 mm7 hg18  > /hive/data/genomes/danRer3/bed/multiz5way/species.list
update genbank.conf
danRer3.upstreamGeneTbl = refGene
danRer3.upstreamMaf = multiz5way /hive/data/genomes/danRer3/bed/multiz5way/species.list
