# for emacs: -*- mode: sh; -*-


# Anopheles gambiae -- complete chromosomes from Ensembl, apparently from 
# May 2004:
# ftp://ftp.ensembl.org/pub/current_mosquito/data/fasta/dna
# (NCBI still has just scaffolds from 2002 here...?!
#  ftp://ftp.ncbi.nih/gov/genbank/genomes/Anopheles_gambiae/Assembly_scaffolds)
#

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-----------+---------------------+
#  | tableName | type                |
#  +-----------+---------------------+
#  | ensGene   | genePred ensPep     |
#  | geneid    | genePred geneidPep  |
#  | genscan   | genePred genscanPep |
#  +-----------+---------------------+


# DOWNLOAD SEQUENCE (DONE 6/7/04 angie)
    ssh kksilo
    mkdir /cluster/store7/anoGam1
    cd /cluster/data
    ln -s /cluster/store7/anoGam1 anoGam1
    cd /cluster/data/anoGam1
    mkdir downloads
    cd downloads
    foreach c (2L 2R 3L 3R X UNKN)
      wget ftp://ftp.ensembl.org/pub/current_mosquito/data/fasta/dna/Anopheles_gambiae.MOZ2a.may.dna.chromosome.$c.fa.gz
      wget ftp://ftp.ensembl.org/pub/current_mosquito/data/fasta/dna/Anopheles_gambiae.MOZ2a.may.dna_rm.chromosome.$c.fa.gz
    end
    gunzip *.fa.gz
    # sanity check -- make sure the 2 versions are the same modulo masking:
    foreach f (*.dna.*)
      set g = `echo $f | sed -e 's/\.dna\./\.dna_rm\./'`
      faCmp $f $g
    end
    # Is their masking actually different? 
    foreach f (*.dna.*)
      set g = `echo $f | sed -e 's/\.dna\./\.dna_rm\./'`
      faCmp -softMask $f $g
    end
    # Nope!  ?  Well, OK, clean up then.
    rm Ano*.dna_rm.*

    # Put in our usual chrom dir structure.
    foreach c (2L 2R 3L 3R X)
      mkdir ../$c
      sed -e 's/^>.*$/>'chr$c/ A*.dna.chromosome.$c.fa > ../$c/chr$c.fa
    end
    mkdir ../U
    sed -e 's/^>.*$/>'chrU/ A*.dna.chromosome.UNKN.fa > ../U/chrU.fa
    cd ..
    foreach c (?{,?})
      diff $c/chr$c.fa downloads/A*.dna.*.$c*.fa
    end

    # download mitochondrion sequence
    mkdir M
    cd M
    # go to http://www.ncbi.nih.gov/ and search Genome for 
    # "anopheles gambiae mitochondrion complete".  That shows the gi number:
    # 5834911
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834911&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the 
    # Anopheles gambiae mitochondrion complete genome, and then replace the 
    # header line with just ">chrM".
    cd ..
    # tidy up downloads
    nice gzip downloads/* &


# GET ASSEMBLY INFO FROM ENSEMBL (DONE 6/7/04 angie)
    # Since there's no AGP, extract what we can from Ensembl... looks like 
    # they give offsets of both their "chunks" and of the real scaffolds.
    ssh kksilo
    cd /cluster/data/anoGam1/bed/ensChunks
    wget ftp://ftp.ensembl.org/pub/current_mosquito/data/mysql/anopheles_gambiae_core_22_2b/assembly.txt.table.gz
    wget ftp://ftp.ensembl.org/pub/current_mosquito/data/mysql/anopheles_gambiae_core_22_2b/anopheles_gambiae_core_22_2b.sql.gz
    wget ftp://ftp.ensembl.org/pub/current_mosquito/data/mysql/anopheles_gambiae_core_22_2b/seq_region.txt.table.gz
    gunzip *.gz
    ssh hgwdev
    cd /cluster/data/anoGam1/bed/ensChunks
    hgsql -e "create database ensAnoGam"
    hgsql ensAnoGam < anopheles_gambiae_core_22_2b.sql
    hgsql ensAnoGam \
      -e 'load data local infile "assembly.txt.table" into table assembly'
    hgsql ensAnoGam \
      -e 'load data local infile "seq_region.txt.table" into table seq_region'
    hgsql ensAnoGam -e 'create table assembly1 \
      select assembly.asm_seq_region_id,seq_region.name, \
      assembly.asm_start,assembly.asm_end,assembly.cmp_start,\
      assembly.cmp_end,assembly.ori from assembly,seq_region  \
      where assembly.cmp_seq_region_id = seq_region.seq_region_id'
    hgsql ensAnoGam -e 'create table assembly2 \
      select seq_region.name as chrom, \
      assembly1.asm_start as chromStart, assembly1.asm_end as chromEnd, \
      assembly1.name, \
      assembly1.cmp_start as fragStart, assembly1.cmp_end as fragEnd, \
      assembly1.ori from assembly1,seq_region  \
      where assembly1.asm_seq_region_id = seq_region.seq_region_id'
    hgsql ensAnoGam -e 'update assembly2 set chrom = "U" where chrom = "UNKN"'
    hgsql ensAnoGam -e 'update assembly2 set chrom = concat("chr", chrom)'
    hgsql ensAnoGam -N -e 'select * from assembly2 order by chrom,chromStart' \
    | perl -we '$i = 0; \
         while (<>) { \
           chomp;  @w = split;  $w[6] =~ s/1$//;  $w[6] =~ s/^$/+/;  ++$i; \
           print "$w[0]\t$w[1]\t$w[2]\t$i\tW\t$w[3]\t$w[4]\t$w[5]\t$w[6]\n"; \
         }' \
      > ensAll.agp
    # Get just the scaffold entries, and put gap records in there...
    ssh kksilo
    cd /cluster/data/anoGam1
    awk '$6 !~ /_[0-9]+$/ {print;}' ensAll.agp \
    | perl -we '$i = 0; $prevChr = ""; \
       while (<>) { \
         @w = split;  ++$i; \
         if ($w[0] eq $prevChr && defined $prevEnd) { \
           $gapEnd = $w[1] - 1;  $size = $w[1] - $prevEnd; \
           print "$w[0]\t$prevEnd\t$gapEnd\t$i\tN\t$size\tclone\tno\n"; $i++; \
         } \
         $prevEnd = $w[2] + 1;  $prevChr = $w[0];  \
         print; \
       }' \
    > ensScaffold.agp
    cd ../..
    foreach c (?{,?})
      grep "^chr$c" bed/ensChunks/ensScaffold.agp > $c/chr$c.agp
    end
    # clean up 0-length inappropriate chrM.agp:
    rm M/chrM.agp
    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    foreach agp (?{,?}/chr*.agp)
      if (-e $agp) then
        set fa = $agp:r.fa
        echo checking consistency of $agp and $fa
        checkAgpAndFa $agp $fa | tail -1
      endif
    end
    # clean up ensAnoGam database
    ssh hgwdev
    hgsql -e "drop database ensAnoGam"


# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CHUNKS OF N'S (DONE 6/7/04 angie)
    # Since we didn't get real AGP with gap info, we don't have enough info 
    # to help us break into ~5MB chunks at proper gap boundaries.  So just 
    # use runs of N's as gaps.
    ssh kksilo
    cd /cluster/data/anoGam1
    foreach c (?{,?})
      faSplit gap -minGapSize=100 -lift=$c/chr$c.lft \
        $c/chr$c.fa 5000000 $c/chr${c}_
    end
    # put chunks into usual "contig" dir structure
    foreach ctg (?{,?}/chr?{,?}_?{,?}.fa)
      mkdir $ctg:r
      mv $ctg $ctg:r/
    end


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 6/7/04 angie)
    # This used to hold scripts -- better to keep them inline here so 
    # they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/anoGam1/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/anoGam1/bed


# CREATING DATABASE (DONE 6/7/04 angie)
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
    hgsql -e 'create database anoGam1'


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 6/7/04 angie)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
      | hgsql anoGam1


# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 6/7/04 angie)
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits; 
    # can also load up other tables that don't depend on masking.  
    ssh kksilo
    cd /cluster/data/anoGam1
    mkdir nib
    foreach c (?{,?})
      foreach f ($c/chr${c}{,_random}.fa)
        if (-e $f) then
          echo "nibbing $f"
          /cluster/bin/i386/faToNib $f nib/$f:t:r.nib
        endif
      end
    end

    # Make symbolic links from /gbdb/anoGam1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/anoGam1/nib
    foreach f (/cluster/data/anoGam1/nib/chr*.nib)
      ln -s $f /gbdb/anoGam1/nib
    end
    # Load /gbdb/anoGam1/nib paths into database and save size info.
    cd /cluster/data/anoGam1
    hgsql anoGam1  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib anoGam1 /gbdb/anoGam1/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N anoGam1 > chrom.sizes
    # take a look at chrom.sizes, should be 18 lines
    wc chrom.sizes


# REPEATMASKER (DONE 6/8/04 angie)
    #- Split sequence into 500kb chunks, at gaps if possible:
    ssh kksilo
    cd /cluster/data/anoGam1
    foreach c (?{,?})
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
          -minGapSize=100
        cd ../..
      end
    end

    #- Make the run directory and job list:
    cd /cluster/data/anoGam1
    cat << '_EOF_' > jkStuff/RMAnopheles
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/anoGam1/$2
/bin/cp $2 /tmp/anoGam1/$2/
cd /tmp/anoGam1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec anopheles $2
popd
/bin/cp /tmp/anoGam1/$2/$2.out ./
if (-e /tmp/anoGam1/$2/$2.align) /bin/cp /tmp/anoGam1/$2/$2.align ./
if (-e /tmp/anoGam1/$2/$2.tbl) /bin/cp /tmp/anoGam1/$2/$2.tbl ./
if (-e /tmp/anoGam1/$2/$2.cat) /bin/cp /tmp/anoGam1/$2/$2.cat ./
/bin/rm -fr /tmp/anoGam1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/anoGam1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/anoGam1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMAnopheles
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (?{,?})
      foreach d ($c/chr${c}*_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/anoGam1/jkStuff/RMAnopheles \
                 /cluster/data/anoGam1/$d $f \
               '{'check out line+ /cluster/data/anoGam1/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh kk9
    cd /cluster/data/anoGam1/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 747 of 747 jobs
#Average job time:                1597s      26.61m     0.44h    0.02d
#Longest job:                     2788s      46.47m     0.77h    0.03d
#Submission to last job:         20691s     344.85m     5.75h    0.24d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/anoGam1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (?{,?})
      echo lifting $c
      cd $c
      liftUp chr$c.fa.out chr$c.lft warn \
        `awk '{print $2 "/" $2 ".fa.out";}' chr$c.lft` \
        > /dev/null
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/anoGam1
    hgLoadOut anoGam1 */chr*.fa.out


# VERIFY REPEATMASKER RESULTS (DONE 6/9/04 angie)
    # Eyeball some repeat annotations in the browser, compare to lib seqs.
    # Run featureBits on anoGam1 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits anoGam1 rmsk
#21785530 bases of 278268413 (7.829%) in intersection

    # compare to dm1:
    featureBits dm1 rmsk
#15452875 bases of 126527731 (12.213%) in intersection


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 6/7/04 angie)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("anoGam1", "Feb. 2003", "/gbdb/anoGam1/nib", "A. gambiae", \
               "chr3R:51042001-51053000", 1, 59, "A. gambiae", \
                "Anopheles gambiae", "/gbdb/anoGam1/html/description.html", \
                0, 0, "MOZ2");'
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO defaultDb (genome, name) values ("A. gambiae", "anoGam1");'

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add anoGam1 in all the right places and do
    make update

    mkdir /gbdb/anoGam1/html
    # go public on genome-test
    cvs commit makefile
    make alpha

    # Add trackDb directories
    mkdir anopheles
    mkdir anopheles/anoGam1
    cvs add anopheles
    cvs add anopheles/anoGam1
    cvs commit anopheles/anoGam1


# GOLD AND GAP TRACKS (DONE 6/7/04 angie)
    ssh hgwdev
    cd /cluster/data/anoGam1
    cp /dev/null chrom.lst
    foreach f (?{,?}/chr*.agp chrM)
      echo $f:t:r >> chrom.lst
    end
    hgGoldGapGl -noGl -chromLst=chrom.lst anoGam1 /cluster/data/anoGam1 .
    # featureBits fails if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql anoGam1
    # oops, that won't work until v4.1, so do this for the time being:
    hgsql anoGam1 -e 'create table chrM_gap select * from chr2L_gap where 0=1'


# MAKE LIFTALL.LFT (DONE 6/8/04 angie)
    ssh kksilo
    cd /cluster/data/anoGam1
    cat ?{,?}/chr*.lft > jkStuff/liftAll.lft


# SIMPLE REPEATS (TRF) (DONE 6/8/04 angie)
    ssh kksilo
    mkdir /cluster/data/anoGam1/bed/simpleRepeat
    cd /cluster/data/anoGam1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/anoGam1/?{,?}/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
      end
    end
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \
      trf/*.bed > /dev/null

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed anoGam1 simpleRepeat \
      /cluster/data/anoGam1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    featureBits anoGam1 simpleRepeat
#5668988 bases of 278268413 (2.037%) in intersection


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 6/8/04 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/anoGam1/bed/simpleRepeat
    mkdir trfMask
    foreach f (trf/*.bed)
      echo -n "filtering $f... "
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords:
    mkdir trfMaskChrom
    foreach f (../../?{,?}/chr*.fa)
      set c = $f:t:r
      echo $c
      liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \
        trfMask/${c}_[0-9]*.bed > /dev/null
    end


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 6/8/04 angie)
    ssh kksilo
    cd /cluster/data/anoGam1
    # Soft-mask (lower-case) the contig and chr .fa's, 
    # then make hard-masked versions from the soft-masked.  
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    foreach c (?{,?})
      echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    #- Rebuild the nib files, using the soft masking in the fa:
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end
    # Make one big 2bit file as well, and make a link to it in 
    # /gbdb/anoGam1/nib because hgBlat looks there:
    faToTwoBit */chr*.fa anoGam1.2bit
    ssh hgwdev
    ln -s /cluster/data/anoGam1/anoGam1.2bit /gbdb/anoGam1/nib/


# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 6/17/04 angie)
    ssh kksilo
    cd /cluster/data/anoGam1
    #- Build the .zip files -- no genbank for now.
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
zip -j zip/chromOut.zip */chr*.fa.out
zip -j zip/chromFa.zip */chr*.fa
zip -j zip/chromFaMasked.zip */chr*.fa.masked
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
'_EOF_'
    # << this line makes emacs coloring happy
    csh ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/anoGam1/zip
    set gp = /usr/local/apache/htdocs/goldenPath/anoGam1
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f ( ../*/chr*.fa )
      nice zip -j $gp/chromosomes/$f:t.zip $f
    end
    cd $gp/bigZips
    nice md5sum *.zip > md5sum.txt
    cd $gp/chromosomes
    nice md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's


# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 6/8/04 angie)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/anoGam1/nib
    mkdir -p /iscratch/i/anoGam1/nib
    cp -p /cluster/data/anoGam1/nib/chr*.nib /iscratch/i/anoGam1/nib
    iSync
    # bluearc too (rack 9 can't see iscratch):
    ssh kksilo
    rm -rf /cluster/bluearc/anoGam1/nib
    mkdir -p /cluster/bluearc/anoGam1/nib
    cp -p /cluster/data/anoGam1/nib/chr*.nib /cluster/bluearc/anoGam1/nib


# LOAD ENSEMBL GENES (DONE 6/9/04 angie)
    mkdir /cluster/data/anoGam1/bed/ensembl
    cd /cluster/data/anoGam1/bed/ensembl
    # Get the ensembl gene data from 
    # http://www.ensembl.org/Anopheles_gambiae/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Anopheles_gambiae choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the ouput.  choose gzip compression.  hit export.
    # Save as ensembl.gff.gz
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    gunzip -c ensembl.gff.gz \
    | perl -wpe 's/^UNKN/U/; s/^([2-3][LR]?|X|U)/chr$1/ \
                 || die "Line $. doesnt start with anopheles chrom:\n$_"' \
    > ensGene.gtf
    ssh hgwdev
    ldHgGene -gtf -genePredExt anoGam1 ensGene \
      /cluster/data/anoGam1/bed/ensembl/ensGene.gtf

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql anoGam1 < ~/kent/src/hg/lib/ensGtp.sql
    hgsql anoGam1 -e 'load data local infile "ensGtp.txt" into table ensGtp'
    # delete header line to avoid joinerCheck error
    hgsql anoGam1 -e 'delete from ensGtp where gene = "Ensembl Gene ID"'

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
    gunzip -c ensemblPep.fa.gz \
    | perl -wpe 's/^(>ENSANGT\d+\.\d+).*/$1/' \
    > ensPep.fa
    hgPepPred anoGam1 generic ensPep ensPep.fa


# PRODUCING GENSCAN PREDICTIONS (DONE 6/9/04 angie)
    # Run on small cluster -- genscan needs big mem.
    ssh hgwdev
    mkdir /cluster/data/anoGam1/bed/genscan
    cd /cluster/data/anoGam1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/anoGam1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    cp /dev/null genome.list
    foreach f ( `ls -1S /cluster/data/anoGam1/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 68 of 68 jobs
#Average job time:                 133s       2.21m     0.04h    0.00d
#Longest job:                      167s       2.78m     0.05h    0.00d
#Submission to last job:           720s      12.00m     0.20h    0.01d

    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/anoGam1/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/anoGam1/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf anoGam1 genscan genscan.gtf
    hgPepPred anoGam1 generic genscanPep genscan.pep
    hgLoadBed anoGam1 genscanSubopt genscanSubopt.bed

# MYTOUCH FIX - jen - 2006-01-24
  sudo mytouch anoGam1 genscanPep 0501071300.00

# SWAP BLASTZ MELANOGASTER-ANOPHELES TO ANOPHELES-MEL (DONE 6/10/04 angie)
    ssh kolossus
    mkdir /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10
    set aliDir = /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
#157M    /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChrom
#157M    unsorted
#157M    axtChrom
    rm -r unsorted


# CHAIN MELANOGASTER BLASTZ (DONE 6/10/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Make our own linear gap file with reduced gap penalties:
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
         -linearGap=../../chickenHumanTuned.gap \
         -verbose=0 $1 \
  /iscratch/i/anoGam1/nib \
  /cluster/bluearc/drosophila/dm1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 7 of 7 jobs
#Average job time:                  10s       0.17m     0.00h    0.00d
#Longest job:                       13s       0.22m     0.00h    0.00d
#Submission to last job:            13s       0.22m     0.00h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    du -sh chain run1/chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain anoGam1 ${c}_chainDm1 $i
    end


# NET MELANOGASTER BLASTZ (DONE 6/10/04 angie)
    ssh kksilo
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain
    netClass -noAr noClass.net anoGam1 dm1 melanogaster.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain
    rm noClass.net
    netFilter -syn melanogaster.net > melanogasterSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain
    netFilter -minGap=10 melanogaster.net |  hgLoadNet anoGam1 netDm1 stdin
    netFilter -minGap=10 melanogasterSyn.net | hgLoadNet anoGam1 netSyntenyDm1 stdin


# MAKE AXTNET FOR DOWNLOAD (DONE 8/20/04 angie)
    ssh kksilo
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain
    netSplit melanogaster.net net
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/anoGam1/nib \
        /cluster/data/dm1/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end


# MAKE VSDM1 DOWNLOADABLES (DONE 6/9/04 angie)
    ssh kksilo
    cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain
    ln all.chain melanogaster.chain
    zip /cluster/data/anoGam1/zip/melanogaster.chain.zip melanogaster.chain
    rm melanogaster.chain
    zip /cluster/data/anoGam1/zip/melanogaster.net.zip melanogaster.net

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/anoGam1/vsDm1
    cd /usr/local/apache/htdocs/goldenPath/anoGam1/vsDm1
    mv /cluster/data/anoGam1/zip/melanogaster*.zip .
    md5sum *.zip > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.
    # Add axtChrom.zip (as advertised in original README) and axtNet/*.gz 
    # (new std offering) 8/20/04:
    mkdir axtNet
    foreach f (/cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtNet/*)
      gzip -c $f > axtNet/$f:t.gz
    end
    zip -j axtChrom.zip \
      /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChrom/*.axt
    md5sum *.zip *.gz */*.gz > md5sum.txt


# MAKE 11.OOC FILE FOR BLAT (DONE 8/2/04 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # mosquito size is ~9.2% of human)
    ssh kkr1u00
    mkdir /cluster/data/anoGam1/bed/ooc
    cd /cluster/data/anoGam1/bed/ooc
    ls -1 /cluster/data/anoGam1/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/anoGam1/11.ooc -repMatch=100
#Wrote 11692 overused 11-mers to /cluster/bluearc/anoGam1/11.ooc
    cp -p /cluster/bluearc/anoGam1/*.ooc /iscratch/i/anoGam1/
    iSync


# AUTO UPDATE GENBANK MRNA RUN  (DONE 8/2/04 angie)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf and add these lines:
# anoGam1 (A. gambiae)
anoGam1.genome = /iscratch/i/anoGam1/nib/chr*.nib
anoGam1.lift = /cluster/data/anoGam1/jkStuff/liftAll.lft
anoGam1.refseq.mrna.native.load = no
anoGam1.genbank.mrna.xeno.load = yes
anoGam1.genbank.est.xeno.load = no
anoGam1.downloadDir = anoGam1

    cvs ci etc/genbank.conf
    # Since A. gambiae is a new species for us, edit src/lib/gbGenome.c.  
    # Pick some other browser species, & monkey-see monkey-do.  
    cvs diff src/lib/gbGenome.c
    make
    cvs ci src/lib/gbGenome.c
    # Edit src/align/gbBlat to add /iscratch/i/anoGam1/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs ci src/align/gbBlat

    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial anoGam1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad anoGam1
    featureBits anoGam1 mrna
#4879150 bases of 278268413 (1.753%) in intersection
    featureBits anoGam1 xenoMrna
#9229323 bases of 278268413 (3.317%) in intersection
    # Clean up:
    rm -rf work/initial.anoGam1

    ssh eieio
    # -initial for ESTs (now with /cluster/store7 and iservers):
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial anoGam1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 anoGam1 &
    # Not bad... certainly better than, say, D. yakuba:
    featureBits anoGam1 intronEst
#8712776 bases of 278268413 (3.131%) in intersection
    featureBits anoGam1 est
#19664902 bases of 278268413 (7.067%) in intersection
    # Clean up:
    rm -rf work/initial.anoGam1


# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 7/30/04 angie)
    ssh hgwdev
    echo 'insert into blatServers values("anoGam1", "blat8", "17780", 1, 0); \
          insert into blatServers values("anoGam1", "blat8", "17781", 0, 0);' \
      | hgsql -h genome-testdb hgcentraltest

# MAKE Drosophila Proteins track (Done 08-07-2004 braney)
    ssh kksilo
    mkdir -p /cluster/data/anoGam1/blastDb
    cd /cluster/data/anoGam1/blastDb
    for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/anoGam1/blastDb
    cp /cluster/data/anoGam1/blastDb/* /iscratch/i/anoGam1/blastDb
    (~kent/bin/iSync) 2>&1 > sync.out
    
    mkdir -p /cluster/data/anoGam1/bed/tblastn.dm1FB
    cd /cluster/data/anoGam1/bed/tblastn.dm1FB
    ls -1S /iscratch/i/anoGam1/blastDb/*.nsq | sed "s/\.nsq//" > skeeter.lst
    exit

    # back to kksilo
    cd /cluster/data/anoGam1/bed/tblastn.dm1FB
    mkdir fbfa
    # calculate a reasonable number of jobs (didn't actually do this for anoGam1)
    calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(164630/`wc skeeter.lst | awk "{print \\\$1}"`\)
    # 18735/(150000/463) = 57.828700
    split -l 58 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/anoGam1/jkStuff/liftAll.lft warn $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.3
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 skeeter.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/anoGam1/bed/tblastn.dm1FB
    para create blastSpec
    para shove   # jobs will need to be restarted, but they should all finish

    exit
    # back to kksilo

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=20000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/anoGam1/bed/tblastn.dm1FB
    para create chainSpec
    para push

    exit
    # back to kksilo
    cd /cluster/data/anoGam1/bed/tblastn.dm1FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* |  sort -T /tmp -k 14,14 -k 16,16n -k 17,17n |uniq > /cluster/data/anoGam1/bed/tblastn.dm1FB/preblastDm1FB.psl
    cd ..
    blatDir=/cluster/data/dm1/bed/blat.dm1FB
    protDat -fb preblastDm1FB.psl $blatDir/dm1FB.psl $blatDir/dm1FB.mapNames blastDm1FB.psl

    ssh hgwdev
    cd /cluster/data/anoGam1/bed/tblastn.dm1FB
    hgLoadPsl anoGam1 blastDm1FB.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# MAKE GCPERCENT (DONE 10/5/04 angie)
     ssh hgwdev
     mkdir /cluster/data/anoGam1/bed/gcPercent
     cd /cluster/data/anoGam1/bed/gcPercent
     # create and load gcPercent table
     hgsql anoGam1  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent anoGam1 ../../nib


# ANOEST FROM ZDOBNOV (DONE 6/2/05 angie)
    ssh hgwdev
    mkdir /cluster/data/anoGam1/bed/anoEst
    cd /cluster/data/anoGam1/bed/anoEst
    wget http://komar.embl.de/Data/ag_clusters_GFF.txt
    perl -wpe 'chomp;  @words = split; \
      if (scalar(@words) != 9 || $words[0] eq "") {$_ = ""; next;} \
      $words[0] = "chr$words[0]"; \
      $words[6] = "+" if ($words[6] eq "1"); \
      $words[6] = "-" if ($words[6] eq "-1"); \
      $words[8] = $words[8] . $words[6] if ($words[8] eq "NCLAG018843" \
         || $words[8] eq "NCLAG026485" || $words[8] eq "NCLAG028206" \
         || $words[8] eq "NCLAG030852" || $words[8] eq "TCLAG029693" \
         || $words[8] eq "TCLAG068099"); \
      $_ = join("\t", @words) . "\n";' \
    ag_clusters_GFF.txt > anoEst.gff
    # Two tracks, separated by ID.  From Evgeny's email 4/17/05:
    # TCLAG.. - clusters have at least one EST that 'uniquely' aligns
    # to this genomic region, and
    # NCLAG.. - all the rest that did not get into TCLAGs, eg mostly 
    # transposon derived and some resent paralogs (or haplotype artifacts).
    # NCLAG.. hidden by default and TCLAG..s expanded.
    awk '$9 ~ /^TCLAG/ {print;}' anoEst.gff > anoEstTcl.gff
    awk '$9 ~ /^NCLAG/ {print;}' anoEst.gff > anoEstNcl.gff
    wc -l *.gff
# 229268 anoEst.gff
# 154371 anoEstNcl.gff
#  74897 anoEstTcl.gff
    expr 154371 + 74897
#229268
    # These are not really meant to be genePreds (no CDS/UTR), so 
    # translate into bed 12 via genePred.tab.  Also fix chrom names...
    ldHgGene anoGam1 anoEstTcl anoEstTcl.gff -out=stdout \
    | $HOME/kent/src/utils/genePredToBed/genePredToBed \
    | sed -e 's/^chrMIAGCG/chrM/; s/chrUNKN/chrU/;' \
    > anoEstTcl.bed
    ldHgGene anoGam1 anoEstNcl anoEstNcl.gff -out=stdout \
    | $HOME/kent/src/utils/genePredToBed/genePredToBed \
    | sed -e 's/^chrMIAGCG/chrM/; s/chrUNKN/chrU/;' \
    > anoEstNcl.bed
    # List of TCLAG* ids considered to be expressed, saved from email 
    # 6/2/05 to /cluster/data/anoGam1/bed/anoEst/expressed_cl_ids.txt.
    # OK, load 'em up:
    hgLoadBed anoGam1 anoEstTcl anoEstTcl.bed
    hgLoadBed anoGam1 anoEstNcl anoEstNcl.bed
    # Scores are not real so set to 0:
    hgsql anoGam1 -e 'update anoEstTcl set score = 0'
    hgsql anoGam1 -e 'update anoEstNcl set score = 0'
    tail +2 expressed_cl_ids.txt > anoEstExpressed.tab
    hgsql anoGam1 -e \
    'CREATE TABLE anoEstExpressed ( \
      name char(11) not null, \
      PRIMARY KEY(name(11)) \
     ); \
    load data local infile "anoEstExpressed.tab" into table anoEstExpressed'

# MAKE Drosophila Proteins track  (DONE 06-27-05 braney)
    ssh kk
    mkdir -p /cluster/data/anoGam1/bed/tblastn.dm2FB
    cd /cluster/data/anoGam1/bed/tblastn.dm2FB
    ls -1S /iscratch/i/anoGam1/blastDb/*.nsq | sed "s/\.nsq//" > skeeter.lst
    mkdir fbfa
    # calculate a reasonable number of jobs (didn't actually do this for anoGam1)
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc skeeter.lst | awk "{print \\\$1}"`\)
    # 18929/(164630/815) = 93.707921
    split -l 94 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/anoGam1/jkStuff/subChr.lft warn $f.2
        liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/anoGam1/jkStuff/liftAll.lft warn $f.3
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 skeeter.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/anoGam1/bed/tblastn.dm2FB
    para create blastSpec
    para shove   # jobs will need to be restarted, but they should all finish

    exit
    # back to kksilo

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=20000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/anoGam1/bed/tblastn.dm2FB
    para create chainSpec
    para push

    exit
    # back to kksilo
    cd /cluster/data/anoGam1/bed/tblastn.dm2FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/anoGam1/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/anoGam1/bed/tblastn.dm2FB
    hgLoadPsl anoGam1 blastDm2FB.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie)
    ssh hgwdev
    mkdir /cluster/data/anoGam1/bed/geneid
    cd /cluster/data/anoGam1/bed/geneid
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/A.gambiae/golden_path_200302/geneidv1.2/$chr.gtf
    end
    ldHgGene -gtf -genePredExt anoGam1 geneid *.gtf


# SWAP/CHAIN/NET DM2 (DONE 1/13/06 angie)
    ssh hgwdev
    mkdir /cluster/data/anoGam1/bed/blastz.dm2.swap
    cd /cluster/data/anoGam1/bed/blastz.dm2.swap
    doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.anoGam1/DEF >& do.log &
    tail -f do.log
    ln -s blastz.dm2.swap /cluster/data/anoGam1/bed/blastz.dm2


###########################################################################
# SWAP/CHAIN/NET DM3 (DONE 6/7/07 angie)
    ssh kkstore03
    mkdir /cluster/data/anoGam1/bed/blastz.dm3.swap
    cd /cluster/data/anoGam1/bed/blastz.dm3.swap
    doBlastzChainNet.pl -swap /cluster/data/dm3/bed/blastz.anoGam1/DEF >& do.log &
    tail -f do.log
    ln -s blastz.dm3.swap /cluster/data/anoGam1/bed/blastz.dm3


