### gencode.txt wrangler notes for all submissions for "ENCODE Gencode"

select id,name,status from projects where name like "Gencode%";
+-----+---------------------+-----------+
| id  | name                | status    |
+-----+---------------------+-----------+
|  21 | GENCODE_08_10_01    | displayed |
| 265 | Gencode_2009_01_01  | displayed |
+-----+---------------------+-----------+
2 rows in set (0.00 sec)

Sanger Gencode
==============
initial encode Gencode Genes

desc encodeGencodeGeneClassMar07;
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+------------------+-------+
| Field | Type                                                                                                                                                                                                                                              | Null | Key | Default          | Extra |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+------------------+-------+
| name  | varchar(255)                                                                                                                                                                                                                                      |      | PRI |                  |       |
| class | enum('Novel_transcript','Novel_transcript_gencode_conf','Artifact','Known','Novel_CDS',
          'Novel_transcript','Novel_transcript_gencode_conf','Putative','Putative_gencode_conf','TEC',
          'Processed_pseudogene','Unprocessed_pseudogene','Polymorphic') |      |     | Novel_transcript |       |

mysql> select * from encodeGencodeGeneClassMar07 limit 2;
+----------------+----------------------+
| name           | class                |
+----------------+----------------------+
| AC000059.1-001 | Processed_pseudogene |
| AC000061.1-001 | Known                |
+----------------+----------------------+

WG encode Gencode Genes
select * from wgEncodeSangerGencodeGencodeAuto20081001 limit 2;
+-----+-----------------+-------+--------+---------+--------+----------+--------+-----------+--------------------------------------------+--------------------------------------------+-------+-----------------+--------------+------------+--------------+
| bin | name            | chrom | strand | txStart | txEnd  | cdsStart | cdsEnd | exonCount | exonStarts                                 | exonEnds                                   | score | name2           | cdsStartStat | cdsEndStat | exonFrames   |
+-----+-----------------+-------+--------+---------+--------+----------+--------+-----------+--------------------------------------------+--------------------------------------------+-------+-----------------+--------------+------------+--------------+
| 585 | ENST00000382784 | chr11 | -      |  117925 | 119388 |   117925 | 119388 |         2 | 117925,119059,                             | 118376,119388,                             |     0 | ENSG00000206082 | cmpl         | cmpl       | 2,0,         |
| 585 | ENST00000382782 | chr11 | +      |  118170 | 119275 |   118170 | 119275 |         6 | 118170,118244,119031,119092,119174,119246, | 118242,118349,119088,119170,119235,119275, |     0 | ENSG00000206080 | incmpl       | incmpl     | 0,0,0,0,0,1, |
+-----+-----------------+-------+--------+---------+--------+----------+--------+-----------+--------------------------------------------+--------------------------------------------+-------+-----------------+--------------+------------+--------------+
cd /usr/local/apache/htdocs/goldenPath/hg18/wgEncodeSangerGencode
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep ENST00000382784
chr11   ENSEMBL exon    119060  119388  .       -       .        gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3;
chr11   ENSEMBL CDS     119060  119388  .       -       0        gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3;
chr11   ENSEMBL start_codon     119386  119388  .       -       0        gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3;
chr11   ENSEMBL exon    117926  118376  .       -       .        gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3;
chr11   ENSEMBL CDS     117929  118376  .       -       1        gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3;
chr11   ENSEMBL stop_codon      117926  117928  .       -       0        gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3;

Uniformity of files?
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | wc -l               
259464
[hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "transcript_type" | wc -l
259459
[hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "transcript_status" | wc -l
259459
[hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "gene_type" | wc -l
259459
[hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "gene_status" | wc -l
259459
[hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "gene_id" | wc -l
259459
[hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "transcript_id" | wc -l
259459
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep gene_name | wc -l
249356
[hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep transcript_name | wc -l
243875
wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz   259464 lines, 5 without any of the 6: gene_id,gene_type,gene_status,transcript_id,transcript_type,transcript_status
                                                  249356 lines with gene_name 
                                                  243875 lines with transcript_name 
wgEncodeSangerGencodeGencodeManual20081001.gtf.gz 734750 lines, 5 without any of the 8: gene_id,gene_type,gene_status,gene_name,transcript_id,transcript_type,transcript_status,transcript_name
The 5 lines missing these attributes are the header lines:
##description: evidence-based annotation of the human genome (NCBI36)
##provider: GENCODE
##contact: fsk@sanger.ac.uk
##format: gtf 2.2
##date: 2008-10-02


What do we need?
A) gene_name or else transcript_name or else gene_id or else transcript_id in NAME2?
B) itemClassTable (like encodeGencodeGeneClassMar07) to tie gencode_genes.NAME to itemClassTable.name, with class assigned as ?
'Known',  				gene_status else transcript_status = "KNOWN"
'Novel_transcript',     		gene_status else transcript_status = "NOVEL"
'Novel_transcript_gencode_conf',
'Artifact',
'Novel_CDS',
'Putative',				??? No "putative" in submission files 
'Putative_gencode_conf',		???
'TEC',					gene_type else transcript_type = "TEC"
'Processed_pseudogene',			gene_type else transcript_type = "pseudogene" !!! There are qualified variants, e.g. "scRNA_pseudogene"
'Unprocessed_pseudogene',		gene_type else transcript_type = "unprocessed_pseudogene"
'Polymorphic'				gene_type = "polymorphic_pseudogene"
transcript

Streamline the file first:
#1      2       3       4               5               6       7       8        9       10                    11            12                    13              14                                    15                16         17        18                        19          20       21        22               23              24                   25    26    
chr1    HAVANA  exon    13087744        13088030        .       -       .        gene_id "OTTHUMG00000009501"; transcript_id "OTTHUMT00000026267"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_status "UNKNOWN"; gene_type "polymorphic_pseudogene"; gene_status "NOVEL"; gene_name "RP13-221M14.3"; transcript_name "RP13-221M14.3-001"; level 2;
#		h,e,loc,str,gi,ti,gn,tn,gt,tt,gs,ts,lvl
zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | sed "s/\"//g" | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	 printf "%s\t%s\t%s:%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\n",$2,$3,$1,$4,$5,$7,$10,$12,$22,$24,$18,$14,$20,$16,$26;}' >  wgEncodeSangerGencodeGencodeManual20081001.stream.tab
wc -l wgEncodeSangerGencodeGencodeManual20081001.stream.tab: 734745
uniq -f 4 wgEncodeSangerGencodeGencodeManual20081001.stream.tab >  wgEncodeSangerGencodeGencodeManual20081001.uniq.tab
wc -l wgEncodeSangerGencodeGencodeManual20081001.uniq.tab 67432
select count(*) from wgEncodeSangerGencodeGencodeManual20081001; 67432 
=== OR: ===
zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | sed "s/\"//g" | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	 printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\n",$10,$12,$22,$24,$18,$14,$20,$16,$26;}' | uniq > wgEncodeSangerGencodeGencodeManual20081001.stream.tab

* * * * * * 
Manual is uniform, but auto has exceptions:
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep -v gene_name | grep -v transcript_name | wc -l 10108
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep -v gene_name | grep transcript_name | wc -l    0
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep gene_name | grep -v transcript_name | wc -l    5481
* * * * * * 

Load uniq classes tables 
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "level 3" | sed "s/\"//g" | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t3\n",$10,$12,$22,$24,$18,$14,$20,$16;
	     else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") 
	        printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t3\n",  $10,$12,$22,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t3\n",    $10,$12,$18,$14,$20,$16;}' > ggL3.classes.tab
wc -l ggL3.classes.tab 259459
zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | wc -l   259464
uniq ggL3.classes.tab > ggL3.uniq.tab
wc -l ggL3.uniq.tab 16293

zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | grep "level 1" | sed "s/\"//g" | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t1\n",$10,$12,$22,$24,$18,$14,$20,$16;
	     else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") 
	        printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t1\n",  $10,$12,$22,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t1\n",     $10,$12,$18,$14,$20,$16;}' > ggL1.classes.tab
zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | grep "level 2" | sed "s/\"//g" | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t2\n",$10,$12,$22,$24,$18,$14,$20,$16;
	     else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") 
	        printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t2\n",  $10,$12,$22,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t2\n",     $10,$12,$18,$14,$20,$16;}' > ggL2.classes.tab

zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | wc -l  734750
wc -l ggL1.classes.tab 4563
wc -l ggL2.classes.tab 730182    730182 + 4563 + 5 = 734750

uniq ggL1.classes.tab > ggL1.uniq.tab            
uniq ggL2.classes.tab > ggL2.uniq.tab            
wc -l ggL1.uniq.tab 3638
wc -l ggL2.uniq.tab 63794  63794 + 3638 = 67432
cp ggL1.uniq.tab ggMan.uniq.tab
cat ggL2.uniq.tab >> ggMan.uniq.tab 

>ENSEMBL exon    chr11:566486-566592     +       ENSG00000070047 ENST00000264555 K1542_HUMAN     K1542_HUMAN     protein_coding  protein_coding  KNOWN   KNOWN   3


Plan: 
A) load uniq.tabs into hg18 temporarily.
echo "CREATE TABLE wgEncodeSangerGencodeGeneClasses_tmp (
    geneId varchar(255) not null,
    transcriptId varchar(255) not null,
    geneName varchar(255) not null,
    transcriptName varchar(255) not null,
    geneType varchar(255) not null,
    transcriptType varchar(255) not null,
    geneStatus varchar(255) not null,
    transcriptStatus varchar(255) not null,
    level integer,
    class enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') not null default 'Undefined',
    INDEX(transcriptName),
    INDEX(class,level,transcriptType),
    PRIMARY KEY(transcriptId));" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'ggL3.uniq.tab' into table wgEncodeSangerGencodeGeneClasses_tmp" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'ggMan.uniq.tab' into table wgEncodeSangerGencodeGeneClasses_tmp" | hgsql hg18

select count(*) from wgEncodeSangerGencodeGencodeAuto20081001;   |    16293 |
select count(*) from wgEncodeSangerGencodeGencodeManual20081001; |    67432 |
select count(*) from wgEncodeSangerGencodeGeneClasses_tmp;       |    83725 |

B) Update the 2 gpf tracks.
Currently NAME = transcriptId and NAME2 = gene_id;  What do we want? gn > tn > gi > ti ?
update wgEncodeSangerGencodeGencodeAuto20081001,  wgEncodeSangerGencodeGeneClasses_tmp set name2 = wgEncodeSangerGencodeGeneClasses_tmp.gene_name   where wgEncodeSangerGencodeGencodeAuto20081001.name   = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId and wgEncodeSangerGencodeGeneClasses_tmp.gene_name != "";
update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set name2 = wgEncodeSangerGencodeGeneClasses_tmp.gene_name where wgEncodeSangerGencodeGencodeManual20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId;


select count(*) from wgEncodeSangerGencodeGencodeManual20081001; 67432
select count(*) from wgEncodeSangerGencodeGeneClasses_tmp,wgEncodeSangerGencodeGencodeManual20081001 where wgEncodeSangerGencodeGencodeManual20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId;  67432
update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId   where wgEncodeSangerGencodeGencodeManual20081001.name  = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId;
update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeManual20081001.name  = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name where wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId;
select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId;  67432
select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeManual20081001.name  = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name and wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 67432
select count(*) from wgEncodeSangerGencodeGencodeManual20081001 where name = ""; 0

select count(*) from wgEncodeSangerGencodeGencodeAuto20081001; 16293 
select count(*) from tmpGencodeGeneAuto_tab,wgEncodeSangerGencodeGencodeAuto20081001 where wgEncodeSangerGencodeGencodeAuto20081001.name = tmpGencodeGeneAuto_tab.transcriptId; 16293
update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId   where wgEncodeSangerGencodeGencodeAuto20081001.name  = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId;
update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeAuto20081001.name  = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name where wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId and wgEncodeSangerGencodeGeneClasses_tmp.transcript_name != "";
select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId;  16293
select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeAuto20081001.name  = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name and wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 11752
select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeAuto20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 4541

Try again: want name=transcriptId, name2=transcriptName
Currently NAME = transcriptId and NAME2 = gene_id;  What do we want? gn > tn > gi > ti ?
select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where name2 = transcriptId;  16293
update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set name = transcriptId where name2 = transcriptId;
update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set name2 = transcriptName where name2 = transcriptId and transcriptName != "";
select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId;  16293
select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId and name2 != transcriptName;4541

select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where name2 = transcriptId;  67432
update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set name = transcriptId where name2 = transcriptId;
update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set name2 = transcriptName where name2 = transcriptId and transcriptName != "";
select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId;  67432
select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId and name2 != transcriptName;0



C) Class is very tricky

Name                              Color          Contents
----                              -----          --------
Validated_coding                  Dark Yellow    protein_coding/level_1
Validated_processed               Light Yellow   processed_transcript/level_1
Validated_processed_pseudogene   Dark Purple     processed_pseudogene/level_1,
                                                 transcribed_processed_pseudogene/level_1
Validated_unprocessed_pseudogene Medium Purple   unitary_pseudogene/level_1,
                                                 unprocessed_pseudogene/level_1
Validated_pseudogene             Light Purple    IG_pseudogene/level_1,
                                                 pseudogene/level_1, 
                                                 transcribed_pseudogene/level_1
Havana_coding                    Dark Orange     protein_coding/level_2,
                                                 IG_gene/level_2
Havana_nonsense                  Medium Orange   nonsense_mediated_decay/level_2
Havana_non_coding                Light Orange    ambiguous_orf/level_2,
                                                 antisense/level_2,
                                                 non_coding/level_2,
                                                 retained_intron/level_2,
                                                 processed_transcript/level_2
Havana_processed_pseudogene      Dark Pink       processed_pseudogene/level_2,
                                                 transcribed_processed_pseudogene/level_2,        
                                                 transcribed_pseudogene/level_2        
Havana_unprocessed_pseudogene    Medium Pink     polymorphic_pseudogene/level_2,
                                                 transcribed_unprocessed_pseudogene/level_2,
                                                 unitary_pseudogene/level_2,
                                                 unprocessed_pseudogene/level_2
Havana_pseudogene                Light Pink      pseudogene/level_2,
                                                 IG_pseudogene/level_2,
Havana_TEC                       Grey            TEC/level_2,
                                                 artifact/level_2
Havana_polyA                     Black           polyA features
Ensembl_coding                   Dark Red        protein_coding/level_3,
                                                 IG_protein/level_3,
                                                 C_segment/level_3,                      
                                                 J_segment/level_3,                      
                                                 V_segment/level_3
Ensembl_RNA                      Light Red       Mt_tRNA_pseudogene/level_3,
                                                 miRNA/level_3,                      
                                                 miRNA_pseudogene/level_3,                      
                                                 misc_RNA/level_3,                      
                                                 misc_RNA_pseudogene/level_3,                      
                                                 rRNA/level_3,                      
                                                 rRNA_pseudogene/level_3,                      
                                                 scRNA/level_3,                      
                                                 scRNA_pseudogene/level_3,                      
                                                 snRNA/level_3,                      
                                                 snRNA_pseudogene/level_3,                      
                                                 snoRNA/level_3,                      
                                                 snoRNA_pseudogene/level_3,                      
                                                 tRNA_pseudogene/level_3,
Ensembl_pseudogene               Dark Pink       pseudogene/level_3,
                                                 retrotransposed/level_3

select distinct level,transcriptType from wgEncodeSangerGencodeGeneClasses_tmp order by level,transcriptType;
+-------+------------------------------------+
| level | transcriptType                     |
+-------+------------------------------------+
|     1 | IG_pseudogene                      |  Validated_pseudogene
|     1 | processed_pseudogene               |  Validated_processed_pseudogene
|     1 | pseudogene                         |  Validated_pseudogene
|     1 | transcribed_processed_pseudogene   |  Validated_processed_pseudogene
|     1 | transcribed_pseudogene             |  Validated_pseudogene
|     1 | unitary_pseudogene                 |  Validated_unprocessed_pseudogene
|     1 | unprocessed_pseudogene             |  Validated_unprocessed_pseudogene
|     2 | ambiguous_orf                      |  Havana_non_coding
|     2 | antisense                          |  Havana_non_coding
|     2 | artifact                           |  Havana_TEC
|     2 | IG_gene                            |  Havana_coding
|     2 | IG_pseudogene                      |  Havana_pseudogene
|     2 | nonsense_mediated_decay            |  Havana_nonsense
|     2 | non_coding                         |  Havana_non_coding
|     2 | polymorphic_pseudogene             |  Havana_unprocessed_pseudogene
|     2 | processed_pseudogene               |  Havana_processed_pseudogene
|     2 | processed_transcript               |  Havana_non_coding
|     2 | protein_coding                     |  Havana_coding
|     2 | pseudogene                         |  Havana_pseudogene
|     2 | retained_intron                    |  Havana_non_coding
|     2 | TEC                                |  Havana_TEC
|     2 | transcribed_processed_pseudogene   |  Havana_processed_pseudogene
|     2 | transcribed_pseudogene             |  Havana_processed_pseudogene
|     2 | transcribed_unprocessed_pseudogene |  Havana_unprocessed_pseudogene
|     2 | unitary_pseudogene                 |  Havana_unprocessed_pseudogene
|     2 | unprocessed_pseudogene             |  Havana_unprocessed_pseudogene
|     3 | C_segment                          |  Ensembl_coding
|     3 | J_segment                          |  Ensembl_coding
|     3 | miRNA                              |  Ensembl_RNA
|     3 | miRNA_pseudogene                   |  Ensembl_RNA
|     3 | misc_RNA                           |  Ensembl_RNA
|     3 | misc_RNA_pseudogene                |  Ensembl_RNA
|     3 | Mt_tRNA_pseudogene                 |  Ensembl_RNA
|     3 | protein_coding                     |  Ensembl_coding
|     3 | pseudogene                         |  Ensembl_pseudogene
|     3 | retrotransposed                    |  Ensembl_pseudogene
|     3 | rRNA                               |  Ensembl_RNA
|     3 | rRNA_pseudogene                    |  Ensembl_RNA
|     3 | scRNA                              |  Ensembl_RNA
|     3 | scRNA_pseudogene                   |  Ensembl_RNA
|     3 | snoRNA                             |  Ensembl_RNA
|     3 | snoRNA_pseudogene                  |  Ensembl_RNA
|     3 | snRNA                              |  Ensembl_RNA
|     3 | snRNA_pseudogene                   |  Ensembl_RNA
|     3 | tRNA_pseudogene                    |  Ensembl_RNA
|     3 | V_segment                          |  Ensembl_coding
+-------+------------------------------------+
46 rows in set (0.34 sec)

alter table wgEncodeSangerGencodeGeneClasses_tmp add column class enum('Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene','Undefined') not null;

update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Undefined';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_pseudogene' where level = 1 and transcriptType = 'IG_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_pseudogene' where level = 1 and transcriptType = 'pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_pseudogene' where level = 1 and transcriptType = 'transcribed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_processed_pseudogene' where level = 1 and transcriptType = 'processed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_processed_pseudogene' where level = 1 and transcriptType = 'transcribed_processed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_unprocessed_pseudogene' where level = 1 and transcriptType = 'unitary_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_unprocessed_pseudogene' where level = 1 and transcriptType = 'unprocessed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_coding' where level = 2 and transcriptType = 'IG_gene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_coding' where level = 2 and transcriptType = 'protein_coding';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'ambiguous_orf';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'antisense';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'non_coding';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'retained_intron';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'processed_transcript';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_nonsense' where level = 2 and transcriptType = 'nonsense_mediated_decay';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_pseudogene' where level = 2 and transcriptType = 'IG_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_pseudogene' where level = 2 and transcriptType = 'pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_processed_pseudogene' where level = 2 and transcriptType = 'processed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_processed_pseudogene' where level = 2 and transcriptType = 'transcribed_processed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_processed_pseudogene' where level = 2 and transcriptType = 'transcribed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'polymorphic_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'transcribed_unprocessed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'unitary_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'unprocessed_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_TEC' where level = 2 and transcriptType = 'artifact';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_TEC' where level = 2 and transcriptType = 'TEC';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 'C_segment';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 'J_segment';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 'protein_coding';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 'V_segment';                          
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'miRNA';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'miRNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'misc_RNA';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'misc_RNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'Mt_tRNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'rRNA';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'rRNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'scRNA';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'scRNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snoRNA';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snoRNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snRNA';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snRNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'tRNA_pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_pseudogene' where level = 3 and transcriptType = 'pseudogene';
update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_pseudogene' where level = 3 and transcriptType = 'retrotransposed';
select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where class = 'Undefined';

select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where class = 'Havana_processed_pseudogene';                             |     1295 |
select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where level = 2 and transcriptType = 'processed_pseudogene';             |     1238 |
select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where level = 2 and transcriptType = 'transcribed_processed_pseudogene'; |        7 |
select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where level = 2 and transcriptType = 'transcribed_pseudogene';           |       50 |

select distinct class,level,transcriptType from wgEncodeSangerGencodeGeneClasses_tmp order by class,level,transcriptType;
+----------------------------------+-------+------------------------------------+
| class                            | level | transcriptType                     |
+----------------------------------+-------+------------------------------------+
| Validated_processed_pseudogene   |     1 | processed_pseudogene               |
| Validated_processed_pseudogene   |     1 | transcribed_processed_pseudogene   |
| Validated_unprocessed_pseudogene |     1 | unitary_pseudogene                 |
| Validated_unprocessed_pseudogene |     1 | unprocessed_pseudogene             |
| Validated_pseudogene             |     1 | IG_pseudogene                      |
| Validated_pseudogene             |     1 | pseudogene                         |
| Validated_pseudogene             |     1 | transcribed_pseudogene             |
| Havana_coding                    |     2 | IG_gene                            |
| Havana_coding                    |     2 | protein_coding                     |
| Havana_nonsense                  |     2 | nonsense_mediated_decay            |
| Havana_non_coding                |     2 | ambiguous_orf                      |
| Havana_non_coding                |     2 | antisense                          |
| Havana_non_coding                |     2 | non_coding                         |
| Havana_non_coding                |     2 | processed_transcript               |
| Havana_non_coding                |     2 | retained_intron                    |
| Havana_processed_pseudogene      |     2 | processed_pseudogene               |
| Havana_processed_pseudogene      |     2 | transcribed_processed_pseudogene   |
| Havana_processed_pseudogene      |     2 | transcribed_pseudogene             |
| Havana_unprocessed_pseudogene    |     2 | polymorphic_pseudogene             |
| Havana_unprocessed_pseudogene    |     2 | transcribed_unprocessed_pseudogene |
| Havana_unprocessed_pseudogene    |     2 | unitary_pseudogene                 |
| Havana_unprocessed_pseudogene    |     2 | unprocessed_pseudogene             |
| Havana_pseudogene                |     2 | IG_pseudogene                      |
| Havana_pseudogene                |     2 | pseudogene                         |
| Havana_TEC                       |     2 | artifact                           |
| Havana_TEC                       |     2 | TEC                                |
| Ensembl_coding                   |     3 | C_segment                          |
| Ensembl_coding                   |     3 | J_segment                          |
| Ensembl_coding                   |     3 | protein_coding                     |
| Ensembl_coding                   |     3 | V_segment                          |
| Ensembl_RNA                      |     3 | miRNA                              |
| Ensembl_RNA                      |     3 | miRNA_pseudogene                   |
| Ensembl_RNA                      |     3 | misc_RNA                           |
| Ensembl_RNA                      |     3 | misc_RNA_pseudogene                |
| Ensembl_RNA                      |     3 | Mt_tRNA_pseudogene                 |
| Ensembl_RNA                      |     3 | rRNA                               |
| Ensembl_RNA                      |     3 | rRNA_pseudogene                    |
| Ensembl_RNA                      |     3 | scRNA                              |
| Ensembl_RNA                      |     3 | scRNA_pseudogene                   |
| Ensembl_RNA                      |     3 | snoRNA                             |
| Ensembl_RNA                      |     3 | snoRNA_pseudogene                  |
| Ensembl_RNA                      |     3 | snRNA                              |
| Ensembl_RNA                      |     3 | snRNA_pseudogene                   |
| Ensembl_RNA                      |     3 | tRNA_pseudogene                    |
| Ensembl_pseudogene               |     3 | pseudogene                         |
| Ensembl_pseudogene               |     3 | retrotransposed                    |
+----------------------------------+-------+------------------------------------+
46 rows in set (0.11 sec)

rename table wgEncodeSangerGencodeGeneClasses_tmp to wgEncodeSangerGencodeClasses;
alter table wgEncodeSangerGencodeClasses add column name varchar(255); 
update wgEncodeSangerGencodeClasses set name = transcriptId; 
 alter table wgEncodeSangerGencodeClasses drop primary key;
alter table wgEncodeSangerGencodeClasses add primary key(name);
rename table wgEncodeSangerGencodeClasses to wgEncodeGencodeClasses;

GENCODE Round 2
===============
A) Split gtf into header, lvls 1,2,3
head -5 gencode_data.rel2.gtf > gencode.rel2.header.gtf
grep "level 1" gencode_data.rel2.gtf > gencode.rel2.lvl1.gtf
grep "level 2" gencode_data.rel2.gtf > gencode.rel2.lvl2.gtf
grep "level 3" gencode_data.rel2.gtf > gencode.rel2.lvl3.gtf
wc -l gencode_data.rel2.gtf
wc -l gencode.rel2.header.gtf
wc -l gencode.rel2.lvl1.gtf
wc -l gencode.rel2.lvl2.gtf
wc -l gencode.rel2.lvl3.gtf
1238932 gencode_data.rel2.gtf
5 gencode.rel2.header.gtf
12617 gencode.rel2.lvl1.gtf
954754 gencode.rel2.lvl2.gtf
248892 gencode.rel2.lvl3.gtf         5 + 12617 + 954754 + 248892 = 1216268;  1216268 - 1238932 = -22664 Missing!
grep -v "##" gencode_data.rel2.gtf |  grep -v "level" > gencode.rel2.missing.gtf 
wc -l gencode.rel2.missing.gtf
22664 gencode.rel2.missing.gtf
head gencode.rel2.missing.gtf
chr1    HAVANA  polyA_signal    131340  131345  .       -       .       .
chr1    HAVANA  pseudo_polyA    218144  218149  .       -       .       .
grep -v "HAVANA" gencode.rel2.missing.gtf | wc -l
0
grep -v "olyA" gencode.rel2.missing.gtf | wc -l
0
mv gencode.rel2.missing.gtf gencode.rel2.HavanaPolyA.gtf

cp gencode.rel2.header.gtf gencode.rel2.manual.gtf
cat gencode.rel2.lvl1.gtf >> gencode.rel2.manual.gtf
cat gencode.rel2.lvl2.gtf >> gencode.rel2.manual.gtf
cat gencode.rel2.HavanaPolyA.gtf >> gencode.rel2.manual.gtf
cp gencode.rel2.header.gtf gencode.rel2.auto.gtf
cat gencode.rel2.lvl3.gtf >> gencode.rel2.auto.gtf
cp gencode.rel2.header.gtf gencode.rel2.manual_noPolyA.gtf
cat gencode.rel2.lvl1.gtf >> gencode.rel2.manual_noPolyA.gtf
cat gencode.rel2.lvl2.gtf >> gencode.rel2.manual_noPolyA.gtf

B) Doctor missing transcript_ids 

C) format uniq classes
sed "s/\"//g" gencode.rel2.lvl3.gtf | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t3\n",$10,$12,$22,$24,$18,$14,$20,$16;
	     else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") 
	        printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t3\n",  $10,$12,$22,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t3\n",    $10,$12,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "gene_type" && $13 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t3\n",    $10,$10,$12,$12,$14,$14;
             else 
	        printf "EXTERMINATE\t%s\n",$0}' > encode.gencode.classes.lvl3.tab
wc -l gencode.rel2.lvl3.gtf 
wc -l encode.gencode.classes.lvl3.tab 
grep "EXTERMINATE" encode.gencode.classes.lvl3.tab | wc -l
248892 gencode.rel2.lvl3.gtf
248892 encode.gencode.classes.lvl3.tab
0 

uniq encode.gencode.classes.lvl3.tab > encode.gencode.classes.u3.tab            
wc -l encode.gencode.classes.u3.tab 
39695 encode.gencode.classes.u3.tab

sed "s/\"//g" gencode.rel2.lvl1.gtf | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t1\n",$10,$12,$22,$24,$18,$14,$20,$16;
	     else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") 
	        printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t1\n",  $10,$12,$22,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t1\n",     $10,$12,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "gene_type" && $13 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t1\n",    $10,$10,$12,$12,$14,$14;
             else 
	        printf "EXTERMINATE\t%s\n",$0}' > encode.gencode.classes.lvl1.tab
wc -l gencode.rel2.lvl1.gtf 
wc -l encode.gencode.classes.lvl1.tab 
grep "EXTERMINATE" encode.gencode.classes.lvl1.tab | wc -l
12617 gencode.rel2.lvl1.gtf
12617 encode.gencode.classes.lvl1.tab
0

uniq encode.gencode.classes.lvl1.tab > encode.gencode.classes.u1.tab            
wc -l encode.gencode.classes.u1.tab 
7775 encode.gencode.classes.u1.tab

sed "s/\"//g" gencode.rel2.lvl2.gtf | sed "s/;//g" | 
       awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") 
	        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t2\n",$10,$12,$22,$24,$18,$14,$20,$16;
	     else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") 
	        printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t2\n",  $10,$12,$22,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t2\n",     $10,$12,$18,$14,$20,$16;
             else if($9 == "gene_id" && $11 == "gene_type" && $13 == "gene_status") 
	        printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t2\n",    $10,$10,$12,$12,$14,$14;
             else 
	        printf "EXTERMINATE\t%s\n",$0}' > encode.gencode.classes.lvl2.tab
wc -l gencode.rel2.lvl2.gtf 
wc -l encode.gencode.classes.lvl2.tab 
grep "EXTERMINATE" encode.gencode.classes.lvl2.tab | wc -l
954754 gencode.rel2.lvl2.gtf
954754 encode.gencode.classes.lvl2.tab
0

uniq encode.gencode.classes.lvl2.tab > encode.gencode.classes.u2.tab            
wc -l encode.gencode.classes.u2.tab 
100681 encode.gencode.classes.u2.tab

D) tar
rm gencode_rel2.tgz
tar -cpzf gencode_rel2.tgz *  

E) Submit
F) load uniq.tabs into hg18 temporarily.
echo "CREATE TABLE wgEncodeSangerGencodeGeneClasses_tmp (
    geneId varchar(255) not null,
    transcriptId varchar(255) not null,
    geneName varchar(255) not null,
    transcriptName varchar(255) not null,
    geneType varchar(255) not null,
    transcriptType varchar(255) not null,
    geneStatus varchar(255) not null,
    transcriptStatus varchar(255) not null,
    level integer,
    class enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') not null default 'Undefined',
    INDEX(transcriptName),
    INDEX(class,level,transcriptType),
    PRIMARY KEY(transcriptId));" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.u1.tab' into table wgEncodeSangerGencodeGeneClasses_tmp" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.u2.tab' into table wgEncodeSangerGencodeGeneClasses_tmp" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.u3.tab' into table wgEncodeSangerGencodeGeneClasses_tmp" | hgsql hg18

select count(*) from wgEncodeSangerGencodeGencodeAuto20090101;   |    16293 |
select count(*) from wgEncodeSangerGencodeGencodeManual20090101; |    67432 |
select count(*) from wgEncodeSangerGencodeGeneClasses_tmp;       |    83725 |

GENCODE tRNAs and polyAs 2009-04-07
===================================
/cse/staff/tdreszer/docs/ENCODE/gencode/gencode_polyAs.rel2.gtf
/cse/staff/tdreszer/docs/ENCODE/gencode/gencode_tRNAscans.rel2.gtf
 pip cd 265
cp /cse/staff/tdreszer/docs/ENCODE/gencode/gencode_polyAs.rel2.gtf .
cp /cse/staff/tdreszer/docs/ENCODE/gencode/gencode_tRNAscans.rel2.gtf .

create table wgEncodeTmpGencodeAutoRel2 select * from wgEncodeGencodeAutoRel2;

/cluster/bin/x86_64/ldHgGene -gtf -genePredExt -noncoding -oldTable hg18 wgEncodeTmpGencodeAutoRel2 gencode_tRNAscans.rel2.gtf > load_tRnas.out 2>&1 &
[hgwdev:tdreszer 265> Reading gencode_tRNAscans.rel2.gtf
Read 623 transcripts in 623 lines in 1 files
  623 groups 25 seqs 1 sources 1 feature types
0 gene predictions

/cluster/bin/x86_64/ldHgGene -gtf -genePredExt -noncoding hg18 wgEncodeGencodePolyaRel2 gencode_polyAs.rel2.gtf > load_polyAs.out 2>&1 &
/cluster/bin/x86_64/ldHgGene -genePredExt -noncoding hg18 wgEncodeGencodePolyaRel2 gencode_polyAs.rel2.gtf > load_polyAs.out 2>&1 &

 /cluster/bin/x86_64/ldHgGene                                                                     
ldHgGene - load database with gene predictions from a gff file.
usage:
     ldHgGene database table file(s).gff
options:
     -bin         Add bin column (now the default)
     -nobin       don't add binning (you probably don't want this)
     -exon=type   Sets type field for exons to specific value
     -oldTable    Don't overwrite what's already in table
     -noncoding   Forces whole prediction to be UTR
     -gtf         input is GTF, stop codon is not in CDS
     -predTab     input is already in genePredTab format
     -requireCDS  discard genes that don't have CDS annotation
     -out=gpfile  write output, in genePred format, instead of loading
                  table. Database is ignored.
     -genePredExt create a extended genePred, including frame
                  information and gene name
     -impliedStopAfterCds - implied stop codon in GFF/GTF after CDS

head gencode_tRNAscans.rel2.gtf 
chr1    HAVANA  tRNAscan        7912926 7912995 .       -       .       gene_id 199079; transcript_id 199079; genename "Pseudo"; transcriptname "Pseudo";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3;
chr1    HAVANA  tRNAscan        16719667        16719740        .       -       .       gene_id 199126; transcript_id 199126; genename "Asn"; transcriptname "Asn";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3;
chr1    HAVANA  tRNAscan        16731480        16731553        .       -       .       gene_id 199125; transcript_id 199125; genename "Asn"; transcriptname "Asn";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3;
chr1    HAVANA  tRNAscan        16734361        16734432        .       -       .       gene_id 199124; transcript_id 199124; genename "Glu"; transcriptname "Glu";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3;
chr1    HAVANA  tRNAscan        16745021        16745091        .       -       .       gene_id 199123; transcript_id 199123; genename "Gly"; transcriptname "Gly";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3;

head gencode_polyAs.rel2.gtf    
chr1    HAVANA  polyA_site      131340  131345  .       -       .       gene_id 418719; transcript_id 418719; 
chr1    HAVANA  pseudo_polyA    218144  218149  .       -       .       gene_id 418720; transcript_id 418720; 
chr1    HAVANA  polyA_signal    443706  443711  .       -       .       gene_id 418722; transcript_id 418722; 
chr1    HAVANA  polyA_signal    519796  519801  .       +       .       gene_id 418723; transcript_id 418723; 
chr1    HAVANA  polyA_signal    552634  552639  .       -       .       gene_id 418726; transcript_id 418726; 

/cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeTmpGencodeAutoRel2 gencode_tRNAscans.rel2.gtf > load_tRnas.out 2>&1 &
Now the tRNAs are loaded but there are no class records to cover them

grep HAVANA gencode_polyAs.rel2.gtf | wc -l
23036
grep polyA_signal gencode_polyAs.rel2.gtf | wc -l
21636
grep pseudo_polyA gencode_polyAs.rel2.gtf | wc -l
1277
grep polyA_site gencode_polyAs.rel2.gtf | wc -l
42
grep -v polyA_signal gencode_polyAs.rel2.gtf | grep -v pseudo_polyA | grep -v polyA_site

grep PolyA_signal gencode_polyAs.rel2.gtf | wc -l
79

21636 + 1277 + 42 + 79 = 23034

grep -v polyA_signal gencode_polyAs.rel2.gtf | grep -v pseudo_polyA | grep -v PolyA_signal | grep -v polyA_site
"PolyA signal"
grep PolyA gencode_polyAs.rel2.gtf | wc -l
81
grep -v PolyA gencode_polyAs.rel2.gtf > gencode.rel2.polyAs.gtf
grep PolyA gencode_polyAs.rel2.gtf | sed "s/PolyA/polyA/" | sed "s/polyA signal/polyA_signal/" >> gencode.rel2.polyAs.gtf
wc -l gencode.rel2.polyAs.gtf 23041 gencode.rel2.polyAs.gtf
wc -l gencode_polyAs.rel2.gtf 23041 gencode_polyAs.rel2.gtf
 
grep -v polyA_signal gencode.rel2.polyAs.gtf | grep -v pseudo_polyA | grep -v polyA_site | grep -v "##" | wc -l 
0
So now the possible exon types are: polyA_signal,pseudo_polyA,polyA_site

head -5 gencode.rel2.polyAs.gtf > gencode.rel2.polyA_signal.gtf
grep polyA_signal gencode.rel2.polyAs.gtf >> gencode.rel2.polyA_signal.gtf  
head -5 gencode.rel2.polyAs.gtf > gencode.rel2.pseudo_polyA.gtf
grep pseudo_polyA gencode.rel2.polyAs.gtf >> gencode.rel2.pseudo_polyA.gtf  
head -5 gencode.rel2.polyAs.gtf > gencode.rel2.polyA_site.gtf
grep polyA_site gencode.rel2.polyAs.gtf >> gencode.rel2.polyA_site.gtf  

/cluster/bin/x86_64/ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaRel2 gencode.rel2.polyA_signal.gtf > load_polyA.out 2>&1 &
/cluster/bin/x86_64/ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaRel2 gencode.rel2.pseudo_polyA.gtf >> load_polyA.out 2>&1 &
/cluster/bin/x86_64/ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaRel2 gencode.rel2.polyA_site.gtf >> load_polyA.out 2>&1 &

okay, it is now all about classes!

Consistent naming:
rename table wgEncodeTmpGencodeAutoRel2 to wgEncodeGencodeAutoRel2Tmp;
cp gencode_tRNAscans.rel2.gtf gencode.rel2.tRNAs.gtf

Work on tmp table:
create table wgEncodeGencodeClassesRel2Tmp select * from wgEncodeGencodeClassesRel2;

make tRNA only table to simplify
/cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding hg18 wgEncodeGencodeAutoRel2TrnasOnly gencode_tRNAscans.rel2.gtf > load_tRnas2.out 2>&1 &
No need
drop table wgEncodeGencodeAutoRel2TrnasOnly;

desc wgEncodeGencodeClassesRel2Tmp;
+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+-----------+-------+
| Field            | Type                                                                                                                                                                                                                                                                                                                                                                          | Null | Key | Default   | Extra |
+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+-----------+-------+
| geneId           | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| transcriptId     | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| geneName         | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| transcriptName   | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| geneType         | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| transcriptType   | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| geneStatus       | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| transcriptStatus | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
| level            | int(11)                                                                                                                                                                                                                                                                                                                                                                       | YES  |     | NULL      |       |
| class            | enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') |      |     | Undefined |       |
| name             | varchar(255)                                                                                                                                                                                                                                                                                                                                                                  |      |     |           |       |
+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+-----------+-------+

tail -23036 gencode.rel2.polyAs.gtf | sed "s/\"//g" | sed "s/;//g" | 
   awk '{printf "%s\t%s\t\t\t%s\t%s\tUNKNOWN\tUNKNOWN\t2\tHavana_polyA\t%s\n",$10,$12,$3,$3,$10;}' > encode.gencode.classes.polyA.tab
head encode.gencode.classes.polyA.tab
tail -623 gencode.rel2.tRNAs.gtf | sed "s/\"//g" | sed "s/;//g" | 
   awk '{printf "%s\t%s\t\t\t%s\t%s\tUNKNOWN\tUNKNOWN\t3\tEnsembl_RNA\t%s\n",$10,$12,$3,$3,$10;}' > encode.gencode.classes.tRNAs.tab
head encode.gencode.classes.tRNAs.tab

echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.polyA.tab' into table wgEncodeGencodeClassesRel2Tmp" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.tRNAs.tab' into table wgEncodeGencodeClassesRel2Tmp" | hgsql hg18
rename table wgEncodeGencodeClassesRel2 to wgEncodeGencodeClassesRel2Old;
rename table wgEncodeGencodeClassesRel2Tmp to wgEncodeGencodeClassesRel2;

rename table wgEncodeGencodeAutoRel2 to wgEncodeGencodeAutoRel2Old;
rename table wgEncodeGencodeAutoRel2Tmp to wgEncodeGencodeAutoRel2;

select count(*) from wgEncodeGencodeAutoRel2 where chrom NOT in (
"chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10",
"chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20",
"chr21","chr22","chrX","chrY","chrM");
42
select * from wgEncodeGencodeAutoRel2 where chrom NOT in (
"chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10",
"chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20",
"chr21","chr22","chrX","chrY","chrM");
chrMT
select count(*) from wgEncodeGencodeAutoRel2 where chrom = "chrM";
0
update wgEncodeGencodeAutoRel2 set chrom = "chrM" where chrom = "chrMT";

hgsql -N -B -e "select * from wgEncodeGencodeClassesRel2" hg18 > wgEncodeGencodeClassesRel2.tab

TOBEDONE!!!!:
1) Should genePred tables have geneName in name2?  Currently they have geneId
2) Classes table is missing geneName and transcriptName!!!  Must rebuild
3) Once classes is rebuilt, either:
   a) Fix ldHgGene to use geneName, not geneId
   b) Manually update genePred tables with name2 = classes.geneName
   c) Code browser to look up geneName whenever name2 is used!

Changed local ldHgGene
~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeManualNew gencode.rel2.manual_noPolyA.gtf > reload_manual.out 2>&1 &
rename table wgEncodeGencodeManualRel2 to wgEncodeGencodeManualRel2Old;
rename table wgEncodeGencodeManualNew to wgEncodeGencodeManualRel2;
~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeAutoNew gencode.rel2.auto_with_tRNAs.gtf > reload_auto.out 2>&1 &
rename table wgEncodeGencodeAutoRel2 to wgEncodeGencodeAutoRel2Old;
rename table wgEncodeGencodeAutoNew to wgEncodeGencodeAutoRel2;
update wgEncodeGencodeAutoRel2 set chrom = "chrM" where chrom = "chrMT";

# The classes table doesn't have the right key structure (but Rel1 does) so:
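alter table wgEncodeGencodeClassesRel2 add primary key name; ### Doesn't work (presumably because MySQL wants the column list in parentheses: "add primary key (name)" -- not retried here)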
create table wgEncodeGencodeClassesNew like wgEncodeGencodeClassesRel1; ### Doesn't work
CREATE TABLE wgEncodeGencodeClassesNew (
    geneId varchar(255) not null,
    transcriptId varchar(255) not null,
    geneName varchar(255) not null,
    transcriptName varchar(255) not null,
    geneType varchar(255) not null,
    transcriptType varchar(255) not null,
    geneStatus varchar(255) not null,
    transcriptStatus varchar(255) not null,
    level integer,
    class enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') not null default 'Undefined',
    name varchar(255) not null,
    # indexes
    INDEX(transcriptType),
    INDEX(level,class),
    PRIMARY KEY(name)
);

insert into wgEncodeGencodeClassesNew select * from wgEncodeGencodeClassesRel2;
rename table wgEncodeGencodeClassesRel2 to wgEncodeGencodeClassesRel2Old;
rename table wgEncodeGencodeClassesNew to wgEncodeGencodeClassesRel2;

tRNAs appear to be missing!!! 2009-05-05
========================================
~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeTrnaNew gencode.rel2.tRNAs.gtf > reload_tRNAs.out 2>&1 &
 select count(*) from wgEncodeGencodeTrnaNew;|        0 |
~/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt hg18 wgEncodeGencodeTrnaNew gencode.rel2.tRNAs.gtf > reload_tRNAs.out 2>&1 &
 select count(*) from wgEncodeGencodeTrnaNew;|      623 |
select count(*) from wgEncodeGencodeTrnaNew t1, wgEncodeGencodeClassesRel2 t2 where t1.name = t2.name;|      623 |
# So these transcripts are already in wgEncodeGencodeClassesRel2
insert into wgEncodeGencodeAutoRel2 select * from wgEncodeGencodeTrnaNew;

update wgEncodeGencodeAutoRel2 set chrom = "chrM" where chrom = "chrMT";
drop table wgEncodeGencodeTrnaNew;

/cluster/data/encode/pipeline/bin/encodeStatus.pl 21 approved
/cluster/data/encode/pipeline/bin/encodeStatus.pl 265 approved

/cluster/data/encode/pipeline/bin/encodeStatus.pl 21 reviewing
/cluster/data/encode/pipeline/bin/encodeStatus.pl 265 reviewing

/cluster/data/encode/pipeline/bin/encodeStatus.pl 21 released
/cluster/data/encode/pipeline/bin/encodeStatus.pl 265 released


Release 3 2009-08-26
====================
start with 
cd {pip}/265
mkdir rel3; cd rel3
ftp release_3_DCC.tgz into .
tar -tzf release_3_DCC.tgz
tar -xzf release_3_DCC.tgz
cd to_release
wc -l classes.def  68
uniq -u < classes.def | wc -l   68
sort -k 2,3 classes.def | uniq -f 1 | wc -l 19
sort -k 2,3 classes.def > classes_sorted.def
sort -k 2,3 classes.def | uniq -f 1 > classes_uniq.def
sort -f classes.def > classes_types.def
# Use these files to create unique lists of classes, types (and relation between the two)
# Figure out what the differences between Rel2 and Rel3 are for classes and types

# Classes:
Validated_coding: protein_coding
Validated_processed
Validated_processed_pseudogene: processed_pseudogene,processed_transcript,transcribed_processed_pseudogene
Validated_unprocessed_pseudogene: transcribed_unprocessed_pseudogene,unprocessed_pseudogene
Validated_pseudogene: IG_pseudogene,polymorphic_pseudogene,pseudogene,retrotransposed,unitary_pseudogene
Havana_coding: IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,protein_coding
Havana_non_coding: ambiguous_orf,antisense,non_coding,processed_transcript,retained_intron
Havana_nonsense: nonsense_mediated_decay
Havana_polyA: polyA_signal,polyA_site,pseudo_polyA
Havana_processed_pseudogene: processed_pseudogene,transcribed_processed_pseudogene
Havana_unprocessed_pseudogene: transcribed_unprocessed_pseudogene,unprocessed_pseudogene
Havana_pseudogene: IG_pseudogene,TR_pseudogene,polymorphic_pseudogene,pseudogene,retrotransposed,unitary_pseudogene
Havana_TEC: TEC,artifact
Ensembl_coding: IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,protein_coding
Ensembl_non_coding: antisense,non_coding,processed_transcript,retained_intron
Ensembl_processed_pseudogene: processed_pseudogene
Ensembl_unprocessed_pseudogene: unprocessed_pseudogene
Ensembl_pseudogene: IG_pseudogene,miRNA_pseudogene,misc_RNA_pseudogene,pseudogene,retrotransposed,unitary_pseudogene
Ensembl_RNA: Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,miRNA,misc_RNA,rRNA,rRNA_pseudogene,scRNA_pseudogene,snRNA,snRNA_pseudogene,snoRNA,snoRNA_pseudogene,tRNA_pseudogene,tRNAscan

# Class List (superset of Rel2):
Validated_coding,Validated_processed,Validated_processed_pseudogene,Validated_unprocessed_pseudogene,Validated_pseudogene,Havana_coding,Havana_non_coding,Havana_nonsense,Havana_polyA,Havana_processed_pseudogene,Havana_unprocessed_pseudogene,Havana_pseudogene,Havana_TEC,Ensembl_coding,Ensembl_non_coding,Ensembl_processed_pseudogene,Ensembl_unprocessed_pseudogene,Ensembl_pseudogene,Ensembl_RNA

# Types:
ambiguous_orf: Havana_non_coding
antisense: Havana_non_coding,Ensembl_non_coding
artifact: Havana_TEC
IG_C_gene: Havana_coding,Ensembl_coding
IG_D_gene: Havana_coding,Ensembl_coding
IG_J_gene: Havana_coding,Ensembl_coding
IG_pseudogene: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene
IG_V_gene: Havana_coding,Ensembl_coding
miRNA: Ensembl_RNA
miRNA_pseudogene: Ensembl_pseudogene
misc_RNA: Ensembl_RNA
misc_RNA_pseudogene: Ensembl_pseudogene
Mt_rRNA: Ensembl_RNA
Mt_tRNA: Ensembl_RNA
Mt_tRNA_pseudogene: Ensembl_RNA
nonsense_mediated_decay: Havana_nonsense
non_coding: Havana_non_coding,Ensembl_non_coding
polyA_signal: Havana_polyA
polyA_site: Havana_polyA
polymorphic_pseudogene: Validated_pseudogene,Havana_pseudogene
processed_pseudogene: Validated_processed_pseudogene,Havana_processed_pseudogene,Ensembl_processed_pseudogene
processed_transcript: Validated_processed_pseudogene,Havana_non_coding,Ensembl_non_coding
protein_coding: Validated_coding,Havana_coding,Ensembl_coding
pseudogene: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene
pseudo_polyA: Havana_polyA
retained_intron: Havana_non_coding,Ensembl_non_coding
retrotransposed: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene
rRNA: Ensembl_RNA
rRNA_pseudogene: Ensembl_RNA
scRNA_pseudogene: Ensembl_RNA
snoRNA: Ensembl_RNA
snoRNA_pseudogene: Ensembl_RNA
snRNA: Ensembl_RNA
snRNA_pseudogene: Ensembl_RNA
TEC: Havana_TEC
transcribed_processed_pseudogene: Validated_processed_pseudogene,Havana_processed_pseudogene
transcribed_unprocessed_pseudogene: Validated_unprocessed_pseudogene,Havana_unprocessed_pseudogene
tRNAscan: Ensembl_RNA
tRNA_pseudogene: Ensembl_RNA
TR_pseudogene: Havana_pseudogene
unitary_pseudogene: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene
unprocessed_pseudogene: Validated_unprocessed_pseudogene,Havana_unprocessed_pseudogene,Ensembl_unprocessed_pseudogene

# Type List:
ambiguous_orf,antisense,artifact,IG_C_gene,IG_D_gene,IG_J_gene,IG_pseudogene,IG_V_gene,miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene,Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene,scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,tRNAscan,tRNA_pseudogene,TR_pseudogene,unitary_pseudogene,unprocessed_pseudogene

Rel3: ambiguous_orf,antisense,artifact,                  IG_C_gene,IG_D_gene,IG_J_gene,IG_pseudogene,IG_V_gene,          miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene,Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene,      scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,                       tRNAscan,tRNA_pseudogene,TR_pseudogene,          unitary_pseudogene,unprocessed_pseudogene
Rel2: ambiguous_orf,antisense,artifact,C_segment,IG_gene,                              IG_pseudogene,          J_segment,miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene,                Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene,scRNA,scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,transcribed_pseudogene,tRNAscan,tRNA_pseudogene,              V_segment,unitary_pseudogene,unprocessed_pseudogene

# New in Rel3:
IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,Mt_rRNA,Mt_tRNA,TR_pseudogene,
# Missing in Rel3:
C_segment,IG_gene,J_segment,scRNA,transcribed_pseudogene,V_segment,

# Make new trackDb setting based upon Rel3:
filterBy level:Level=+Validated,Manual_annotation,Automatic_annotation class:Class=Validated_coding,Validated_processed,Validated_processed_pseudogene,Validated_unprocessed_pseudogene,Validated_pseudogene,Havana_coding,Havana_non_coding,Havana_nonsense,Havana_polyA,Havana_processed_pseudogene,Havana_unprocessed_pseudogene,Havana_pseudogene,Havana_TEC,Ensembl_coding,Ensembl_non_coding,Ensembl_processed_pseudogene,Ensembl_unprocessed_pseudogene,Ensembl_pseudogene,Ensembl_RNA transcriptType:Transcript_Type=ambiguous_orf,antisense,artifact,IG_C_gene,IG_D_gene,IG_J_gene,IG_pseudogene,IG_V_gene,miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene,Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene,scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,tRNAscan,tRNA_pseudogene,TR_pseudogene,unitary_pseudogene,unprocessed_pseudogene

wc -l:
84824 gencode.v3.level_1_2_annotation.NCBI36.classes
42472 gencode.v3.level_3_annotation.NCBI36.classes
127296 gencode.v3.annotation.NCBI36.classes

23361 gencode.v3.polyAs.NCBI36.classes
621 gencode.v3.tRNAs.NCBI36.classes

Make special DAF/DDF in to_release, then tar all
tar -cpzf gencode_rel3.tgz *
Submit
cd ../../../453 

m validate_error 
sh: line 1:  1118 Segmentation fault      gtfToGenePred /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.le
vel_1_2_annotation.NCBI36.gtf /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/doEncodeValidate.gtf.bed > /hive/groups
/encode/dcc/pipeline/encpipeline_prod/453/doEncodeValidate.gtf.err 2>&1
File 'gencode.v3.level_1_2_annotation.NCBI36.gtf' failed GTF validation

~/bin/x86_64/gtfToGenePred -allErrors gencode.v3.level_1_2_annotation.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1
gtfToGenePred gencode.v3.level_3_annotation.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1
gtfToGenePred gencode.v3.polyAs.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1
~/bin/x86_64/gtfToGenePred gencode.v3.tRNAs.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1

# Turns out Mark has been making changes to gtfToGenePred, but when I compiled the latest, running it by hand did not segFault
resubmit

m validate_error 
File 'gencode.v3.level_1_2_annotation.NCBI36.gtf' failed GTF validation
no exons defined for ENSG00000223972
~/bin/x86_64/gtfToGenePred -allErrors gencode.v3.level_1_2_annotation.NCBI36.gtf v3Manual.gtf.bed > v3Manual.gtf.err 2>&1
~/bin/x86_64/gtfToGenePred -allErrors gencode.v3.level_3_annotation.NCBI36.gtf v3Auto.gtf.bed > v3Auto.gtf.err 2>&1
~/bin/x86_64/gtfToGenePred -allErrors -impliedStopAfterCds gencode.v3.polyAs.NCBI36.gtf v3polyAs.gtf.bed > v3polyAs.gtf.err 2>&1
~/bin/x86_64/gtfToGenePred -allErrors -impliedStopAfterCds gencode.v3.tRNAs.NCBI36.gtf v3tRNAs.gtf.bed > v3tRNAs.gtf.err 2>&1
84824 v3Manual.gtf.bed 28387 v3Manual.gtf.err
42472 v3Auto.gtf.bed 27353 v3Auto.gtf.err
0 v3polyAs.gtf.bed 23362 v3polyAs.gtf.err
0 v3tRNAs.gtf.bed 622 v3tRNAs.gtf.err
~/bin/x86_64/gtfToGenePred -allErrors -impliedStopAfterCds gencode.v3.polyAs.NCBI36.gtf v3polyAs.gtf.bed > v3polyAs.gtf.err 2>&1
~/bin/x86_64/gtfToGenePred -allErrors -impliedStopAfterCds gencode.v3.tRNAs.NCBI36.gtf v3tRNAs.gtf.bed > v3tRNAs.gtf.err 2>&1
# doesn't change anything.  What is needed:
# For polyAs and tRNAs do not use pipeline.  For Manual and Auto, send word to Felix

## Did Rel2 gtf pass gtfToGenePred ?
cd ../265
~/bin/x86_64/gtfToGenePred -allErrors gencode.rel2.manual.gtf test.gtf.bed > test.gtf.err 2>&1
No to manual.  Yes to auto.
~/bin/x86_64/gtfToGenePred -allErrors gencode.rel2.manual_noPolyA.gtf test.gtf.bed > test.gtf.err 2>&1
Only "olyA"s fail.  So yes Rel2 gtfs were okay by validator

At this point Felix agrees to remake the datasets

Release 3b 2009-09-03
=====================
cd pip 453
mkdir v3b
ftp://ftp.sanger.ac.uk/pub/gencode/gencode.v3b.for_DCC.tgz to v3b
mkdir v3
mv gencode.* v3
cd v3b
tar -tzf gencode.v3b.for_DCC.tgz
tar -xzf gencode.v3b.for_DCC.tgz
mv to_release/* ..
submit
loaded!
rename table wgEncodeSangerGencodeGencodeAutoV3   to wgEncodeGencodeAutoV3;
rename table wgEncodeSangerGencodeGencodeManualV3 to wgEncodeGencodeManualV3;

show tables like "wgEncodeGencode%";
| wgEncodeGencodeAutoRel1           | 
| wgEncodeGencodeAutoRel2           | 
| wgEncodeGencodeAutoV3             | 
| wgEncodeGencodeClassesRel1        | 
| wgEncodeGencodeClassesRel2        | 
| wgEncodeGencodeClassesRel2_full   | 
| wgEncodeGencodeClassesRel2_unused | 
| wgEncodeGencodeManualRel1         | 
| wgEncodeGencodeManualRel2         | 
| wgEncodeGencodeManualV3           | 
| wgEncodeGencodePolyaRel2          | 
 
Now the difficult part of loading polyAs, tRNAs and classes

create table wgEncodeGencodeAutoV3_tmp select * from wgEncodeGencodeAutoV3;

/cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeGencodeAutoV3_tmp gencode.v3.tRNAs.NCBI36.gtf > load_tRnas.out 2>&1 &
Reading gencode.v3.tRNAs.NCBI36.gtf
Read 621 transcripts in 621 lines in 1 files
  621 groups 24 seqs 1 sources 1 feature types
621 gene predictions

rename table wgEncodeGencodeAutoV3 to wgEncodeGencodeAutoV3_noTrnas;
rename table wgEncodeGencodeAutoV3_tmp to wgEncodeGencodeAutoV3;

grep HAVANA gencode.v3.polyAs.NCBI36.gtf | wc -l
23361
grep polyA_signal gencode.v3.polyAs.NCBI36.gtf | wc -l
21397
grep pseudo_polyA gencode.v3.polyAs.NCBI36.gtf | wc -l
1973
grep polyA_site gencode.v3.polyAs.NCBI36.gtf | wc -l
36
grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site
header
grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site > gencode.v3.polyAs.NCBI36.header.gtf
cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.signal.gtf
cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.pseudo.gtf
cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.site.gtf
grep polyA_signal gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.signal.gtf
grep pseudo_polyA gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.pseudo.gtf
grep polyA_site gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.site.gtf

So now the possible exon types are: polyA_signal,pseudo_polyA,polyA_site

# Trick to create empty table
create table wgEncodeGencodePolyaV3 select * from wgEncodeGencodePolyaRel2;
delete from wgEncodeGencodePolyaV3;

~/bin/x86_64/ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.signal.gtf > load_polyA.out 2>&1 &
/cluster/bin/x86_64/ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.pseudo.gtf >> load_polyA.out 2>&1 &
/cluster/bin/x86_64/ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.site.gtf >> load_polyA.out 2>&1 &
Reading gencode.v3.polyAs.NCBI36.signal.gtf
Read 21397 transcripts in 21397 lines in 1 files
  21397 groups 24 seqs 1 sources 2 feature types
21364 gene predictions
Reading gencode.v3.polyAs.NCBI36.pseudo.gtf
Read 1973 transcripts in 1973 lines in 1 files
  1973 groups 24 seqs 1 sources 3 feature types
1961 gene predictions
Reading gencode.v3.polyAs.NCBI36.site.gtf
Read 36 transcripts in 36 lines in 1 files
  36 groups 6 seqs 1 sources 1 feature types
36 gene predictions

# Time to build the classes table
ls -1 *.NCBI36*.classes
gencode.v3.polyAs.NCBI36.classes
gencode.v3.tRNAs.NCBI36.classes
gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.classes
gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.classes
head -2 gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.classes
geneId          transcriptId    transcriptType          level   Class
ENSG00000223972 ENST00000450305 unprocessed_pseudogene  2       Havana_unprocessed_pseudogene
ENSG00000227232 ENST00000488147 unprocessed_pseudogene  2       Havana_unprocessed_pseudogene

E) Submit
F) load uniq.tabs into hg18 temporarily.
echo "CREATE TABLE wgEncodeGencodeClassesV3 (
    geneId varchar(255) not null,
    name varchar(255) not null,
    transcriptType varchar(255) not null,
    level integer,
    class enum ('Undefined',
      'Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene',
      'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC',
      'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene','Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA')
     not null default 'Undefined',
    # indexes
    INDEX(transcriptType),
    INDEX(level,class),
    PRIMARY KEY(name));" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3.polyAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3.tRNAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
select count(*) from wgEncodeGencodeClassesV3;|   151278 | 
select count(*) from wgEncodeGencodeAutoV3;   |    43093 | +
select count(*) from wgEncodeGencodeManualV3; |    84824 | + 
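select count(*) from wgEncodeGencodePolyaV3;  |    23325 | = 151242 What are the extra 36?
# NOTE: 23325 = 21364 (polyA_signal) + 1961 (pseudo_polyA) gene predictions reported by the loads above, so the 36 missing rows appear to be the 36 polyA_site predictions not present in wgEncodeGencodePolyaV3 (to be confirmed).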

!!! Once again I fell into the trap: geneId is used as name2 by default but I need geneName!!!

# My own copy of ldHgGene uses geneName as name2!
create table wgEncodeGencodeManualV3_old select * from wgEncodeGencodeManualV3;
delete from wgEncodeGencodeManualV3;
~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeManualV3 gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.gtf > reload_manual.out 2>&1 &
Reading gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.gtf
Read 84824 transcripts in 1116896 lines in 1 files
  84824 groups 24 seqs 1 sources 6 feature types
84824 gene predictions
| 585 | ENST00000450305 | chr1  | +      |    1872 |  3533 |     3533 |   3533 |         6 | 1872,2041,2475,2837,3083,3315,                            | 1920,2090,2560,2915,3237,3533,                            |     0 | RP11-34P13.1 | none         | none       | -1,-1,-1,-1,-1,-1,                | 
                                                                    ------------
drop table wgEncodeGencodeManualV3_old;

create table wgEncodeGencodeAutoV3_noTrnas_old select * from wgEncodeGencodeAutoV3_noTrnas;
delete from wgEncodeGencodeAutoV3_noTrnas;
create table wgEncodeGencodeAutoV3_Trnas select * from wgEncodeGencodeAutoV3_noTrnas;
~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeAutoV3_noTrnas gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.gtf > reload_auto.out 2>&1 &
Reading gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.gtf
Read 42472 transcripts in 831718 lines in 1 files
  42472 groups 25 seqs 1 sources 6 feature types
42472 gene predictions

/cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeGencodeAutoV3_Trnas gencode.v3.tRNAs.NCBI36.gtf > load_tRnas.out 2>&1 &
Reading gencode.v3.tRNAs.NCBI36.gtf
Read 621 transcripts in 621 lines in 1 files
  621 groups 24 seqs 1 sources 1 feature types
621 gene predictions

drop table wgEncodeGencodeAutoV3;
create table wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_noTrnas;
insert into wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_Trnas;
select count(*) from wgEncodeGencodeAutoV3;|    43093 | 
| 585 | ENST00000456328 | chr1  | +      |    1736 |  4275 |     4021 |   4252 |         3 | 1736,2475,3083,                                                | 2090,2584,4275,                                                |     0 | RP11-34P13.1 | cmpl         | cmpl       | -1,-1,0,                   | 
                                                                         ------------
drop table wgEncodeGencodeAutoV3_noTrnas_old;

show tables like "wgEncodeGencode%V3%";
+--------------------------------------+
| Tables_in_hg18 (wgEncodeGencode%V3%) |
+--------------------------------------+
| wgEncodeGencodeAutoV3                | 
| wgEncodeGencodeAutoV3_Trnas          | 
| wgEncodeGencodeAutoV3_noTrnas        | 
| wgEncodeGencodeClassesV3             | 
| wgEncodeGencodeManualV3              | 
| wgEncodeGencodePolyaV3               | 
+--------------------------------------+
6 rows in set (0.09 sec)

Now edit trackDb.wgEncode.ra making an alpha and beta version of the wgEncodeSangerGencode track.
Add 3 new classes with colors to the track.
Because of numerous differences, and because there will soon be an hg19 version, I am creating a new composite
called wgEncodeGencode for this release 3.  The html will be moved immediately from hg18 to human (but rel2 remains as SangerGencode at hg18).

# Spend time massaging metadata
# set up search specs for hgFindSpec

/cluster/data/encode/pipeline/bin/encodeStatus.pl 453 displayed

### 2009-10-02 
Controversy remains for colors and for OTTER IDs vs. Ensembl Ids.  Need to resolve before pushing to QA or having Brian do hg19.

2009-11-02 Release 3c 
=====================
cd pip 453
mkdir v3c
ftp://ftp.sanger.ac.uk/pub/gencode/gencode.v3c.for_DCC.tgz to v3c
mkdir v3
mv gencode.* v3
cd v3c
tar -tzf gencode.v3c.for_DCC.tgz
tar -xzf gencode.v3c.for_DCC.tgz
mv forDCC/* ..
# edit DAF/DDF to try to include new 2-way track (polyAs, tRNAs and classes still go by hand)
# unload 453 (since names are changed, all should be well).
submit
# 1) Needed dafVersion 1.1
# 2) needed validationSettings (allowReload seems appropriate)
# 3) No such luck on 2-way "no exons defined for Overlap1"
loaded!

#save first:
rename table wgEncodeGencodeAutoV3   to wgEncodeGencodeAutoV3b;
rename table wgEncodeGencodeManualV3 to wgEncodeGencodeManualV3b;
# now put in place:
rename table wgEncodeSangerGencodeGencodeAutoV3   to wgEncodeGencodeAutoV3;
rename table wgEncodeSangerGencodeGencodeManualV3 to wgEncodeGencodeManualV3;

rename table wgEncodeGencodePolyaV3 to wgEncodeGencodePolyaV3b;
rename table wgEncodeGencodeClassesV3 to wgEncodeGencodeClassesV3b;
rename table wgEncodeGencodeAutoV3_Trnas to wgEncodeGencodeAutoV3b_Trnas;
rename table wgEncodeGencodeAutoV3_noTrnas to wgEncodeGencodeAutoV3b_noTrnas;
 
show tables like "wgEncodeGencode%";
| wgEncodeGencodeAutoRel1           |     old: v1
| wgEncodeGencodeAutoRel2           |     old: v2
| wgEncodeGencodeAutoV3             | <== New v3c (needs tRNAs)
| wgEncodeGencodeAutoV3b            |     old: v3b
| wgEncodeGencodeAutoV3b_Trnas      |     old: working on it
| wgEncodeGencodeAutoV3b_noTrnas    |     old: working on it
| wgEncodeGencodeClassesRel1        |     old: v1
| wgEncodeGencodeClassesRel2        |     old: v2
| wgEncodeGencodeClassesRel2_full   |     old: working on it
| wgEncodeGencodeClassesRel2_unused |     old: working on it
| wgEncodeGencodeClassesV3b         |     old: v3b
| wgEncodeGencodeManualRel1         |     old: v1
| wgEncodeGencodeManualRel2         |     old: v2
| wgEncodeGencodeManualV3           | <== New v3c
| wgEncodeGencodeManualV3b          |     old: v3b
| wgEncodeGencodePolyaRel2          |     old: v2
| wgEncodeGencodePolyaV3b           |     old: v3b

!!! Once again: geneId is used as name2 by default but I need geneName!!!

# Use my local version of ldHgGene, since I edited genePred.c but never did the plumbing to make it a full option
#   because: geneId is used as name2 by default but I need geneName!!!

# empty table!
create table wgEncodeGencodeManualV3_old select * from wgEncodeGencodeManualV3;
# Records: 87627  Duplicates: 0  Warnings: 0
delete from wgEncodeGencodeManualV3;

~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeManualV3 gencode.v3c.annotation.NCBI36.level_1_2.gtf > reload_manual.out 2>&1 &
Reading gencode.v3c.annotation.NCBI36.level_1_2.gtf
Read 87627 transcripts in 1154766 lines in 1 files
  87627 groups 24 seqs 1 sources 6 feature types
87627 gene predictions
| 585 | ENST00000450305 | chr1  | +      |    1872 |  3533 |     3533 |   3533 |         6 | 1872,2041,2475,2837,3083,3315, | 1920,2090,2560,2915,3237,3533, |     0 | RP11-34P13.1 | none         | none       | -1,-1,-1,-1,-1,-1, | 
                                                                    ------------
drop table wgEncodeGencodeManualV3_old;

# empty table!
create table wgEncodeGencodeAutoV3_noTrnas select * from wgEncodeGencodeAutoV3;
# Records: 43889  Duplicates: 0  Warnings: 0
delete from wgEncodeGencodeAutoV3_noTrnas;
create table wgEncodeGencodeAutoV3_Trnas select * from wgEncodeGencodeAutoV3_noTrnas;
# Records: 0  Duplicates: 0  Warnings: 0

~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeAutoV3_noTrnas gencode.v3c.annotation.NCBI36.level_3.gtf > reload_auto.out 2>&1 &
Reading gencode.v3c.annotation.NCBI36.level_3.gtf
Read 43889 transcripts in 867844 lines in 1 files
  43889 groups 25 seqs 1 sources 6 feature types
43889 gene predictions

/cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeGencodeAutoV3_Trnas gencode.v3.tRNAs.NCBI36.gtf > load_tRnas.out 2>&1 &
Reading gencode.v3.tRNAs.NCBI36.gtf
Read 621 transcripts in 621 lines in 1 files
  621 groups 24 seqs 1 sources 1 feature types
621 gene predictions

drop table wgEncodeGencodeAutoV3;
create table wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_noTrnas;
insert into wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_Trnas;
select count(*) from wgEncodeGencodeAutoV3;|    44510 | 
| 585 | ENST00000456328 | chr1  | +      |    1736 |  4275 |     4021 |   4252 |         3 | 1736,2475,3083, | 2090,2584,4275, |     0 | RP11-34P13.1 | cmpl         | cmpl       | -1,-1,0,   | 
                                                                         ------------
drop table wgEncodeGencodeAutoV3_noTrnas_old;

### Now the difficult part of loading polyAs, 2way and classes

# determine if only 3 types of polyA:
grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site
# Output: header only
grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site > gencode.v3.polyAs.NCBI36.header.gtf
cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.signal.gtf
cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.pseudo.gtf
cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.site.gtf
grep polyA_signal gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.signal.gtf
grep pseudo_polyA gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.pseudo.gtf
grep polyA_site gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.site.gtf

# So now the possible exon types are: polyA_signal,pseudo_polyA,polyA_site

# empty table:
create table wgEncodeGencodePolyaV3 select * from wgEncodeGencodePolyaV3b;
delete from wgEncodeGencodePolyaV3;

~/bin/x86_64/ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.signal.gtf > load_polyA.out 2>&1
/cluster/bin/x86_64/ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.pseudo.gtf >> load_polyA.out 2>&1
/cluster/bin/x86_64/ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.site.gtf >> load_polyA.out 2>&1
Reading gencode.v3.polyAs.NCBI36.signal.gtf
Read 21397 transcripts in 21397 lines in 1 files
  21397 groups 24 seqs 1 sources 2 feature types
21364 gene predictions
Reading gencode.v3.polyAs.NCBI36.pseudo.gtf
Read 1973 transcripts in 1973 lines in 1 files
  1973 groups 24 seqs 1 sources 3 feature types
1961 gene predictions
Reading gencode.v3.polyAs.NCBI36.site.gtf
Read 36 transcripts in 36 lines in 1 files
  36 groups 6 seqs 1 sources 1 feature types
36 gene predictions

# Now 2way...
create table wgEncodeGencode2WayV3 select * from wgEncodeGencodePolyaV3;
delete from wgEncodeGencode2WayV3;

~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencode2WayV3 gencode.v3.2wayconspseudos.NCBI36.gtf  > load_2way.out 2>&1
Reading gencode.v3.2wayconspseudos.NCBI36.gtf
Read 9474 transcripts in 9474 lines in 1 files
  9474 groups 24 seqs 1 sources 1 feature types
0 gene predictions
### Didn't work...

grep Yale_UCSC gencode.v3.2wayconspseudos.NCBI36.gtf | wl    9474
grep transcript gencode.v3.2wayconspseudos.NCBI36.gtf | wl   9474
# so let's treat "transcript" as the exon
~/bin/x86_64/ldHgGene -exon=transcript -genePredExt -noncoding -oldTable hg18 wgEncodeGencode2WayV3 gencode.v3.2wayconspseudos.NCBI36.gtf > load_2way.out 2>&1
Reading gencode.v3.2wayconspseudos.NCBI36.gtf
Read 9474 transcripts in 9474 lines in 1 files
  9474 groups 24 seqs 1 sources 1 feature types
9474 gene predictions
| 585 | Overlap414 | chr1  | -      |  118891 | 123443 |        0 |      0 |         1 | 118891,    | 123443,  |     0 | Overlap414 | none         | none       | -1,        | 

select count(*) from wgEncodeGencode2WayV3;|     9474 | 

# Time to build the classes table
ls -1 *.NCBI36*.classes
gencode.v3.2wayconspseudos.NCBI36.classes
gencode.v3.polyAs.NCBI36.classes
gencode.v3.tRNAs.NCBI36.classes
gencode.v3c.annotation.NCBI36.level_1_2.classes
gencode.v3c.annotation.NCBI36.level_3.classes
head -2 gencode.v3c.annotation.NCBI36.level_1_2.classes
geneId          transcriptId    transcriptType          level   Class                           OTT TranscriptId          OTT GeneId
ENSG00000223972 ENST00000450305 unprocessed_pseudogene  2       Havana_unprocessed_pseudogene   OTTHUMG00000000961        OTTHUMT00000002844
ENSG00000227232 ENST00000488147 unprocessed_pseudogene  2       Havana_unprocessed_pseudogene   OTTHUMG00000000958        OTTHUMT00000002839
head -1 gencode.v3c.annotation.NCBI36.level_3.classes
ENSG00000223972 ENST00000456328 protein_coding  3       Ensembl_coding  .       .
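# Notice that OTTs are '.'
# NOTE: in the level_1_2 sample rows above, the column headed "OTT TranscriptId" holds OTTHUMG... values and the column headed "OTT GeneId" holds OTTHUMT... values, so the two header labels look swapped relative to the data (presumably OTTHUMG = gene, OTTHUMT = transcript -- confirm before relying on the ottTranscriptId/ottGeneId column order).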

head -2 gencode.v3.2wayconspseudos.NCBI36.classes
Overlap1        Overlap1        pseudogene      3       2way_pseudogene
Overlap2        Overlap2        pseudogene      3       2way_pseudogene
head -1 gencode.v3.polyAs.NCBI36.classes
440716  440716  polyA_signal    2       Havana_polyA
head -1 gencode.v3.tRNAs.NCBI36.classes
38172   38172   tRNAscan        3       Ensembl_RNA
# Notice that there are no fields for OTT in these three files.

What to do?  Load them then set all '.' OTTs to NULL

E) Submit
F) load uniq.tabs into hg18 temporarily.
echo "CREATE TABLE wgEncodeGencodeClassesV3 (
    geneId varchar(255) not null,
    name varchar(255) not null,
    transcriptType varchar(255) not null,
    level integer,
    class enum ('Undefined',
      'Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene',
      'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC',
      'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene','Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA')
     not null default 'Undefined',
    ottTranscriptId varchar(255),
    ottGeneId varchar(255),
    # indexes
    INDEX(transcriptType),
    INDEX(level,class),
    PRIMARY KEY(name));" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3c.annotation.NCBI36.level_1_2.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3c.annotation.NCBI36.level_3.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3.tRNAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3.polyAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
echo "LOAD DATA LOCAL INFILE 'gencode.v3.2wayconspseudos.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18
select count(*) from wgEncodeGencodeClassesV3;|   164972 | 
select count(*) from wgEncodeGencodeAutoV3;   |    44510 | +
select count(*) from wgEncodeGencodeManualV3; |    87627 | + 
select count(*) from wgEncodeGencodePolyaV3;  |    23361 | +
select count(*) from wgEncodeGencode2WayV3;   |     9474 | = 164972


show tables like "wgEncodeGencode%V3%";
+--------------------------------------+
| Tables_in_hg18 (wgEncodeGencode%V3%) |
+--------------------------------------+
| wgEncodeGencode2WayV3                | <== New v3c
| wgEncodeGencodeAutoV3                | <== New v3c
| wgEncodeGencodeAutoV3_Trnas          | <== Save for building AutoV3 
| wgEncodeGencodeAutoV3_noTrnas        | <== Save for building AutoV3  
| wgEncodeGencodeAutoV3b               |     Old V3b
| wgEncodeGencodeAutoV3b_Trnas         |     Old V3b for building AutoV3b
| wgEncodeGencodeAutoV3b_noTrnas       |     Old V3b for building AutoV3b
| wgEncodeGencodeClassesV3             | <== New v3c 
| wgEncodeGencodeClassesV3b            |     Old V3b
| wgEncodeGencodeManualV3              | <== New v3c 
| wgEncodeGencodeManualV3b             |     Old V3b
| wgEncodeGencodePolyaV3               | <== New v3c 
| wgEncodeGencodePolyaV3b              |     Old V3b
+--------------------------------------+
6 rows in set (0.09 sec)

select count(*) from wgEncodeGencodeClassesV3 where class = "Undefined";                       |        0 |
select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_coding";                |        0 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_processed";             |        0 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_processed_pseudogene";  |     3028 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_unprocessed_pseudogene";|       67 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_pseudogene";            |       74 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_coding";                |    36639 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_non_coding";            |    37583 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_nonsense";              |     4688 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_polyA";                 |    23361 |
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_processed_pseudogene";  |     3261 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_unprocessed_pseudogene";|     1344 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_pseudogene";            |      895 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_TEC";                   |       48 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_coding";                |    32289 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_non_coding";            |      395 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_processed_pseudogene";  |      138 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_unprocessed_pseudogene";|        9 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_pseudogene";            |     1852 | 
select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_RNA";                   |     9825 | 

select count(*) from wgEncodeGencodeClassesV3 where class = "";|     9476 | # Yipes.  However, this is a familiar number
select distinct class,level,transcriptType from wgEncodeGencodeClassesV3 order by class,level,transcriptType;
|                                  |     3 | non_coding                         | 
|                                  |     3 | pseudogene                         | 

select count(*) from wgEncodeGencodeClassesV3 where transcriptType = "non_coding" and class = "";  |        2 | 
| ENSG00000225880 | ENST00000426669 | non_coding     |     3 |       | .               | .         | 
| ENSG00000178796 | ENST00000431954 | non_coding     |     3 |       | .               | .         |
update wgEncodeGencodeClassesV3 set class = "Ensembl_non_coding" where transcriptType = "non_coding" and class = ""; 

### Note back to Felix:
update wgEncodeGencodeClassesV3 set class = "Ensembl_non_coding" where transcriptType = "non_coding" and class = ""; 
### Note back to Felix:
select count(*) from wgEncodeGencodeClassesV3 where ottGeneId = '.';                          |    54613 | 
select count(*) from wgEncodeGencodeClassesV3 where ottGeneId = '.' and ottTranscriptId = '.';|    54613 |  
update wgEncodeGencodeClassesV3 set ottGeneId = NULL,ottTranscriptId = NULL where ottGeneId = '.' and ottTranscriptId = '.'; 
# Rows matched: 54613  Changed: 54613  Warnings: 0


select count(*) from wgEncodeGencodeClassesV3 where transcriptType = "pseudogene" and class = "";  |     9474 | # familiar number needs new class?
select count(*) from wgEncodeGencodeClassesV3 where class = "";  |     9474 | # familiar number needs new class?
update wgEncodeGencodeClassesV3 set class = "2way_consensus_pseudogene" where transcriptType = "pseudogene" and class = "";
## Rows matched: 9474  Changed: 0  Warnings: 9474
select count(*) from wgEncodeGencodeClassesV3 where class = "2way_consensus_pseudogene"; | 0 | # Must change class enum definition first!!!

rename table wgEncodeGencodeClassesV3 to wgEncodeGencodeClassesV3_tmp;
echo "CREATE TABLE wgEncodeGencodeClassesV3 (
    geneId varchar(255) not null,
    name varchar(255) not null,
    transcriptType varchar(255) not null,
    level integer,
    class enum ('Undefined',
      'Validated_coding','Validated_processed','Validated_processed_pseudogene',
      'Validated_unprocessed_pseudogene','Validated_pseudogene',
      'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA',
      'Havana_processed_pseudogene','Havana_unprocessed_pseudogene',
      'Havana_pseudogene','Havana_TEC',
      'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene',
      'Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA',
      '2way_consensus_pseudogene')
     not null default 'Undefined',
    ottTranscriptId varchar(255),
    ottGeneId varchar(255),
    # indexes
    INDEX(transcriptType),
    INDEX(level,class),
    PRIMARY KEY(name));" | hgsql hg18
insert into wgEncodeGencodeClassesV3 select * from wgEncodeGencodeClassesV3_tmp;
# Records: 164972  Duplicates: 0  Warnings: 0

### Note back to Felix:
update wgEncodeGencodeClassesV3 set class = "2way_consensus_pseudogene" where transcriptType = "pseudogene" and class = "";
# Rows matched: 9474  Changed: 9474  Warnings: 0

drop table wgEncodeGencodeClassesV3_tmp;
 
select distinct class,level,transcriptType from wgEncodeGencodeClassesV3 order by class,level,transcriptType;
| 2way_consensus_pseudogene        |     3 | pseudogene                         | 
| Validated_processed_pseudogene   |     1 | processed_pseudogene               | 
| Validated_processed_pseudogene   |     1 | transcribed_processed_pseudogene   | 
| Validated_unprocessed_pseudogene |     1 | transcribed_unprocessed_pseudogene | 
| Validated_unprocessed_pseudogene |     1 | unprocessed_pseudogene             | 
| Validated_pseudogene             |     1 | IG_pseudogene                      | 
| Validated_pseudogene             |     1 | pseudogene                         | 
| Validated_pseudogene             |     1 | unitary_pseudogene                 | 
| Havana_coding                    |     2 | IG_C_gene                          | 
| Havana_coding                    |     2 | IG_D_gene                          | 
| Havana_coding                    |     2 | IG_J_gene                          | 
| Havana_coding                    |     2 | IG_V_gene                          | 
| Havana_coding                    |     2 | protein_coding                     | 
| Havana_non_coding                |     2 | ambiguous_orf                      | 
| Havana_non_coding                |     2 | antisense                          | 
| Havana_non_coding                |     2 | non_coding                         | 
| Havana_non_coding                |     2 | processed_transcript               | 
| Havana_non_coding                |     2 | retained_intron                    | 
| Havana_nonsense                  |     2 | nonsense_mediated_decay            | 
| Havana_polyA                     |     2 | polyA_signal                       | 
| Havana_polyA                     |     2 | pseudo_polyA                       | 
| Havana_processed_pseudogene      |     2 | processed_pseudogene               | 
| Havana_processed_pseudogene      |     2 | transcribed_processed_pseudogene   | 
| Havana_unprocessed_pseudogene    |     2 | transcribed_unprocessed_pseudogene | 
| Havana_unprocessed_pseudogene    |     2 | unprocessed_pseudogene             | 
| Havana_pseudogene                |     2 | IG_pseudogene                      | 
| Havana_pseudogene                |     2 | polymorphic_pseudogene             | 
| Havana_pseudogene                |     2 | pseudogene                         | 
| Havana_pseudogene                |     2 | retrotransposed                    | 
| Havana_pseudogene                |     2 | TR_pseudogene                      | 
| Havana_pseudogene                |     2 | unitary_pseudogene                 | 
| Havana_TEC                       |     2 | artifact                           | 
| Havana_TEC                       |     2 | TEC                                | 
| Ensembl_coding                   |     3 | IG_C_gene                          | 
| Ensembl_coding                   |     3 | IG_D_gene                          | 
| Ensembl_coding                   |     3 | IG_J_gene                          | 
| Ensembl_coding                   |     3 | IG_V_gene                          | 
| Ensembl_coding                   |     3 | protein_coding                     | 
| Ensembl_non_coding               |     3 | antisense                          | 
| Ensembl_non_coding               |     3 | non_coding                         | 
| Ensembl_non_coding               |     3 | processed_transcript               | 
| Ensembl_non_coding               |     3 | retained_intron                    | 
| Ensembl_processed_pseudogene     |     3 | processed_pseudogene               | 
| Ensembl_unprocessed_pseudogene   |     3 | unprocessed_pseudogene             | 
| Ensembl_pseudogene               |     3 | IG_pseudogene                      | 
| Ensembl_pseudogene               |     3 | miRNA_pseudogene                   | 
| Ensembl_pseudogene               |     3 | misc_RNA_pseudogene                | 
| Ensembl_pseudogene               |     3 | pseudogene                         | 
| Ensembl_pseudogene               |     3 | retrotransposed                    | 
| Ensembl_pseudogene               |     3 | unitary_pseudogene                 | 
| Ensembl_RNA                      |     3 | miRNA                              | 
| Ensembl_RNA                      |     3 | misc_RNA                           | 
| Ensembl_RNA                      |     3 | Mt_rRNA                            | 
| Ensembl_RNA                      |     3 | Mt_tRNA                            | 
| Ensembl_RNA                      |     3 | Mt_tRNA_pseudogene                 | 
| Ensembl_RNA                      |     3 | rRNA                               | 
| Ensembl_RNA                      |     3 | rRNA_pseudogene                    | 
| Ensembl_RNA                      |     3 | scRNA_pseudogene                   | 
| Ensembl_RNA                      |     3 | snoRNA                             | 
| Ensembl_RNA                      |     3 | snoRNA_pseudogene                  | 
| Ensembl_RNA                      |     3 | snRNA                              | 
| Ensembl_RNA                      |     3 | snRNA_pseudogene                   | 
| Ensembl_RNA                      |     3 | tRNAscan                           | 
| Ensembl_RNA                      |     3 | tRNA_pseudogene                    | 
| 2way_consensus_pseudogene        |     3 | pseudogene                         | 

select distinct class,level,transcriptType from wgEncodeGencode2WayV3 order by class,level,transcriptType;

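# Oops, ottTranscriptId and ottGeneId appear to be reversed!
# The .classes data has the OTT gene ID (OTTHUMG...) in column 6 and the OTT transcript ID (OTTHUMT...) in column 7,
# so recreate the table with those two columns swapped; "select *" copies by position, so each value lands under the right name.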

rename table wgEncodeGencodeClassesV3 to wgEncodeGencodeClassesV3_tmp;
echo "CREATE TABLE wgEncodeGencodeClassesV3 (
    geneId varchar(255) not null,
    name varchar(255) not null,
    transcriptType varchar(255) not null,
    level integer,
    class enum ('Undefined',
      'Validated_coding','Validated_processed','Validated_processed_pseudogene',
      'Validated_unprocessed_pseudogene','Validated_pseudogene',
      'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA',
      'Havana_processed_pseudogene','Havana_unprocessed_pseudogene',
      'Havana_pseudogene','Havana_TEC',
      'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene',
      'Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA',
      '2way_consensus_pseudogene')
     not null default 'Undefined',
    ottGeneId varchar(255),
    ottTranscriptId varchar(255),
    # indexes
    INDEX(transcriptType),
    INDEX(level,class),
    PRIMARY KEY(name));" | hgsql hg18
insert into wgEncodeGencodeClassesV3 select * from wgEncodeGencodeClassesV3_tmp;
# Records: 164972  Duplicates: 0  Warnings: 0

drop table wgEncodeGencodeClassesV3_tmp;

## Now edit trackDb.wgEncode.ra making a new 2way subtrack

# Spend time massaging metadata
# set up search specs for hgFindSpec

/cluster/data/encode/pipeline/bin/encodeStatus.pl 453 displayed

# change metadata to "Gencode October 2009 Freeze"

# Search spec OTT ID??? 
# in trackDb.wgEncode.ra...
# (added Angie's hgFindSpec wiki entry) needed xrefQuery to wgEncodeGencode Classes

# Need to get downloads dir in order
cd {downloadsDir}
mkdir release2/
cp index.html release2/
cp fileDb.ra release2/
mv wgEncodeGencode*Rel2*.gz release2/

# wgEncodeSangerGencodeGencode versions were made by pipeline
mv wgEncodeSangerGencodeGencodeManualV3.gtf.gz wgEncodeGencodeManualV3.gtf.gz
mv wgEncodeSangerGencodeGencodeAutoV3.gtf.gz wgEncodeGencodeAutoV3.gtf.gz 
# manually make the rest

pushd {pip}/453
cat gencode.v3c.annotation.NCBI36.level_1_2.classes >> gencode.v3c.NCBI36.all.classes
cat gencode.v3c.annotation.NCBI36.level_3.classes >> gencode.v3c.NCBI36.all.classes
cat gencode.v3.tRNAs.NCBI36.classes >> gencode.v3c.NCBI36.all.classes
cat gencode.v3.2wayconspseudos.NCBI36.classes >> gencode.v3c.NCBI36.all.classes
cat gencode.v3.polyAs.NCBI36.classes >> gencode.v3c.NCBI36.all.classes
wl gencode.v3c.NCBI36.all.classes 164972
pop

gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.2wayconspseudos.NCBI36.gtf > wgEncodeGencode2wayV3.gtf.gz
gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.polyAs.NCBI36.gtf > wgEncodeGencodePolyaV3.gtf.gz
gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.tRNAs.NCBI36.gtf > wgEncodeGencodeTrnasV3.gtf.gz
gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3c.NCBI36.all.classes > wgEncodeGencodeClassesV3.tab.gz

#edit fileDb.ra
encodeDownloadsPage.pl index.html
/cluster/data/encode/pipeline/bin/encodeStatus.pl 453 displayed


# Rachel says that pseudogenes need links out to Yale.  Oh boy.  That will wait til another day!


create table trackDb_qateam select * from trackDb_tdreszer;
create table hgFindSpec_qateam select * from hgFindSpec_tdreszer;
