#!/usr/bin/env python

"""Module Description

ChIPAssoc: ChIP-Seq Association Analysis. 

Given a set of genomic coordinates (e.g. ChIP-Seq peaks of a transcription
factor) and multiple gene sets, ChIPAssoc estimates the degree to which each
gene set is associated with the genomic coordinates by conducting a KS test
on the distances from TSSs to the nearest coordinates (TF binding sites).

Copyright (c) 2011 H. Gene Shin <ulexis99@gmail.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included with
the distribution).

@status:  experimental
@version: $Revision$
@author:  H. Gene Shin
@contact: ulexis99@gmail.com
"""

# import python modules.
import sys, os, re, subprocess
import CEAS.inout
import logging
import operator
import itertools
import bisect
from optparse import OptionParser

# import my CEAS modules.
import CEAS.inout as inout
import CEAS.corelib as cl

# Logging setup: messages at INFO level (numeric value 20) and above are
# written to stderr, prefixed with the level name and a timestamp.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format='%(levelname)-5s @ %(asctime)s: %(message)s ',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filemode="w",
)

# Short aliases for easy programming. Note that error() logs at CRITICAL level.
error, warn, debug, info = (
    logging.critical,
    logging.warning,
    logging.debug,
    logging.info,
)

# functions
def prepare_optparser ():
    """Build the command-line option parser for ChIPAssoc.

    New options should be added in this function first.

    Return:
    optparser: an optparse.OptionParser with the options -b/--bed, -d/--db,
    -g/--gset (repeatable), -n/--min, -x/--max, --name, -l/--lab (repeatable),
    -r/--rbg and --gname2, plus an explicit -h/--help.
    """

    # The gene annotation table is given with -d (not -g, which is the gene set).
    usage = "usage: %prog <-d gdb -b bed -g geneset> [options]"
    description = "ChIPAssoc -- Gene Set Association Analysis. Note that if the number of genomic coordinates is below 1000 or the number of a gene set is below 500, the p value of ChIPAssoc may not represent biological implications well enough."

    # add_help_option=False so that the help text of -h can be customized below.
    optparser = OptionParser(version="%prog 0.7", description=description, usage=usage, add_help_option=False)
    optparser.add_option("-h", "--help", action="help", help="Show this help message and exit.")
    optparser.add_option("-b", "--bed", dest="bed", type="string",
                         help="BED file of genomic coordinates (e.g. ChIP-Seq peaks). The center of each peak will be used to compute the distance from a gene.")
    optparser.add_option("-d", "--db", dest="gdb", type="string",
                         help="Gene annotation table. This can be a sqlite3 local db file, BED file or genome version of UCSC. The BED file must have an extension of '.bed'")
    optparser.add_option("-g", "--gset", dest="gset", action="append", type="string", help="Gene set to see the association with the genomic coordinates given through -b. Multiple gene sets can be given by repeatedly using this option (e.g. -g geneset1.txt -g geneset2.txt -g geneset3.txt). Genes must be given in a TXT file each line of which has a refseq accession ID or an official gene symbol (i.e. a single-column TXT file).")
    optparser.add_option("-n", "--min", dest="min", type="float", help="The lower limit of the distance to consider in association analysis (in kb). Note that the lower and upper limits must be reasonably apart to obtain a meaningful result. By default 0kb.", default=0)
    optparser.add_option("-x", "--max", dest="max", type="float", help="The upper limit of the distance to consider in association analysis (in kb). Note that the lower and upper limits must be reasonably apart to obtain a meaningful result. By default 200kb.", default=200)
    optparser.add_option("--name", dest="name", help="Experiment name. This will be used to name the output file. If an experiment name is not given, input BED file name will be used instead.")
    optparser.add_option("-l", "--lab", dest="label", action="append", type="string", help="Label for each gene set. Likewise, multiple gene set labels can be given by repeatedly using this option (e.g. -l label1 -l label2 -l label3). If labels are not given, 'gene set' will be used by default.", default=None)
    optparser.add_option("-r", "--rbg", dest="rbg", type="string", help="Background gene set. If a set of genes is given using this option, the gene set will be used as background or null set when running KS test. Otherwise, all refseq genes will be used as background. Background genes must be given in a TXT file each line of which has a refseq accession ID or an offical gene symbol  (i.e. a single-column TXT file).", default=None)
    optparser.add_option("--gname2", dest="name2", action="store_true",
                         help="If this switch is on, gene or transcript IDs in files given through -g will be considered as official gene symbols.", default=False)
    return optparser


def opt_validate (optparser):
    """Parse the command line and validate the resulting options.

    Parameters:
    1. optparser: the OptionParser returned by prepare_optparser()

    Return:
    The validated options object. In addition to the parsed options, the
    attributes Host, User, Db, lo and up are set on it, and 'label' and
    'name' are filled with defaults when not given.

    On any invalid option, a message is logged and the program exits with
    status 1 (sys.exit).
    """
    (options, args) = optparser.parse_args()

    # the gene annotation table, the peak BED file and at least one gene set
    # are all mandatory; print help otherwise.
    if not options.gdb or not options.bed or not options.gset:
        error("-d (--db) and -b (--bed) and -g (--gset) must be given files.")
        optparser.print_help()
        sys.exit(1)

    # check if gene annotation file exists.
    if not os.path.isfile(options.gdb):
        error("Check -d or --db. No such file as '%s'" %options.gdb)
        sys.exit(1)
    # a local file is used, so no remote host/user is needed.
    options.Host = None
    options.User = None
    options.Db = options.gdb

    # check if genomic coordinates (peak file) exist.
    if not os.path.isfile(options.bed):
        error("Check -b or --bed. No such file as %s" %options.bed)
        sys.exit(1)

    # check if gene set files exist.
    if isinstance(options.gset, str):
        options.gset = [options.gset]

    for gset in options.gset:
        if not os.path.isfile(gset):
            error("Check -g or --gset. No such file as %s" %gset)
            sys.exit(1)

    # check if gene set labels are given.
    if options.label:
        if len(options.label) != len(options.gset):
            error("The number of the gene set labels (-l or --lab) must be the same as that of the gene sets (-g or --gset).")
            sys.exit(1)
    else:
        # build a real list (not a lazy map object) because len() is taken
        # on the labels later when the R script is written.
        options.label = ["gene set" + str(i) for i in range(1, len(options.gset) + 1)]

    # if name is not given, use the stem of the bed file name.
    if not options.name:
        options.name = os.path.split(options.bed)[-1].rsplit('.bed', 2)[0]

    # the annotation search window (in kb) is fixed to 0 - 1e9; the -n and -x
    # limits are stored separately in options.min and options.max.
    options.lo = 0
    options.up = 1e9

    if options.min < 0 or options.max < 0:
        error("The lower and upper limits (-n and -x) of distance must be positive numbers!")
        sys.exit(1)
    elif options.min >= options.max:
        error("The upper limit (-x) must be larger than the lower limit (-n)!")
        sys.exit(1)
    # NOTE(review): a span of exactly 1 kb is rejected here although the
    # message calls 1 kb the minimum resolution -- confirm the intent.
    elif options.max - options.min <= 1:
        error("The minimum distance resolution (the difference between the upper (-x) and lower (-n) limits) for the analysis is 1kb. Adjust -n and -x accordingly.")
        sys.exit(1)

    # when a background gene set is given.
    if options.rbg:
        if not os.path.isfile(options.rbg):
            error("Check -r or --rbg. No such file as %s" %options.rbg)
            sys.exit(1)

    return options


def read_a_gene_set( fn ):
    """Read a gene set file and return the gene IDs in file order.

    Parameters:
    1. fn: name of a text file with one gene per line. Only the first
       tab-separated field of each line is used.

    Return:
    genes: a list of gene IDs (duplicates are kept). Blank or
    whitespace-only lines and lines starting with '#' are skipped.
    """

    genes = []
    # 'with' guarantees that the file is closed even if reading fails.
    with open(fn, 'r') as handle:
        for line in handle:
            stripped = line.strip()
            # strip first so that lines such as "   \n" or "\r\n" do not
            # end up as empty gene names.
            if not stripped or stripped.startswith("#"):
                continue
            genes.append( stripped.split("\t")[0].strip() )

    return genes


def read_gene_sets( fns ):
    """Read several gene set files.

    Parameters:
    1. fns: a list of gene set file names

    Return:
    A list with one entry per file; each entry is the list of unique gene
    IDs read from that file by read_a_gene_set (order is that of a set).
    """

    # de-duplicate each file's genes through a set
    return [ list( set( read_a_gene_set( name ) ) ) for name in fns ]


def filter_chroms(chroms, regex):
    """Drop the chromosome names that match a regular expression.

    Parameters:
    1. chroms: chromosome names
    2. regex: regular expression (raw string); a name is removed when
       re.search finds the pattern anywhere in it

    Return:
    A new list with the names that did not match, in the original order.

    """
    pattern_found = lambda name: re.search(regex, name)
    return [name for name in chroms if not pattern_found(name)]
    
    
def convert_BED2GeneTable(Bed):
    """Convert a BED-formatted gene annotation table into a GeneTable object.

    For every chromosome of Bed, the BED columns are copied into the gene
    table columns that sit at the same position in the two tuples below.
    The Bed must strictly follow the BED format for the conversion to work.
    """

    # BED column names, paired by position with the gene table columns
    bed_columns = ('name', 'strand', 'start','end','thickStart','thickEnd', 'blockCount','blockSizes','blockStarts')

    # start from an empty GeneTable and set its column names
    table = inout.GeneTable()
    table.reset()
    table.columns = ('name','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds')

    for chrom in Bed.get_chroms():
        table[chrom] = {}
        for gene_col, bed_col in zip(table.columns, bed_columns):
            table[chrom][gene_col] = Bed[chrom][bed_col]

    return table


def argmin_window( array, lo, up):
    """Return the minimum and its index among the elements within [lo, up).

    Only elements with lo <= value < up are considered. The returned index
    refers to the original array. If no element falls within the window,
    (None, None) is returned.

    e.g.
    mn, ix_min = argmin_window( array, lo=0, up=1e12 )

    """

    # positions of the elements that lie inside the window
    ix_win = [ pos for pos, value in enumerate( array ) if lo <= value < up ]

    # nothing within the range
    if not ix_win:
        return None, None

    # cl.argmin gives a position within the window; map it back to array
    array_win = [ array[pos] for pos in ix_win ]
    ix_min = ix_win[ cl.argmin( array_win ) ]

    return array[ix_min], ix_min


def argmax_window( array, lo, up ):
    """Return the maximum and its index among the elements within [lo, up).

    Only elements with lo <= value < up are considered. The returned index
    refers to the original array. If no element falls within the window,
    (None, None) is returned.

    e.g.
    mx, ix_max = argmax_window( array, lo=0, up=1e12 )

    """

    # positions of the elements that lie inside the window
    ix_win = [ pos for pos, value in enumerate( array ) if lo <= value < up ]

    # nothing within the range
    if not ix_win:
        return None, None

    # cl.argmax gives a position within the window; map it back to array
    array_win = [ array[pos] for pos in ix_win ]
    ix_max = ix_win[ cl.argmax( array_win ) ]

    return array[ix_max], ix_max


def replace_None_with_NA( a ):
    """Return a copy of the list a in which every None is replaced with "NA".

    All other elements are kept as they are and in the same order. The
    identity test 'is None' is used so that elements with a custom __eq__
    are never mistaken for None.
    """
    return [ "NA" if item is None else item for item in a ]
        
def _r_escape(text):
    """Escape backslashes and double quotes for use inside an R "..." string."""
    return text.replace("\\", "\\\\").replace('"', '\\"')


def write_R(fn, bg, dist_genesets, label, bedfn, genesetfns, bgfn, min_dist, max_dist):
    """Build the R script that draws the cumulative distance distributions and runs the KS tests.

    Nothing is written to disk here; the script text is returned.

    Input;
    fn: output R file name. It is only used to derive the PDF file name
        (a trailing '.R' is replaced with '.pdf'; otherwise '.pdf' is appended).
    bg: a list of background distances (all genes for example)
    dist_genesets: a nested list of distances of multiple gene sets.
    label: gene set label. The number of elements in label must be the same as that of dist_genesets.
    bedfn, genesetfns, bgfn: input names, only echoed in the header comment.
    min_dist: the lower limit of the distance to consider in association analysis (kb).
    max_dist: the upper limit of the distance to consider in association analysis (kb).

    Return:
    rscript: the R script as a single string.

    If the number of labels differs from the number of gene sets, an error
    is logged and the program exits with status 1.
    """
    #internal variables
    by = 1000 # binsize is 1kb.
    height = 6
    width = 6
    xlab = "Distance to the nearest site (kb)"
    ylab = "Cumulative %"

    # check the number of gene sets.
    n = len(dist_genesets)
    if n != len(label):
        error("The number of gene sets and that of labels must be the same!!!")
        sys.exit(1)

    # kb -> bp. round() before int() because plain truncation turns e.g.
    # 1.005 * 1000 (= 1004.9999999999999 in floating point) into 1004.
    lo_bp = int(round(min_dist * 1000))
    hi_bp = int(round(max_dist * 1000))

    # labels and the PDF name end up inside R string literals.
    labels = [_r_escape(lab) for lab in label]
    if fn.endswith(".R"):
        pdf_fn = fn[:-2] + ".pdf"
    else:
        pdf_fn = fn + ".pdf"
    pdf_fn = _r_escape(pdf_fn)

    # the script is collected piece by piece and joined once at the end.
    script = []
    emit = script.append

    # header
    emit("\n".join(("# Gene Set Association Analysis",
                    "# Bed: %s" %(bedfn),
                    "# Gene sets: %s" %(",".join(genesetfns)),
                    "# Background set: %s" %(bgfn))))
    emit("\n")
    emit("\n")

    # make a color palette with one color per gene set.
    emit("# make a color palete.\n")
    emit('cr <- colorRampPalette(col=c("#C8524D", "#BDD791", "#447CBE", "#775A9C"), bias=1)\n')
    emit("col <- cr(%d)\n" %n)
    emit("\n")

    # make a R list of distances. str(list)[1:-1] drops the surrounding
    # brackets; 'NA' strings become NA through as.numeric in R.
    emit("# make a list of distances of gene sets.\n")
    emit("dist <- list()\n")
    for lab, dists in zip(labels, dist_genesets):
        emit("d <- as.numeric(c(%s))\n" %(str(dists)[1:-1]))
        emit('dist[["%s"]] <- d\n' %(lab))
    emit("bg <- as.numeric(c(%s))\n" %(str(bg)[1:-1]))
    emit("bg[bg == 'NA'] <- NA\n")
    emit("\n")

    # R line that restricts d to the [min_dist, max_dist] window.
    window = "d <- d[d >= %d & d <= %d]\n" %(lo_bp, hi_bp)

    # binning get the histograms.
    emit("# Get the histograms for cumulative distributions.\n")
    emit("breaks <- seq(%d, %d+%d, by=%d)\n" %(lo_bp, hi_bp, by, by))
    emit("hs <- list()\n")
    for lab in labels:
        emit('d <- as.vector(na.omit(dist[["%s"]]))\n' %(lab))
        emit(window)
        emit("h <- hist(d, breaks=breaks, plot=FALSE)\n")
        emit('hs[["%s"]] <- h\n' %(lab))
        emit("\n")
    emit("dbg <- as.vector(na.omit(bg))\n")
    emit("dbg <- dbg[dbg >= %d & dbg <= %d]\n" %(lo_bp, hi_bp))
    emit("hbg <- hist(dbg, breaks=breaks, plot=FALSE)\n")
    emit("\n")

    # all histograms share the same breaks, so the mids of the background
    # histogram are used; hbg always exists, even without any gene set.
    emit("x <- hbg$mids / 1000 \n")
    emit("chs <- list()\n")
    for lab in labels:
        emit('c <- 100 * cumsum(hs[["%s"]]$count) / sum(hs[["%s"]]$count)\n' %(lab, lab))
        emit('chs[["%s"]] <- c\n' %(lab))
        emit("\n")
    emit("cbg <- 100* cumsum(hbg$count) / sum(hbg$count)\n")
    emit("\n")

    # perform KS test of each gene set against the background.
    emit("# perform KS test.\n")
    emit("p <- list()\n")
    for lab in labels:
        emit('d <- as.vector(na.omit(dist[["%s"]]))\n' %(lab))
        emit(window)
        emit('ks <- ks.test(d, dbg, alternative="greater")\n')
        emit('p[["%s"]] <- ks$p.value\n' %(lab))
        emit("\n")

    # draw cumulative distributions.
    emit("# draw the cumulative distributions.\n")
    emit('pdf("%s", height=%d, width=%d)\n' %(pdf_fn, height, width))
    emit('plot(x, cbg, type="l", lty=2, lwd=2, col="black", xlab="%s", ylab="%s", main="ChIP Association Analysis", xlim=c(%d, %d), ylim=c(0, 100))\n' %(xlab, ylab, int(min_dist), int(max_dist)))
    emit("l <- c()\n")
    for i, lab in enumerate(labels):
        # R vectors are 1-based, hence col[i+1].
        emit('lines(x, chs[["%s"]], lty=1, lwd=2, col=col[%d])\n' %(lab, i+1))
        emit('l <- c(l, paste("%s (", format(p[["%s"]], digits=3), ")", sep=""))\n' %(lab, lab))
    emit("legend('bottomright', legend=l, col=col, pch=15)\n")
    emit("dev.off()\n")

    return "".join(script)
    

# gene center annotator
class GeneAnnotator:
    """Class GeneAnnotator performs gene-centered annotation given a list of ChIP regions. This class is different from the one in CEAS.annotator.

       This class only gives the distance from each gene's TSS to its nearest binding site.
       The result is accumulated in self.map (an inout.DataFrame), one row per gene.

    """

    def __init__(self):

        self.map = inout.DataFrame()

    def annotate(self, GeneT, ChIP, lo=0, up=1e9):
        """Perform gene-centered annotation

        For every gene in GeneT, the binding site in ChIP (same chromosome) whose
        center is nearest to the gene's TSS is searched, and one row is appended
        to self.map with the columns refseq, genesymbol, txStart, txEnd, strand,
        chrom, start, end, name, dist and dir. "NA" is stored where no binding
        site is found. 'dist' is stored as an absolute value.

        Parameters:
        1. GeneT: gene annotation table (sorted in place)
        2. ChIP: binding sites (sorted in place)
        3. lo, up: the lower and upper bounds for the search range, respectively, in kb.
        """

        # initialize the DataFrame; each column gets its own empty list.
        for colname in ('refseq', 'genesymbol', 'txStart', 'txEnd', 'strand',
                        'chrom', 'start', 'end', 'name', 'dist', 'dir'):
            self.map.append_column([], colname = colname)

        # get the chroms of the gene annotation table
        GeneT.sort()
        ChIP.sort()
        chroms = GeneT.get_chroms()
        chroms_ChIP = ChIP.get_chroms()
        chroms.sort()

        # iterate through the chromosomes; n is the next automatic peak name.
        n = 1
        for chrom in chroms:

            # gene
            txStart = GeneT[chrom]['txStart']
            txEnd = GeneT[chrom]['txEnd']
            strand = GeneT[chrom]['strand']
            name = GeneT[chrom]['name']

            # fall back to 'name' when the table has no gene symbols.
            try:
                name2 = GeneT[chrom]['name2']
            except KeyError:
                name2 = GeneT[chrom]['name']

            # if this chromosome is not in the ChIP list, all its genes get "NA".
            if chrom not in chroms_ChIP:
                for nm, nm2, S, E, ss in zip( name, name2, txStart, txEnd, strand ):
                    self.map.append_row( [nm, nm2, S, E, ss, chrom, "NA", "NA", "NA", "NA", "NA"] )
                continue

            # get the ChIP start and end then the center of the regions
            # NOTE(review): '/' floors under Python 2 only if the coordinates
            # are ints -- confirm against inout.Bed.
            ChIP_start = ChIP[chrom]['start']
            ChIP_end = ChIP[chrom]['end']
            ChIP_center = [ (s + e) / 2 for s, e in zip( ChIP_start, ChIP_end ) ]

            # get how many binding sites there are.
            n_ChIP = len( ChIP_start )

            # name: when the BED has no names, number the sites consecutively
            # (exactly one name per site) across chromosomes.
            try:
                ChIP_name = ChIP[chrom]['name']
            except KeyError:
                ChIP_name = [ str(i) for i in range(n, n + n_ChIP) ]
                n = n + n_ChIP

            # get the TSS considering the strand
            genes = self.extract_txStarts( txStart, txEnd, strand, name, name2, sort=False )

            # get the distance matrix
            matrix = self.make_dist_matrix( genes, ChIP_center )

            # get the shortest distance from each gene to the peaks. Warning: shortest has signs
            shortest, ix_shortest = self.return_shortest_dist( matrix, genes, lo=lo, up=up )

            # determine up or down stream from the sign of the distance
            updown = self.determine_updown( shortest )

            # get the information of the nearest binding sites
            Cstart, Cend, Cname = self.get_ChIP_info( ix_shortest, ChIP_start, ChIP_end, ChIP_name )

            # update the table: add rows
            for nm, nm2, S, E, ss, ChIP_s, ChIP_e, ChIP_n, shrt, ud in zip( name, name2, txStart, txEnd, strand, Cstart, Cend, Cname, shortest, updown ):
                if shrt is None:
                    self.map.append_row( [nm, nm2, S, E, ss, chrom, ChIP_s, ChIP_e, ChIP_n, "NA", ud] )
                else:
                    self.map.append_row( [nm, nm2, S, E, ss, chrom, ChIP_s, ChIP_e, ChIP_n, abs(shrt), ud] )


    def extract_txStarts(self, txS, txE, strand, name, name2, sort=True):
        """Extract txStarts given 'txStart', 'txEnd' and 'strand' of a gene annotation table.

        Parameters:
        1. txS: 'txStart'
        2. txE: 'txEnd'
        3. strand: 'strand'
        4. name: 'name'
        5. name2: 'name2' - gene symbols
        6. sort: True=sort by TSS, False=just extract and return

        Return:
        a list of tuples, [( TSS, other end, strand, name, name2 ), ...].
        For '+' genes TSS is txStart; for all other strands it is txEnd.
        """

        genes = []
        for s, e, st, n, n2 in zip( txS, txE, strand, name, name2 ):
            if st == '+':
                genes.append( (s, e, st, n, n2) )
            else:
                genes.append( (e, s, st, n, n2) )

        if sort:
            genes = sorted( genes, key=operator.itemgetter(0) )

        return genes


    def make_dist_matrix(self, genes, ChIP_center ):
        """make a distance matrix of ChIP_centers and genes. This matrix is going to be used for the shortest distance within a given regional window.

        Input:
        1. genes: gene annotation along with name, as returned by extract_txStarts.
        2. ChIP_center: centers of the binding sites.

        Return:
        matrix: a dictionary, {name: [name2, [center - TSS for each center]]}.
        The dictionary is keyed by 'name', so genes sharing the same name
        overwrite each other and only the last one is kept.

            e.g.
            matrix = self.make_dist_matrix( genes, ChIP_center )

        """

        matrix = {}
        for gene in genes:
            tss, nm, nm2 = gene[0], gene[3], gene[4]
            # signed distance: negative when the center lies below the TSS coordinate
            matrix[ nm ] = [ nm2, [ center - tss for center in ChIP_center ] ]

        return matrix


    def return_shortest_dist( self, matrix, genes, lo=0, up=1e9 ):
        """Return the shortest (signed) distance of each gene considering the window.

           For each gene, the nearest site with a negative distance within
           [-up, -lo) and the nearest site with a non-negative distance within
           [lo, up) are searched; of the two, the one with the smaller absolute
           distance is returned.

           shortest, ix_shortest = self.return_shortest_dist( matrix, genes, lo=0, up=1e9)

           Return:
           shortest: signed distances, None where no site is within the window.
           ix_shortest: indices of the corresponding binding sites, or None.

           Note that lo and up are in kb.
        """

        shortest = []
        ix_shortest = []

        for gene in genes:
            dists = matrix[ gene[3] ][1]

            # sites with a negative distance: the largest value is the nearest
            mn_do, ix_do = argmax_window( dists, lo= -1 * up * 1000, up = -1 * lo * 1000 )

            # sites with a non-negative distance: the smallest value is the nearest
            mn_up, ix_up = argmin_window( dists, lo = lo * 1000, up = up * 1000 )

            # 'is None' (not truthiness) so that a distance of exactly 0,
            # i.e. a site centered on the TSS, is not taken for a missing site.
            if mn_do is None and mn_up is None:
                shortest.append( None )
                ix_shortest.append( None )
            elif mn_do is None:
                shortest.append( mn_up )
                ix_shortest.append( ix_up )
            elif mn_up is None:
                shortest.append( mn_do )
                ix_shortest.append( ix_do )
            else:
                m = [mn_do, mn_up]
                ix = [ix_do, ix_up]
                i = cl.argmin( [ abs(x) for x in m ] )

                shortest.append( m[i] )
                ix_shortest.append( ix[i] )

        return shortest, ix_shortest


    def get_ChIP_info( self, ix, ChIP_start, ChIP_end, ChIP_name ):
        """Return the start, end and name of the binding sites selected by the indices in ix.

           An index of None yields "NA" in all three returned lists.

           e.g.
           Cstart, Cend, Cname = self.get_ChIP_info( ix_shortest, ChIP_start, ChIP_end, ChIP_name )

        """
        Cname = []
        Cstart = []
        Cend = []

        for i in ix:
            if i is None:
                Cname.append( "NA" )
                Cstart.append( "NA" )
                Cend.append( "NA" )
            else:
                Cname.append( ChIP_name[i] )
                Cstart.append( ChIP_start[i] )
                Cend.append( ChIP_end[i] )

        return Cstart, Cend, Cname


    def determine_updown( self, shortest ):
        """Translate the sign of each shortest distance into a direction label.

        Negative distances give 'up', zero and positive distances give 'down'
        and None gives "NA".

        e.g.
        updown = self.determine_updown( shortest )
        """

        updown = []
        for s in shortest:
            if s is None:
                updown.append( "NA" )
            elif s < 0:
                updown.append( 'up' )
            else:
                updown.append( 'down' )

        return updown


    def write(self, fn, description=True):
        """Write the gene-centered annotation result in a TXT file with XLS extension

        Parameters:
        1. fn: file name. XLS extension will be added automatically.
        2. description: If True, add a header describing the columns
        """

        # if description is True, put comment (header)
        if description:

            comment = "\n".join(("# refseq: nearest gene name",
                                 "# genesymbol: near gene name (gene symbol)",
                                 "# txStart: the txStart of the nearest gene",
                                 "# txEnd: the txEnd of the nearest gene",
                                 "# strand: the stand where the gene is",
                                 "# chrom: chromosome",
                                 "# start: starting edge of each binding site",
                                 "# end: ending edge of each bindign site",
                                 "# name: peak name",
                                 "# dist: shortest distance from a binding site to the nearest gene",
                                 "# dir: up: the nearest gene is upstream of the binding site, down: the nearest gene is downstream of the binding site."))

            comment += "\n"
        else:
            comment = ""

        self.map.write(fn = fn + '.xls', comment = comment)


# ------------------------------------
# Main function
# ------------------------------------

def _index_genes(names, by_symbol):
    """Map gene identifiers to their row index in the annotation result.

    With by_symbol, keys are lower-cased and the first row of each symbol
    wins; otherwise keys are used as given and the last row wins.
    """
    index = {}
    for i, x in enumerate(names):
        if by_symbol:
            key = x.lower()
            if key not in index:
                index[key] = i
        else:
            index[x] = i
    return index


def _lookup_distances(genes, index, dist, by_symbol):
    """Return the distance of each gene in genes, or "NA" for unknown genes."""
    found = []
    for g in genes:
        key = g.lower() if by_symbol else g
        if key in index:
            found.append(dist[index[key]])
        else:
            found.append("NA")
    return found


def main():
    """Run ChIPAssoc: annotate genes with their nearest binding site, write
    the R script (<name>.R) for the KS test and plots, and try to run it with R.
    """

    # read the options and validate them
    options=opt_validate(prepare_optparser())
    jobcount=1

    # read the gene annotation table
    info("#%d read the gene annotation table..." %jobcount)
    GeneT = inout.GeneTable()
    try:
        GeneT.read(Host = options.Host, User= options.User, Db=options.Db, annotation='GeneTable',
                   columns=('name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds', 'name2'), which="", where=())
    except Exception as e:
        # a missing 'name2' column is reported as a usage error;
        # anything else is re-raised untouched.
        if re.search(r'column.*name2', str(e)):
            error("The gene annotation table does not have 'name2.'")
            sys.exit(1)
        else:
            raise

    # sort the genes by chromsome and TSS.
    GeneT.sort()
    # NOTE(review): the filtered chromosome list is computed but never used
    # below -- confirm whether random/haplotype chromosomes should be dropped.
    chroms_GeneT=GeneT.get_chroms()
    chroms_GeneT=filter_chroms(chroms_GeneT,'_[A-Za-z0-9]*')
    jobcount+=1

    # read the ChIP bed file.
    info("#%d read a ChIP region BED file..." %jobcount)
    ChIP = inout.Bed()
    ChIP.read(options.bed)
    jobcount+=1

    # read the gene sets.
    info("#%d read gene sets..." %jobcount)
    gene_sets = read_gene_sets( options.gset )
    if options.rbg:  # if bg set is given.
        bg_set = read_a_gene_set( options.rbg )
    jobcount += 1

    # perform gene-centered annotation.
    info("#%d perform gene-centered annotation..." %jobcount)
    GAnnotator=GeneAnnotator()
    GAnnotator.annotate( GeneT, ChIP, lo=options.lo, up=options.up )
    GAnnotator.map.set_name(options.name)
    jobcount+=1

    # select the distances of the genes.
    info("#%d get the distances of genes in the gene sets..." %jobcount)

    # genes are looked up by refseq ID, or by gene symbol with --gname2.
    by_symbol = bool(options.name2)
    if by_symbol:
        cross = _index_genes(GAnnotator.map["genesymbol"], True)
    else:
        cross = _index_genes(GAnnotator.map["refseq"], False)
    all_dist = GAnnotator.map["dist"]

    # bg gene set
    if options.rbg: # if a bg file is given.
        bg_dist = _lookup_distances(bg_set, cross, all_dist, by_symbol)
    else: # if a bg file is NOT given, all the refseq genes are used.
        bg_dist = all_dist

    # get the distances of gene sets.
    dist_genesets = [_lookup_distances(geneset, cross, all_dist, by_symbol) for geneset in gene_sets]

    # the step counter advances in both lookup modes.
    jobcount += 1

    #write a R script for drawing the cumulative distributions and perform KS test.
    info("#%d write a R script for drawing cumulative distributions and performing KS test..." %jobcount)
    if options.rbg:
        bgfn = options.rbg
    else:
        bgfn = "All genes"
    script_fn = options.name + ".R"
    rscript = write_R(script_fn, bg_dist, dist_genesets, options.label, options.bed, options.gset, bgfn, options.min, options.max)
    out = open(script_fn, "w")
    try:
        out.write(rscript)
    finally:
        out.close()
    jobcount += 1

    info("#%d see %s" %(jobcount, options.name + '.R...'))
    jobcount += 1

    # run the R script.
    info("#%d run the R script..." %jobcount)
    # Run R directly, feeding the script through stdin. An argument list is
    # used instead of a shell string so that file names with spaces or shell
    # characters are safe. A missing R or a failed run is reported, not fatal.
    try:
        script = open(script_fn, "r")
        try:
            status = subprocess.call(["R", "--vanilla"], stdin=script)
        finally:
            script.close()
    except (OSError, IOError):
        status = None

    if status == 0:
        info ('#... DONE! See %s for the graphical results of ChIPAssoc!' %(options.name+'.pdf'))
    else:
        info ('#... OOPS! Run %s using R for the graphical results of ChIPAssoc for yourself! ChIPAssoc could not run R directly.' %(options.name+'.R'))

    
# Script entry point: run main() only when the file is executed directly.
if __name__=="__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is not treated as an error: warn and exit with status 0.
        warn("User interrupts me! ;-) See you!")
        sys.exit(0)
