#!/usr/bin/python
# ------------------------------------
# python modules
# ------------------------------------
import os
import sys
import re
import logging
import subprocess
from subprocess import call as subpcall
import warnings
from optparse import OptionParser
import exonCEAS.inout as inout
#import exonCEAS.siteprofiler as sitepro
from pexpect import run
import exonCEAS.exonplot as exonplot

# ------------------------------------
# constants
# ------------------------------------
# Send INFO-and-above messages to stderr.  `filemode` only applies when a
# `filename` is given, so it is not passed together with `stream`.
logging.basicConfig(level=logging.INFO,
                    format='%(levelname)-5s @ %(asctime)s: %(message)s ',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    stream=sys.stderr,
                    )

# ------------------------------------
# Misc functions
# ------------------------------------
# Short aliases for the root-logger helpers used throughout this script.
error   = logging.critical		# function alias
warn    = logging.warning
debug   = logging.debug
info    = logging.info
def run_cmd ( command ):
    """Log and run a shell command, waiting for it to finish.

    Parameters:
    1. command: the command line, as a single string handed to the shell

    Return:
    None. A non-zero exit status is reported through a warning instead of
    being silently discarded; it does not raise.

    """
    info ("Run: %s" % command)
    status = subpcall (command,shell=True)
    if status != 0:
        # Best effort: keep going, but leave a trace of the failure.
        warn ("Command exited with status %s: %s" % (status, command))
    return

# ------------------------------------
# Main function
# ------------------------------------

def main():
    """Run the exon CEAS pipeline from the command line.

    Parses and validates the options, loads and sorts the gene table, drops
    chromosomes whose name contains an underscore, groups the genes and
    writes the R script, then tries to run that script with R.  Failing to
    run R is reported in the log but does not abort the program.

    """
    # read the options and validate them
    options = opt_validate(prepare_optparser())

    # read the gene annotation table
    jobcount = 1
    info("#%d read the gene table..." % jobcount)
    GeneT = inout.GeneTable()
    GeneT.read(Host=None, User=None, Db=options.gdb, annotation='GeneTable',
               columns=('name', 'chrom', 'strand', 'txStart', 'txEnd',
                        'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts',
                        'exonEnds', 'name2'))
    GeneT.sort()
    chroms_GeneT = filter_chroms(GeneT.get_chroms(), '_[A-Za-z0-9]*')

    jobcount += 1
    # NOTE(review): the resolution check itself (check_resolution_with_wiggle)
    # is not called here; only the log line is kept.
    info("-check the resolution..")
    info("#%d group gene and calculate profile" % jobcount)
    if group_genes_and_plot_exons(options, GeneT, chroms_GeneT) == True:
        info("#..Ok.Group genes and make Rscript successfully")

    # Run R directly on the generated script.  This is best effort: any
    # failure (R missing, script unreadable, non-zero exit) is only logged.
    rscript = options.name + '_CI.R'
    try:
        with open(rscript) as script:
            status = subprocess.call(["R", "--vanilla"], stdin=script)
    except (OSError, IOError):
        status = None

    if status == 0:
        info('#... Great! See %s for the graphical results of CEAS!' % (options.name + '_CI.pdf'))
    else:
        info('#... Oops! Run %s using R for the graphical results of CEAS! CEAS could not run R directly.' % rscript)

def check_resolution_with_wiggle(options):
    """Check whether the resolution is too small to cause a wrong result
    check it with the span in wiggle
    choose a larger one in spans in wiggle as well as options

    Parameters:
    1. options: object with a `wig` attribute (path of a text wiggle file)
       and a `pf_res` attribute (requested profiling resolution)

    Return:
    the larger of options.pf_res and the biggest `span=` value declared in
    the wiggle file; options.pf_res when the file declares no span at all

    """
    with open(options.wig, "r") as file_wig:
        search_scope = file_wig.read()

    # Every `span=<n>` declaration in the file, as integers.
    spans_list = [int(span) for span in re.findall(r'span=(\S+)\s', search_scope)]
    if not spans_list:
        # No span declared: there is nothing to compare the resolution with.
        return options.pf_res

    max_span = max(spans_list)
    if options.pf_res < max_span:
        logging.info("WARNING:resolution too small to fit the step in wiggle file")
        logging.info("adjust resolution from %d to %d" % (options.pf_res, max_span))
        return max_span
    return options.pf_res
     
    
    
def filter_chroms(chroms,regex):
    """Drop the chromosome names that match a user-specified pattern

    Parameters:
    1. chroms: chromosome names
    2. regex: regular expression (searched anywhere in each name)

    Return:
    a new list with the names that do not match, in their original order

    """
    # re.search returns None exactly when the pattern is found nowhere.
    return [name for name in chroms if re.search(regex, name) is None]



def read_wiggle(wiggle_path):
    """Read a text wiggle file into a dictionary keyed by chromosome

    The file content is cut at every `chrom=` declaration; inside each piece
    every `<integer><TAB><number>` pair is collected.

    Parameters:
    1. wiggle_path: path of the wiggle file

    Return:
    dictionary {chromosome name: [positions, values]} where positions is a
    tuple of ints and values a tuple of floats; the value is an empty list
    when a section holds no data pair. A later section for the same
    chromosome replaces the earlier one.

    """
    # The context manager closes the file even if reading fails.
    with open(wiggle_path, "r") as wiggle_file:
        wiggle_content = wiggle_file.read()

    wiggle_chroms = re.findall(r'chrom=(\S+)\s', wiggle_content)
    wiggle_splited = wiggle_content.split("chrom=")[1:]
    wiggle_profile_re = re.compile(r'(\d+)\t([\-]*\d+[\.]*\d*)')

    wiggle_dictionary = {}
    for (one_section, one_chrom) in zip(wiggle_splited, wiggle_chroms):
        info("reading wiggle file (%s)'s %s" % (wiggle_path, one_chrom))
        profile_list = wiggle_profile_re.findall(one_section)
        # Transpose (position, value) pairs into two parallel tuples and
        # materialise the result so that it can be indexed and re-read.
        pairs = [(int(i), float(j)) for (i, j) in profile_list]
        wiggle_dictionary[one_chrom] = list(zip(*pairs))
    return wiggle_dictionary

def read_bed(bed_path):
    """Read a BED file into a list of whitespace-split records

    Lines starting with '#' and blank lines are skipped.

    Parameters:
    1. bed_path: path of the BED file

    Return:
    list of records, each one the list of fields of a line

    """
    bed_list = []
    # "rU" (universal newlines) is kept from the original Python 2 code;
    # the context manager closes the file even if reading fails.
    with open(bed_path, "rU") as bed_file:
        for line in bed_file:
            if line.startswith("#") or not line.strip():
                continue
            bed_list.append(line.split())

    # Report the number of records actually returned.
    info("Read file<%s> OK! <%d>peaks." % (bed_path, len(bed_list)))

    return bed_list

def group_genes_and_plot_exons(options,GeneT,chroms_GeneT):
    """Write exon-boundary BED files per gene group, profile them, write the R script

    With --gn-groups, one pair of BED files (<name>_<group>_es.bed and
    <name>_<group>_ee.bed) is written per group file.  Without it, genes are
    split into "withpeak" / "withoutpeak", using the BED peaks when a BED
    file is given and the wiggle profile otherwise.  Each pair of BED files
    is then handed to the external `siteproBW` command, and finally the R
    script <name>_CI.R is written.

    Parameters:
    1. options: validated options (see opt_validate); options.gn_names is
       overwritten with ("withpeak", "withoutpeak") when auto-grouping
    2. GeneT: gene table
    3. chroms_GeneT: chromosome names to process

    Return:
    True

    """
    # some parameters for exonplot
    exonplot_biggerspan = max(options.ex_ispan, options.ex_espan)
    # `//` keeps the integer bin counts that int / int gives on Python 2.
    exonplot_lbin = options.ex_ispan // options.pf_res
    exonplot_rbin = options.ex_espan // options.pf_res

    # Command line used to dump the profile data around the BED regions.
    sitepro_cmd = ("siteproBW --span={span} --pf-res={res} --dump --name={name} "
                   "-w {wig} -l {label1} -b {bed1} -l {label2} -b {bed2} --dir")

    if options.gn_groups:
        # If there are gene groups input, just graph by groups
        for (group_file, group_name) in zip(options.gn_groups, options.gn_names):
            subsets = inout.read_gene_subsets2([group_file])
            prefix = options.name + '_' + group_name
            espath = './' + prefix + '_es.bed'
            eepath = './' + prefix + '_ee.bed'
            handles = []
            try:
                for path in (espath, eepath):
                    handles.append(open(path, 'w'))
                esfile, eefile = handles
                for chrom in chroms_GeneT:
                    ixs, subsets = exonplot.get_gene_indicies_by_groups(GeneT[chrom]['name'], subsets)
                    exons = exonplot.get_exons_byindex(GeneT, ixs, chrom, options)
                    if exons != False:
                        # Get the exons of the genes listed in the group and
                        # output them as BED; direction and exon length
                        # handling depend on the options.
                        exonplot.paired_bed_make(exons, esfile, eefile, chrom)
            finally:
                for handle in handles:
                    handle.close()

            # Dump the profile data.  The `sitepro` module is not imported
            # by this script, so the same siteproBW command as in the
            # auto-grouping branch is used.
            run_cmd(sitepro_cmd.format(span=exonplot_biggerspan, res=options.pf_res,
                                       name=prefix, wig=options.wig,
                                       label1=prefix + '_es', bed1=espath,
                                       label2=prefix + '_ee', bed2=eepath))

    else:
        info("#No gene groups are input.Auto-grouping will begin!")
        # if there is not any gene group input, just make them into two groups by score(profile)

        options.gn_names = ("withpeak", "withoutpeak")
        with_prefix = options.name + "_" + options.gn_names[0]
        without_prefix = options.name + "_" + options.gn_names[1]
        espath_withpeak = './' + with_prefix + '_es.bed'
        edpath_withpeak = './' + with_prefix + '_ee.bed'
        espath_withoutpeak = './' + without_prefix + '_es.bed'
        edpath_withoutpeak = './' + without_prefix + '_ee.bed'

        handles = []
        try:
            for path in (espath_withpeak, edpath_withpeak,
                         espath_withoutpeak, edpath_withoutpeak):
                handles.append(open(path, 'w'))
            eswfile, edwfile, esofile, edofile = handles

            if not options.bed:
                info("#No bed file is input.Auto group by wiggle profile")
                wiggle_dict = read_wiggle(options.wig)

                cutoff = 1.5
                for chrom in chroms_GeneT:
                    (ixs1, ixs2, count1, count2) = exonplot.get_gene_indicies_by_wiggleprofile(GeneT, wiggle_dict, chrom, cutoff)

                    exons_with = exonplot.get_exons_byindex(GeneT, ixs1, chrom, options)
                    exons_without = exonplot.get_exons_byindex(GeneT, ixs2, chrom, options)
                    # Both lists must be usable before either is written.
                    if exons_with != False and exons_without != False:
                        exonplot.paired_bed_make(exons_with, eswfile, edwfile, chrom)
                        exonplot.paired_bed_make(exons_without, esofile, edofile, chrom)
            else:
                info("Bed file(result of peak calling) is input.Auto group by bed peaks")
                bed_list = read_bed(options.bed)

                # Keep only the chromosomes that bigWigInfo also reports,
                # in the order of the gene table.
                output = run("bigWigInfo {0} -chroms".format(options.wig))
                chroms_in_bw = set(re.findall(r"\r\n\t(chr\w+)", output))
                for chrom in [c for c in chroms_GeneT if c in chroms_in_bw]:
                    info("#getting genes on %s" % chrom)
                    (ixs1, ixs2, count1, count2) = exonplot.get_gene_indicies_by_bedpeaks(GeneT, bed_list, chrom)
                    exons_with = exonplot.get_exons_byindex(GeneT, ixs1, chrom, options)
                    exons_without = exonplot.get_exons_byindex(GeneT, ixs2, chrom, options)
                    # Both lists must be usable before either is written.
                    if exons_with != False and exons_without != False:
                        exonplot.paired_bed_make(exons_with, eswfile, edwfile, chrom)
                        exonplot.paired_bed_make(exons_without, esofile, edofile, chrom)
        finally:
            for handle in handles:
                handle.close()

        commandline_withpeak = sitepro_cmd.format(
            span=exonplot_biggerspan, res=options.pf_res, name=options.name + "withpeak",
            wig=options.wig, label1=options.name + "_withpeak_es", bed1=espath_withpeak,
            label2=options.name + "_withpeak_ee", bed2=edpath_withpeak)
        run_cmd(commandline_withpeak)
        print(commandline_withpeak)

        commandline_withoutpeak = sitepro_cmd.format(
            span=exonplot_biggerspan, res=options.pf_res, name=options.name + "withoutpeak",
            wig=options.wig, label1=options.name + "_withoutpeak_es", bed1=espath_withoutpeak,
            label2=options.name + "_withoutpeak_ee", bed2=edpath_withoutpeak)
        run_cmd(commandline_withoutpeak)
        print(commandline_withoutpeak)

    # use 'exonplot' to draw all groups in one page and calculate confidence interval
    Rscript_text = exonplot.make_Rscript_with_CI(options.gn_names, options.name, exonplot_lbin, exonplot_rbin, options.pf_res, exonplot_biggerspan)

    with open(options.name + '_CI.R', 'w') as ofhd:
        ofhd.write(Rscript_text)
        ofhd.write('dev.off()')
    info("R script have been writen")

    if not options.dump:
        # if the dumps are not needed, just clean all temp files
        clean_dumps(options)
    return True
   
def clean_dumps(options):
    """Delete the temporary dump and BED files of every gene group

    For each name in options.gn_names the files
    ./<name>_<group>_es_dump.txt, ./<name>_<group>_ee_dump.txt,
    ./<name>_<group>_es.bed and ./<name>_<group>_ee.bed are removed.
    A file that does not exist is skipped with a warning, so that one
    missing file does not abort the clean-up.

    Parameters:
    1. options: object with `name` and `gn_names` attributes

    Return:
    True

    """
    suffixes = ('_es_dump.txt', '_ee_dump.txt', '_es.bed', '_ee.bed')
    for a_group in options.gn_names:
        for suffix in suffixes:
            path = './' + options.name + "_" + a_group + suffix
            if os.path.exists(path):
                os.remove(path)
            else:
                logging.warning("Temporary file %s not found, skipped." % path)
    return True

def opt_validate (optparser):
    """Validate options from a OptParser object.

    Checks that the gene table and at least one of the BED / WIG files are
    given and exist, derives the experiment name when --name is missing, and
    splits the comma-separated --gn-groups / --gn-group-names values into
    lists.  Any invalid input is logged and ends the program with exit
    status 1.

    Ret: Validated options object.
    """
    (options,args) = optparser.parse_args()

    # if gdb not given, print help, either BED or WIG must be given
    if not options.gdb and not options.bed and not options.wig:
        optparser.print_help()
        sys.exit(1)
    elif not options.gdb:
        error('A gene table file must be given through -g (--gt).')
        sys.exit(1)
    elif not options.bed and not options.wig:
        error('Either a BED file or a WIG file must be given.')
        sys.exit(1)

    ##
    # check what inputs are given and determine which modules will operate
    ##

    #
    # check gene annotation table database
    #
    if not os.path.isfile(options.gdb):
        error("No such gene table file as '%s'" %options.gdb)
        sys.exit(1)
    options.gdbtype = 'localdb'
    options.Host = None
    options.User = None

    #
    # check the ChIP bed file
    #
    HAVEBED = False
    if options.bed:
        HAVEBED = os.path.isfile(options.bed)
        if not HAVEBED:
            error("Check -b (--bed). No such bed file as '%s'" %options.bed)
            sys.exit(1)
        if os.path.getsize(options.bed) > 5000000:
            # Large files are only warned about, not rejected.
            warnings.warn("ChIP bed file size may be too large to run CEAS with. Make sure it is a 'peak' file!")

    #
    # check the wig file
    #
    HAVEWIG = False
    if options.wig:
        HAVEWIG = os.path.isfile(options.wig)
        if not HAVEWIG:
            error("Check -w (--wig). No such wig file as '%s'" %options.wig)
            sys.exit(1)

    # get the experiment name
    #
    # if options.name is not given, BED and WIG file names will be used in order
    if not options.name:
        # Cut at the LAST occurrence of the extension only, so that a name
        # such as 'a.bed.b.bed' yields 'a.bed.b' rather than 'a'.
        if HAVEBED:
            options.name = os.path.basename(options.bed).rsplit('.bed', 1)[0]
        elif HAVEWIG:
            options.name = os.path.basename(options.wig).rsplit('.wig', 1)[0]

    # Average profiling related parameters
    #
    # check if name2 is going to be used instead of name
    options.name2 = bool(options.name2 and options.gn_groups)

    # check the gene group files
    if options.gn_groups:
        parsed = options.gn_groups.rsplit(',')
        for p in parsed:
            if not os.path.isfile(p):
                error("Check --gn-groups. No such file as '%s'" %p)
                sys.exit(1)
        options.gn_groups = parsed

        # gene group names. If not given, Group_1, Group_2, ... Group_n will be used
        if options.gn_names:
            parsed_names = options.gn_names.rsplit(',')
            if len(parsed_names) < len(options.gn_groups):
                error('There must be the equal or more group names to or than gene groups')
                sys.exit(1)
            options.gn_names = parsed_names
        else:
            options.gn_names = ['Group_%d' % (i + 1) for i in range(len(options.gn_groups))]

    return options
# ------------------------------------
# functions
# ------------------------------------
  
def prepare_optparser ():
    """Prepare optparser object. New options will be added in this
    function first.

    Ret: an OptionParser holding every command-line option of the script,
    with its own -h/--help option.
    """

    usage = "usage: %prog < input files > [options]"
    description = "Exon CEAS (Cis-regulatory Element Annotation System for Exon)"

    optparser = OptionParser(version="%prog -- 0.9.9.8 beta (package version 1.0.2)",description=description,usage=usage,add_help_option=False)
    optparser.add_option("-h","--help",action="help",help="Show this help message and exit.")

    # input files
    optparser.add_option("-b","--bed",dest="bed",type="string",
                         help="BED file of ChIP regions.")
    optparser.add_option("-w","--wig",dest="wig",type="string",
                         help="WIG file for either wig profiling or genome background annotation. WARNING: --bg flag must be set for genome background re-annotation.")
    optparser.add_option("-g","--gt",dest="gdb",type="string",
                         help="Gene annotation table (eg, a refGene table in sqlite3 db format provided through the CEAS web, http://liulab.dfci.harvard.edu/CEAS/download.html).")

    # naming and gene groups
    optparser.add_option("--name",dest="name",
                         help="Experiment name. This will be used to name the output files. If an experiment name is not given, the stem of the input BED file name will be used instead (eg, if 'peaks.bed', 'peaks' will be used as a name.)")
    optparser.add_option("--gn-group-names", dest="gn_names",type="string",
                         help="The names of the gene groups in --gn-groups. The gene group names are separated by commas. (eg, --gn-group-names='top 10%,bottom 10%'). These group names appear in the legends of the wig profiling plots. If no group names given, the groups are represented as 'Group 1, Group2,...Group n'.")
    optparser.add_option("--gname2", action="store_true", dest="name2",
                         help="Whether or not use the 'name2' column of the gene annotation table when reading the gene IDs in the files given through --gn-groups. This flag is meaningful only with --gn-groups.",default=False)
    optparser.add_option("--gn-groups",dest="gn_groups",type="string",
                         help="Gene-groups of particular interest in wig profiling. Each gene group file must have gene names in the 1st column. The file names are separated by commas w/ no space (eg, --gn-groups=top10.txt,bottom10.txt)")

    # profiling parameters
    optparser.add_option("--pf-res", dest="pf_res", type="int",
                         help="Wig profiling resolution, DEFAULT: 50bp. WARNING: Value smaller than the wig interval (resolution) may cause aliasing error.", default=50)
    optparser.add_option("--utr", action="store_true", dest="exon_utr",
                         help="Whether to select the first and the last exon(next to 5' utr or 3' utr).If set,these exons will be included.",default=False)
    # The help texts state the default actually set below (300).
    optparser.add_option("--espan", dest="ex_espan", type="int",
                         help="Span from exon boundaries to exon region,DEFAULT=300bp", default=300)
    optparser.add_option("--ispan", dest="ex_ispan", type="int",
                         help="Span from exon boundaries to intron region,DEFAULT=300bp", default=300)

    # output
    optparser.add_option("--dump", action="store_true", dest="dump",
                         help="Whether to save the raw profiles of near exon boundary. The file names have a suffix of _es_dump.txt, and _ee_dump.txt after the name and the group name.",default=False)
    return optparser
                    
if __name__ == '__main__':
    # Script entry point: run the pipeline; an interrupt from the keyboard
    # ends the program quietly with exit status 0 instead of a traceback.
    interrupted = False
    try:
        main()
    except KeyboardInterrupt:
        interrupted = True
    if interrupted:
        warn("User interrupts me! ;-) See you!")
        sys.exit(0)
