#!/hive/groups/encode/dcc/bin/python
import sys, os, shutil, stat, argparse, datetime
from ucscgenomics import track, ra, soft, cv

"""
mkGeoPkg - create a soft file and upload script for a given track, so that it
may be submitted to GEO.

To invoke the script, you must pass the database and the composite track name:
    mkGeoPkg hg19 wgEncodeSomeTrack
    
This is typically not enough however; most tracks are not completely covered
by their metadata, and it is necessary to supply additional information. The
most commonly needed information is:
    !Series_summary - this is taken from the track's html page description. 
        In most cases it can be copied, one paragraph per line.
    !Series_overall_design - this is taken from the Methods section on the
        track's page. As with summary, 1 paragraph per line.
    !Series_contributor - this is taken from the contributors list on the 
        track's page. It has a special format: "Last,First" where each person
        is listed on a separate line.
    !Sample_instrument_model - this is a bit more difficult, as it is technically
        supposed to be a per-sample field. Occasionally this appears in the
        metaDb for newer tracks, if so, it's automatically added in. Otherwise
        it must be either specified on a whole-series basis, or added to the
        metadata. In many cases, we don't actually know all of them. This is
        okay. GEO will allow us to submit whatever information we do have, and
        they can take care of the rest. The instrumentModels dictionary below
        gives a list of the allowable entered values (the keys). The values
        map to the "human readable" version used by GEO.
        
You can supply all of the above information in an additional file and specify
it using the r (replace) option:
    mkGeoPkg -r somefile hg19 wgEncodeSomeTrack
    
The replace file must be organized with each of the above key value pairs as
in a soft file:
    !Series_summary = some summary stuff ...
    !Series_summary = another paragraph of summary ...
    !Series_overall_design = ...
    !Series_contributor = Researcher,Some
    !Series_contributor = Researcher,Another
    !Sample_instrument_model = Illumina_GA2
    
There is a template for this, named replaceTemplate in the directory.

You may need to submit only part of the track; in this case you can specify
the experiment ID numbers to include, individually or as hyphenated ranges:
    mkGeoPkg hg19 wgEncodeSomeTrack 140 150 160-170

One problem you may run into while running the script is that the DataType is
not valid. This means that the huge dictionary called datatypes below isn't
filled in for that entry. If you can get ahold of the information, modify the
script and push the changes back out.

You may also get an error when trying to submit MicroArray data. This is to be
expected: MicroArray is currently not functional at all. We currently have no
way to map our data to the format expected by GEO, so we've punted on this
issue for the time being.

Once you've successfully run the script, you'll have generated a soft file and
a script that will handle the uploading process. All of this will be put into
a directory named 'database_wgEncodeSomeTrack_year-month-day'. To begin the
upload, simply cd into the directory and run upload.sh. This will start the
aspera copy program, copying files, a files list, and the soft file itself.

For each submission, you need to email GEO. Our current contact is:
    Steve Wilhite, wilhite@ncbi.nlm.nih.gov
    
You must specify which track you're submitting. GEO will only allow us 1TB of
space dedicated to ENCODE, so you must break down submissions larger than 1TB
and only submit as many submissions as they have room to process at any given
time. In a few days, GEO will get back to you with a list of accession numbers
which need to be put back into our metadata (see encodeAddGeoAccessions).

"""

class DataType(object):
    """GEO description of one ENCODE data type.

    Stores the molecule, library strategy, library source and library
    selection strings reported for the type, together with the soft file
    class that is used to package it.
    """

    def __init__(self, molecule, strategy, source, selection, soft):
        # Plain value holder: each argument is kept under its own name.
        self.molecule, self.strategy = molecule, strategy
        self.source, self.selection = source, selection
        self.soft = soft

# Maps the metadata dataType name to the GEO fields reported for that type.
# A molecule of 'OVERRIDE RNA' means the molecule is chosen per sample from the
# rnaExtract / localization metadata instead of being taken from this table.
# Entries whose soft class is None cannot be packaged: main() rejects them as
# an unsupported type.
datatypes = {
    'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', soft.HighThroughputSoftFile),
    'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', soft.HighThroughputSoftFile),
    'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', soft.HighThroughputSoftFile),
    'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile),
    'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile),
    'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile),
    'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', soft.HighThroughputSoftFile),
    'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', soft.HighThroughputSoftFile),
    'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile),
    'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', soft.HighThroughputSoftFile),
    'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', soft.HighThroughputSoftFile),
    'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', soft.HighThroughputSoftFile),
    
    # The entries below are placeholders ('REPLACE') that still need to be
    # curated before those data types can be submitted.
    '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', soft.MicroArraySoftFile),
    'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Genotype': DataType('genomic DNA', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'RipGeneSt': DataType('OVERRIDE RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', soft.HighThroughputSoftFile), # NOTE: flagged by the original author as not correct
    'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
}

# Value written to !Series_gp_id, keyed by '<organism> <source>' where the
# source is the DataType.source of the track (see createSeries).
gpIds = {
    'human genomic': '63443',
    'human transcriptomic': '30709',
    'human protein': '63447',
    
    'mouse genomic': '63471',
    'mouse transcriptomic': '66167',
    'mouse protein': '63475'
}

# For a metadata variable of the given type, the controlled vocabulary fields
# that are added to !Sample_characteristics alongside the variable itself.
cvDetails = {
    'cell':    [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
    'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
}

# cv fields for which a value present in the metadata stanza overrides the
# value stored in the cv.
cvOverride = [ 'sex' ]

# '<variable> <value>' pairs that are looked up in cvDetails under a different
# type name. The rationale is not recorded here; the original note only says
# to ask Venkat.
cvPretend = { 'antibody Input': 'control' }

# cv fields to report for variables whose type is not listed in cvDetails.
cvDefaults = [ 'description' ]

# Metadata variables that are written to !Sample_characteristics, in addition
# to the experiment variables, whenever the sample's first stanza has them.
mdbWhitelist = [
    'age',
    'bioRep',
    'control',
    'controlId',
    'fragSize',
    'labExpId',
    'labVersion',
    'mapAlgorithm',
    'obtainedBy',
    'phase',
    'readType',
    'region',
    'replicate',
    'restrictionEnzyme',
    'run',
    'softwareVersion',
    'spikeInPool',
    'strain'
]

# If the molecule is RNA, our metadata has to be mapped onto !Sample_molecule,
# which only accepts certain values. rnaExtractMapping is checked first; if the
# rnaExtract value is not listed there, the localization is used instead. This
# is because (at present) polyA is the most important trait; anything else
# would be nonPolyA, which GEO does not accept.
rnaExtractMapping = {
    'shortPolyA': 'polyA RNA', 
    'longPolyA': 'polyA RNA', 
    'polyA': 'polyA RNA'
}

# Fallback for RNA samples: maps the localization value to the value written
# to !Sample_molecule.
localizationMapping = {
    'cytosol': 'cytoplasmic RNA', 
    'polysome': 'cytoplasmic RNA',
    'membraneFraction': 'cytoplasmic RNA',
    'mitochondria': 'cytoplasmic RNA',
    'nucleus': 'nuclear RNA', 
    'nucleolus': 'nuclear RNA', 
    'nucleoplasm': 'nuclear RNA', 
    'nuclearMatrix': 'nuclear RNA', 
    'chromatin': 'nuclear RNA',
    'cell': 'total RNA'
}

# Maps our instrument names (the keys, which are the values accepted in the
# metadata and in the replace file) to the names GEO uses.
# Note that 'Unknown' is reported as 'Illumina Genome Analyzer'; samples using
# it also get an explanatory !Sample_extract_protocol.
instrumentModels = {
    'Illumina_GA2x': 'Illumina Genome Analyzer II',
    'Illumina_GA2': 'Illumina Genome Analyzer II',
    'Illumina_HiSeq_2000': 'Illumina HiSeq 2000',
    'Illumina_GA1': 'Illumina Genome Analyzer',
    'Illumina_GA1_or_GA2': 'Illumina Genome Analyzer, Illumina Genome Analyzer II',
    'SOLiD_Unknown': 'SOLiD',
    'Unknown': 'Illumina Genome Analyzer'
}

    
def isRawFile(file):
    """Return True when the file's extension is 'fastq' or 'fasta'."""
    return file.extension in ('fastq', 'fasta')
    
def isSupplimentaryFile(file):
    """Return True for every file that isRawFile() does not accept."""
    if isRawFile(file):
        return False
    return True
    
def sampleTitle(stanza, expVars, warn=False):
    """Build a sample title by joining the stanza's experiment variable values.

    The value of the first experiment variable always starts the title, with
    every '-m' removed from it. Each later variable contributes '_<value>'
    when it is present in the stanza and its value is not the string 'None'.

    stanza - mapping of variable name to value; its name attribute is only
        read when a warning is printed
    expVars - list of experiment variable names, first one mandatory
    warn - when true, print a warning for each skipped variable
    """
    pieces = [stanza[expVars[0]].replace('-m', '')]
    for expVar in expVars[1:]:
        if expVar not in stanza or stanza[expVar] == 'None':
            if warn:
                print('warning: %s is None or not in %s' % (expVar, stanza.name))
            continue
        pieces.append(stanza[expVar])
    return '_'.join(pieces)
    
def linkName(file, track):
    """Return the link name for a file: '<track.database>_<file.name>'."""
    database = track.database
    return '%s_%s' % (database, file.name)
    
def createMappings(mdb):
    """Sort the metadata stanzas into the structures needed to build a soft file.

    mdb - metadata collection; its itervalues() yields the stanzas

    Returns (expIds, expVars, geoMapping, series, datatype):
        expIds - dict of expId -> list of stanzas carrying that expId
        expVars - the composite stanza's expVars, split on commas
        geoMapping - dict of expId -> GEO sample accession (see note below)
        series - the composite stanza itself
        datatype - the datatypes entry for the track, with its name attribute
            set to the dataType string

    Raises KeyError when stanzas disagree on dataType, or when the dataType
    has no entry in datatypes.
    """
    expIds = {}
    geoMapping = {}
    expVars = None
    series = None
    dataTypeName = None

    for stanza in mdb.itervalues():

        # The composite stanza describes the series rather than a sample.
        if 'objType' in stanza and stanza['objType'] == 'composite':
            series = stanza
            expVars = stanza['expVars'].split(',')
            continue

        if 'expId' not in stanza:
            print(stanza.name + ': no expId')
            continue

        expId = stanza['expId']

        # NOTE(review): the trailing "or 1" makes this condition always true,
        # so every stanza is queued for submission and the else branch (which
        # records existing accessions) never runs. Kept as found - confirm
        # whether this override is still wanted.
        if 'geoSampleAccession' not in stanza or 1:
            expIds.setdefault(expId, []).append(stanza)
        else:
            # remember the accession of samples that were already submitted
            accession = stanza['geoSampleAccession']
            if expId not in geoMapping:
                geoMapping[expId] = accession
            elif geoMapping[expId] != 'Inconsistent' and geoMapping[expId] != accession:
                geoMapping[expId] = 'Inconsistent'
                print(stanza.name + ': inconsistent geo mapping')

        # every stanza that states a dataType has to state the same one
        if 'dataType' in stanza:
            if dataTypeName is None:
                dataTypeName = stanza['dataType']
            elif dataTypeName != stanza['dataType']:
                raise KeyError(stanza.name + ': inconsistent data type')

    try:
        datatype = datatypes[dataTypeName]
        datatype.name = dataTypeName
    except KeyError:
        raise KeyError(dataTypeName)

    return expIds, expVars, geoMapping, series, datatype
    
    
def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit):
    """Add the series stanza for the composite to softfile.

    Nothing is added when the composite's metadata already has a
    geoSeriesAccession. Summary, overall design and contributors come from
    the replace dict; a missing one is filled with '[REPLACE]'.

    softfile - soft file the series stanza is stored in, keyed by composite
    compositeTrack - supplies trackDb, name, url and organism
    expIds - dict of expId -> list of stanzas; one sample id is listed per key
    expVars - experiment variables used to build sample titles
    geoMapping - dict of expId -> existing GEO sample accession
    series - the composite's metadata stanza
    datatype - DataType whose source selects the GEO project id
    replace - dict of soft key -> values taken from the replace file
    audit - when true, also report each missing field
    """
    composite = series['composite']

    # A series that already has an accession is not written again.
    if 'geoSeriesAccession' in series:
        print('Existing series ' + composite + ' using geoSeriesAccession ' + series['geoSeriesAccession'])
        return

    print('Writing series ' + composite)

    seriesStanza = soft.SeriesStanza()
    seriesStanza['^SERIES'] = composite
    seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT

    if '!Series_summary' not in replace:
        print('warning: no series summary found. Please include in replace file.')
        seriesStanza['!Series_summary'] = '[REPLACE]'
        if audit:
            print(seriesStanza.name + ': no summary')
    else:
        seriesStanza['!Series_summary'] = replace['!Series_summary']

    if '!Series_overall_design' not in replace:
        print('no series overall design found. Please include in replace file.')
        seriesStanza['!Series_overall_design'] = '[REPLACE]'
        if audit:
            print(seriesStanza.name + ': no overall design')
    else:
        seriesStanza['!Series_overall_design'] = replace['!Series_overall_design']

    encodeInfoUrl = 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html'
    seriesStanza['!Series_web_link'] = [ compositeTrack.url, encodeInfoUrl ]

    if '!Series_contributor' not in replace:
        seriesStanza['!Series_contributor'] = '[REPLACE]'
        if audit:
            print(seriesStanza.name + ': no contributor')
    else:
        seriesStanza['!Series_contributor'] = replace['!Series_contributor']

    gpKey = compositeTrack.organism + ' ' + datatype.source
    seriesStanza['!Series_gp_id'] = gpIds[gpKey]

    # could use !Series_variable_* and !Series_repeats_*

    # Samples are referenced by accession when a usable one is known,
    # otherwise by the title their sample stanza is given.
    seriesStanza['!Series_sample_id'] = list()
    for idNum in expIds:
        if geoMapping.get(idNum, 'Inconsistent') != 'Inconsistent':
            sampleRef = geoMapping[idNum]
        else:
            sampleRef = sampleTitle(expIds[idNum][0], expVars)
        seriesStanza['!Series_sample_id'].append(sampleRef)

    softfile[composite] = seriesStanza
    
def _consistentSeqPlatform(stanzas, idNum, audit):
    """Return the seqPlatform shared by all stanzas that state one, else None.

    None is returned both when no stanza has a seqPlatform and when two
    stanzas disagree; the disagreement is reported in audit mode.
    """
    instrumentModel = None
    for stanza in stanzas:
        if 'seqPlatform' not in stanza:
            continue
        if instrumentModel is None:
            instrumentModel = stanza['seqPlatform']
        elif instrumentModel != stanza['seqPlatform']:
            if audit:
                # report the experiment id, not the whole list of stanzas
                print('expId ' + str(idNum) + ': inconsistent instrument model')
            return None
    return instrumentModel


def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit):
    """Build the high-throughput soft file for the given experiments.

    Writes the series stanza via createSeries, then one sample stanza per
    entry of expIds, stored in the soft file under the metaObject name of the
    experiment's first stanza.

    compositeTrack - supplies files, database, organism and url
    cv - controlled vocabulary, indexed by term
    expIds - dict of expId -> list of metadata stanzas of that experiment
    expVars - experiment variables, used for titles and characteristics
    geoMapping - dict of expId -> existing GEO sample accession
    series - the composite's metadata stanza
    datatype - DataType entry describing the track
    replace - dict of soft key -> list of values from the replace file
    audit - when true, print additional problem reports

    Returns (softfile, fileList). fileList holds the file objects referenced
    by the samples: per sample, all raw files followed by all supplementary
    files.
    """
    print('Creating HighThroughput soft file')

    softfile = soft.HighThroughputSoftFile()
    fileList = list()

    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit)

    for idNum in expIds:

        expId = expIds[idNum]
        firstStanza = expId[0]
        print('Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')')
        sample = soft.HighThroughputSampleStanza()

        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1)
        sample['!Sample_type'] = 'SRA'
        sample['!Sample_title'] = sample['^SAMPLE']

        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']

        # figure out if the instrument model is consistent across the entire sample
        instrumentModel = _consistentSeqPlatform(expId, idNum, audit)

        # Resolve each file of the experiment once, remembering its stanza.
        sampleFiles = list()
        for stanza in expId:
            for fname in stanza['fileName'].split(','):
                sampleFiles.append((stanza, compositeTrack.files[fname]))

        # Raw files first; they are numbered independently of the
        # supplementary files.
        count = 1
        for stanza, sampleFile in sampleFiles:
            if not isRawFile(sampleFile):
                continue
            suffix = str(count)
            sample['!Sample_raw_file_' + suffix] = linkName(sampleFile, compositeTrack)
            sample['!Sample_raw_file_type_' + suffix] = sampleFile.extension

            if sampleFile.md5sum is not None:
                sample['!Sample_raw_file_checksum_' + suffix] = sampleFile.md5sum

            # per-file model only when the sample has no single model
            if instrumentModel is None and 'seqPlatform' in stanza:
                sample['!Sample_raw_file_instrument_model_' + suffix] = stanza['seqPlatform']

            fileList.append(sampleFile)
            count += 1

        count = 1
        for stanza, sampleFile in sampleFiles:
            if not isSupplimentaryFile(sampleFile):
                continue
            suffix = str(count)
            sample['!Sample_supplementary_file_' + suffix] = linkName(sampleFile, compositeTrack)

            if sampleFile.md5sum is not None:
                sample['!Sample_supplementary_file_checksum_' + suffix] = sampleFile.md5sum

            sample['!Sample_supplementary_file_build_' + suffix] = compositeTrack.database

            if instrumentModel is None and 'seqPlatform' in stanza:
                sample['!Sample_supplementary_file_instrument_model_' + suffix] = stanza['seqPlatform']

            fileList.append(sampleFile)
            count += 1

        sample['!Sample_source_name'] = firstStanza['cell']
        sample['!Sample_organism'] = compositeTrack.organism

        sample['!Sample_characteristics'] = list()

        for var in expVars + mdbWhitelist:
            if var not in firstStanza:
                continue
            value = firstStanza[var]
            sample['!Sample_characteristics'].append(var + ': ' + value)

            # some variable/value pairs are described under another cv type
            cvType = cvPretend.get(var + ' ' + value, var)

            if cvType in cvDetails:
                for cvVar in cvDetails[cvType]:
                    if cvVar in cvOverride and cvVar in firstStanza:
                        # the metadata value wins over the cv value
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar])
                    elif cvVar in cv[value]:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[value][cvVar])
            else:
                for cvVar in cvDefaults:
                    if value in cv and cvVar in cv[value]:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[value][cvVar])

        sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName']

        if 'treatment' in firstStanza:
            sample['!Sample_treatment_protocol'] = firstStanza['treatment']

        # The cell's protocol field is a space separated list of key:value
        # items; the ENCODE one or the one of this lab's PI is used.
        if 'protocol' in cv[firstStanza['cell']]:
            for protocol in cv[firstStanza['cell']]['protocol'].split(' '):
                key, val = protocol.split(':')
                if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']:
                    sample['!Sample_growth_protocol'] = val

        if datatype.molecule == 'OVERRIDE RNA':
            # RNA: the extract decides first, then the localization
            if firstStanza['rnaExtract'] in rnaExtractMapping:
                sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']]
            elif firstStanza['localization'] in localizationMapping:
                sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']]
        else:
            sample['!Sample_molecule'] = datatype.molecule

        if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
        else:
            sample['!Sample_extract_protocol'] = compositeTrack.url
        sample['!Sample_library_strategy'] = datatype.strategy
        sample['!Sample_library_source'] = datatype.source
        sample['!Sample_library_selection'] = datatype.selection

        # if the instrumentModel is consistent, just use that
        # otherwise take the first seqPlatform value from metadata
        # if that still fails, check the replacement file
        # finally just make it say [REPLACE]
        if instrumentModel is not None:
            # Report GEO's name for the platform, as the fallback below does;
            # a name without a mapping is passed through unchanged.
            sample['!Sample_instrument_model'] = instrumentModels.get(instrumentModel, instrumentModel)
        else:
            for stanza in expId:
                if 'seqPlatform' in stanza:
                    sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']]
                    break
            if '!Sample_instrument_model' not in sample and '!Sample_instrument_model' in replace:
                sample['!Sample_instrument_model'] = instrumentModels[replace['!Sample_instrument_model'][0]]
            if '!Sample_instrument_model' not in sample:
                sample['!Sample_instrument_model'] = '[REPLACE]'
                if audit:
                    # named after the experiment's last stanza
                    print(expId[-1].name + ': no instrument')

        sample['!Sample_data_processing'] = compositeTrack.url

        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
            sample['!Sample_geo_accession'] = geoMapping[idNum]

        softfile[firstStanza['metaObject']] = sample

    return softfile, fileList
        
        
def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype):
    """Placeholder for MicroArray soft file generation.

    Always raises KeyError('microarray'); everything after the raise is
    unreachable and only sketches the intended output.

    NOTE(review): the unreachable body would not run as written. It uses
    SoftFile, MicroArraySampleStanza and concat, none of which are defined in
    the visible part of this file, passes a file name string to
    isSupplimentaryFile (which reads an extension attribute), and calls
    createSeries with a different argument list than its definition.
    """
    
    # MicroArray submissions are not supported yet: stop here.
    raise KeyError('microarray')
    
    print 'Creating MicroArray soft file'

    softfile = SoftFile()
    fileList = list()
    
    createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series)
    
    for idNum in expIds.iterkeys():
        
        expId = expIds[idNum]
        firstStanza = expId[0]
        print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
        sample = MicroArraySampleStanza()
            
        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars)
        
        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']    
            
        sample['!Sample_title'] = concat
        
        count = 1
            
        # list every non-raw file of the experiment as a supplementary file
        for stanza in expId:
        
            if isSupplimentaryFile(stanza['fileName']):
                sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
                
                # prefer the checksum from the metadata, then the md5sums table
                if 'checksum' in stanza:
                    sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
                elif md5sums != None and stanza['fileName'] in md5sums:
                    sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
                
                fileList.append(stanza['fileName'])
                count = count + 1

        # The remaining fields are placeholders to be filled in by hand.
        # sample['!Sample_table'] = KeyOptional # CEL file
        sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
        sample['!Sample_organism_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
        sample['!Sample_characteristics_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
        # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered
        # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered
        # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered
        sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
        sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
        sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
        sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
        sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus
        sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus
        sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus
        sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus
        sample['!Sample_platform_id'] = '[REPLACE]'
        # sample['!Sample_geo_accession'] = KeyOptional
        # sample['!Sample_anchor'] = KeyRequired SAGE ONLY
        # sample['!Sample_type'] = KeyRequired SAGE ONLY
        # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY
        # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY
        sample['!Sample_table_begin'] = ''
        sample['!Sample_table_end'] = ''
    
        softfile[firstStanza['accession']] = sample
        
    return softfile, fileList
    
    
def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype):
    """Build a soft file that lists only the bigWig files of each experiment.

    Experiments without any bigWig file are skipped. Each remaining sample is
    identified by the geoSampleAccession of its first stanza.

    Returns (softfile, fileList), where fileList holds the bigWig file names.

    NOTE(review): main() does not call this function, and it would not run as
    written: SoftFile, HighThroughputSampleStanza and getFileType are not
    defined in the visible part of this file, and createSeries is called with
    a different argument list than its definition.
    """
    softfile = SoftFile()
    fileList = list()
    
    createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series)
    
    for idNum in expIds.iterkeys():
        
        expId = expIds[idNum]
        firstStanza = expId[0]
        print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
        sample = HighThroughputSampleStanza()
        
        # only experiments with at least one bigWig file are written
        hasbigwig = 0
        for stanza in expId:
        
            if getFileType(stanza['fileName']) == 'bigWig':
                hasbigwig = 1
                
        if hasbigwig == 0:
            continue
        
        sample['^SAMPLE'] = firstStanza['geoSampleAccession']
        
        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']
            
        sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession']
        
        count = 1
            
        for stanza in expId:
        
            if getFileType(stanza['fileName']) == 'bigWig':
                sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
                
                # prefer the checksum from the metadata, then the md5sums table
                if 'checksum' in stanza:
                    sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
                elif md5sums != None and stanza['fileName'] in md5sums:
                    sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
                
                # sample['!Sample_supplementary_file_build_' + str(count)] = database
                
                fileList.append(stanza['fileName'])
                count = count + 1
                
        softfile[firstStanza['geoSampleAccession']] = sample
        
    return softfile, fileList
        
def main():
    """Parse the command line, build the soft file and write the GEO package.

    In audit mode only problems are reported and nothing is written.
    Otherwise a directory '<database>_<composite>_<date>/' is created that
    holds upload.sh and a '<database>_<composite>/' subdirectory with the
    soft file, files.txt and one symlink per submitted file.

    Raises KeyError for data types that cannot be packaged and for requested
    expIds that are not in the metadata; ValueError for a replace file line
    that is not of the form 'key = value'.
    """
    parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.')
    parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/')
    parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model')
    parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors')
    parser.add_argument('database', help='The database, typically hg19 or mm9')
    parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance')
    parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file')

    if len(sys.argv) == 1:
        parser.print_usage()
        return

    args = parser.parse_args(sys.argv[1:])

    compositeTrack = track.CompositeTrack(args.database, args.composite, args.trackPath)

    cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra'
    controlledVocab = cv.CvFile(cvPath)

    # Each key of the replace file may occur several times; all values are
    # collected in file order.
    replace = dict()
    if args.replace is not None:
        with open(args.replace) as replaceFile:
            for line in replaceFile:
                # lines keep their newline, so test the stripped text
                if line.strip() == '':
                    continue
                if '=' not in line:
                    raise ValueError('%s: expected "key = value", got %r' % (args.replace, line))
                key, val = [part.strip() for part in line.split('=', 1)]
                replace.setdefault(key, list()).append(val)

    # Expand the requested experiment ids; 'a-b' is an inclusive range.
    ids = list()
    for expIdArg in args.expIds:
        if '-' in expIdArg:
            start, end = expIdArg.split('-', 1)
            ids.extend(range(int(start), int(end) + 1))
        else:
            ids.append(int(expIdArg))

    expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb)

    # Without explicit ids the whole track is submitted.
    if len(ids) == 0:
        submission = expIds
    else:
        submission = dict()
        for expId in ids:
            submission[str(expId)] = expIds[str(expId)]

    expIdStr = ''
    if args.expIds:
        expIdStr = ' ' + ','.join(args.expIds)
    print('Generating soft using expIds ' + expIdStr)

    if datatype.soft == soft.HighThroughputSoftFile:
        softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit)
    elif datatype.soft == soft.MicroArraySoftFile:
        # The call has to match the 13-parameter signature so that the
        # function's own "microarray" error is what the user sees. The values
        # are matched to the parameters by name as a best guess; the function
        # currently raises before using any of them.
        softfile, fileList = createMicroArraySoftFile(compositeTrack.database, compositeTrack.name, compositeTrack.organism, compositeTrack.url, compositeTrack.alphaMetaDb, controlledVocab, compositeTrack, None, submission, expVars, geoMapping, series, datatype)
    else:
        raise KeyError('unsupported type ' + datatype.name)

    if args.audit:
        # audit only reports missing files; nothing is written
        for geoFile in fileList:
            if not os.path.exists(geoFile.path):
                print(IOError(geoFile.path + ' does not exist'))
        return

    print('Creating directory')

    d = datetime.datetime.today()
    datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day)

    dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring)
    linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name)

    os.mkdir(dirname)
    os.mkdir(dirname + linkdirname)

    print('Writing file')

    outfileName = '%s%s_%s.soft' % (dirname + linkdirname, compositeTrack.database, compositeTrack.name)
    with open(outfileName, 'w') as outfile:
        outfile.write(str(softfile))

    scriptname = '%supload.sh' % dirname
    with open(scriptname, 'w') as outscript:
        outscript.write('#!/bin/sh\n\n')
        outscript.write('/cluster/home/mmaddren/.aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname)

    fileslistname = '%sfiles.txt' % (dirname + linkdirname)
    with open(fileslistname, 'w') as fileslist:
        for geoFile in fileList:
            # a missing file is reported but does not stop the packaging
            if not os.path.exists(geoFile.path):
                print(IOError(geoFile.path + ' does not exist'))
                continue
            linkname = linkName(geoFile, compositeTrack)
            os.symlink(geoFile.fullname, dirname + linkdirname + linkname)
            fileslist.write(linkname + '\n')

    # make upload.sh executable (the equivalent of chmod a+x)
    executableBits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    os.chmod(scriptname, os.stat(scriptname).st_mode | executableBits)

    print('Finished!')

if __name__ == '__main__':
    main()