#!/usr/bin/env perl

use warnings;
use strict;

use Getopt::Long;
use Cwd;

use lib "/cluster/bin/scripts";
use Encode;
use RAFile;
use HgAutomate;
use HgDb;

use vars qw/
    $opt_configDir
    $opt_verbose
    $opt_instance
    /;

sub usage {
    print STDERR <<END;
usage: loadCellTbl database table [term]
    Populate the a CV table in the given database from the cv.ra file.    

    -instance=s         Use ENCODE pipeline instance s (default prod).
    -configDir=dir      Path of configuration directory, containing metadata
                        .ra files (default: /hive/groups/encode/dcc/pipeline/encpipeline_prod/config/)
    -verbose=num        Set verbose level to num (default 1).
END
exit 1;
}

################################################################################

my $commentedout = <<EOS;
# cell table
CREATE TABLE cell (
    name VARCHAR(255) NOT NULL,
    tag VARCHAR(255) NOT NULL,
    organism VARCHAR(255) NOT NULL,
    Category VARCHAR(20),
    description TEXT,
    tissue VARCHAR(255),
    vendorName TEXT,
    vendorId VARCHAR(255),
    orderUrl VARCHAR(255),
    karyotype VARCHAR(255),
    lineage TEXT,
    termId VARCHAR(255),
    termUrl VARCHAR(255),
    color VARCHAR(255),
    sex VARCHAR(20),
    tier INT,
    protocol VARCHAR(255),
    PRIMARY KEY (name)
);

#antibody table
CREATE TABLE antibody (
    name VARCHAR(255) NOT NULL,
    tag VARCHAR(255) NOT NULL,
    displayName VARCHAR(255) DEFAULT "Not Decided",
    antibodyDescription TEXT,
    validation VARCHAR(255),
    lab VARCHAR(255),
    targetId VARCHAR(255),
    targetDescription TEXT,
    targetUrl VARCHAR(255),
    geneId VARCHAR(255),
    vendorName VARCHAR(255),
    vendorId VARCHAR(255),
    lots VARCHAR(255),
    orderUrl VARCHAR(255),
    PRIMARY KEY (name)
);

#protocol table
CREATE TABLE protocol (
    name VARCHAR(255) NOT NULL,
    tag VARCHAR(255) NOT NULL,
    description VARCHAR(255) NOT NULL
);

#treatment table
CREATE TABLE treatment (
    name VARCHAR(255) NOT NULL,
    tag VARCHAR(255) NOT NULL,
    description VARCHAR(255),
    label VARCHAR(255) DEFAULT "Not Decided"
);

#control table
CREATE TABLE control (
    name VARCHAR(255) NOT NULL,
    tag VARCHAR(255) NOT NULL,
    description VARCHAR(255)
);

EOS

################################################################################

my %termToColumnMapper = (
    "term" => "name",
    "Lots" => "lots" # not sure why the term is spelled different need to investigate
);

sub mapTermToColumn { 
    my $term = shift;
    if (exists $termToColumnMapper{$term}) {
        return $termToColumnMapper{$term};
    } else {
        return $term;
    }
}

sub mapColumnToTerm {
    my $column = shift;
    for my $k (keys %termToColumnMapper) {
        if ($column eq $termToColumnMapper{$k}) {
            return $k;
        }
    }

    return $column;
}

# return a list of warning
sub getDbWarnings {
    my $dbHandle = shift;
    my @warnings = ();
    my $results = $dbHandle->execute("SHOW WARNINGS") or die $dbHandle->errstr;
    while (my @row = $results->fetchrow_array()) {
        push @warnings, $row[2]; 
    }
    return @warnings;
}

# return a list of the fields in a database table
sub getDbFields {
    my $dbHandle = shift;
    my $tableName = shift;

    my @fieldsList = ();

    my $cmd = "SHOW COLUMNS FROM $tableName";
    my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
    if ($results) {
        while (my @row = $results->fetchrow_array()) {
            push @fieldsList, $row[0]; 
        }
    } else {
        die $dbHandle->errstr;
    }

    return sort @fieldsList;
}

# returns all the possible fields for the given CV terms
sub getCvFields {
    my %records = @_;

    my %fieldsHash = ();

    for my $k (keys %records) {
        my @fields = keys(%{$records{$k}});
        for my $f (@fields) {
            $fieldsHash{$f} = 0;
        }
    }
    
    return sort keys %fieldsHash;
}

sub checkUpdateFields {
    my $dbHandle = shift;
    my $tableName = shift;
    my %termInfo = @_;

    my $name = $termInfo{"term"};
    my @columnNames = ();
    my @cvValues = ();
    my @dbValues = ();

    # get all columns that are in the CV
    for my $k (keys %termInfo) {
        next if ($k eq "type");
        push @columnNames,  mapTermToColumn($k);
        if (defined $termInfo{$k}) {
            push @cvValues, $termInfo{$k};
        } else {
            push @cvValues, "";
        }
    }

    # get all columns fron the database
    my $cmd = "SELECT " . join(",", @columnNames) . " FROM $tableName WHERE name = " . $dbHandle->quote($name);
    my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
    if ($results) {
        my @row = $results->fetchrow_array();
        for (my $i = 0; $i < scalar(@row); ++$i) {
            # check if database has empty column
            if (not defined $row[$i] or $row[$i] eq "") {
                # check if there is anything in the CV to back fill
                if ($cvValues[$i] ne "") {
                    HgAutomate::verbose(2, "Backfilling field " . $columnNames[$i] . " with " . $cvValues[$i] . " in $name\n");
                    my $updateCmd = "UPDATE $tableName" .
                                    " SET " . $columnNames[$i] . " = " . $dbHandle->quote($cvValues[$i]) .
                                    " WHERE name = " . $dbHandle->quote($name);
                    $dbHandle->execute($updateCmd) or die $dbHandle->errstr;
                    my @warnings = getDbWarnings($dbHandle);
                    if (scalar(@warnings) > 0) {
                        print STDERR "Got a warning when executing $cmd.\n";
                        for my $w (@warnings) {
                            print STDERR "\t$w\n";
                        }
                    }
                }
            } elsif ($cvValues[$i] ne $row[$i]) {    # check to make sure the CV and database tables are equal
                print STDERR "WARNING: term $name in $tableName contrains different $columnNames[$i], $cvValues[$i] != $row[$i]\n";
            }
        }
    } else {
        die $dbHandle->errstr;
    }
}

# add a new term to the database table
sub addNewTerm {
    my $dbHandle = shift;
    my $tableName = shift;
    my %termInfo = @_;

    my @columnNames = ();
    my @columnValues = ();

    # get the defined column and it's value
    for my $k (keys %termInfo) {
        next if ($k eq "type");
        push @columnNames,  mapTermToColumn($k);
        if (defined $termInfo{$k}) {
            push @columnValues, $dbHandle->quote($termInfo{$k});
        } else {
            push @columnValues, $dbHandle->quote("");
        }
    }

    my $cmd = "INSERT INTO $tableName (" . join(",", @columnNames)  . ") VALUES " .
                                     "(" . join(",", @columnValues) . ")";
    my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
    my $warnCount = $dbHandle->quickQuery("SHOW COUNT(*) WARNINGS");
    my @warnings = getDbWarnings($dbHandle);
    if (scalar(@warnings) > 0) {
        print STDERR "Got a warning when executing $cmd.\n";
        for my $w (@warnings) {
            print STDERR "\t$w\n";
        }
    }
}

############################################################################
# Main
############################################################################

my $now = time();
my $wd = cwd();
my $ok = GetOptions("instance=s",
                    "configDir=s",
                    "verbose=i"
                    );
# parse options
usage() if (!$ok);
usage() if (scalar(@ARGV) != 2 and scalar(@ARGV) != 3);
# get options or set defaults
if (not defined $opt_instance) {
    $opt_instance = "prod";
}
my $configPath;
if (defined $opt_configDir) {
    if ($opt_configDir =~ /^\//) {
        $configPath = $opt_configDir;
    } else {
        $configPath = "$wd/$opt_configDir";
    }
} else {
    $configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/";
}
if(!(-d $configPath)) {
    die "configPath '$configPath' is invalid; Can't find the config directory\n";
}
if (not defined $opt_verbose) {
    $opt_verbose = 1;
}
HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");

my $database  = $ARGV[0];
my $tableName = $ARGV[1];
my $termName;
if (scalar(@ARGV) == 3) {
    $termName = $ARGV[2];
} else {
    $termName = $tableName;
}

# read the cv.ra file
my %cvTerms = Encode::getControlledVocab($configPath);

# connect to the database and read the metadata table for the obj
my $dbHandle = HgDb->new(DB => $database);

# get hashes contraining the fields and term names
my %dbFields = map { $_ => 0 } getDbFields($dbHandle, $tableName);
my %cvFields = map { $_ => 0 } getCvFields(%{$cvTerms{$termName}});
delete $cvFields{"type"};

# check if there are any fields in the CV that aren't in the table
my $fatalError = 0;
for my $f (keys %cvFields) {
    if (defined $dbFields{mapTermToColumn($f)}) {
    } else {
        print STDERR "ERROR: Found field $f in CV that is missing from database table.\n";
        $fatalError = 1;
    }
}

# check if there are any fields in the table that aren't in the CV
for my $f (keys %dbFields) {
    if (defined $cvFields{mapColumnToTerm($f)}) {
    } else {
        print STDERR "WARNING: Found field $f in database table that is missing from CV.\n";
    }
}

if ($fatalError) {
    exit 10;
}

# read the names present in the database
my %dbNames = ();
my $results = $dbHandle->execute("SELECT name FROM $tableName") or die $dbHandle->errstr;
if ($results) {
    while(my @row = $results->fetchrow_array()) {
        my $name = $row[0];
        $dbNames{$name} = 0;
    }
}

# read the names from the CV
my %cvNames = ();
my %terms = %{$cvTerms{$termName}};
for my $t (keys %terms) {
    $cvNames{$t} = 0;   # store the name for later diff'ing
    my %termInfo = %{$terms{$t}};
    if (defined $dbNames{$t}) {     # if it's already in the table, 
        HgAutomate::verbose(3, "Term $t already in table $tableName.\n");
        checkUpdateFields($dbHandle, $tableName, %termInfo);
    } else {        # add it
        HgAutomate::verbose(2, "Term $t not in table $tableName, adding it.\n");
        addNewTerm($dbHandle, $tableName, %termInfo);
    }
}

# check for terms in the database that are not in the cv
for my $n (sort keys %dbNames) {
    if (defined $cvNames{$n}) {     # if in cv
        # do nothing
    } else {        # otherwise give a warning
        print STDERR "Term $n found in table $tableName but not in cv.\n";
    }
}
