#!/usr/local/bin/perl
# File: combineSeqPrimerPos
# Author: Terry Furey
# Date: 10/2001
# Description:  Combines placemnets of STS by full sequence and primers
# into a single, unified file

# $Id: combineSeqPrimerPos,v 1.2 2006/09/09 22:30:57 hiram Exp $

# Usage message
if ($#ARGV < 1) {
  print STDERR "USAGE: combineSeqPrimerPos <seq file> <primers file>\n";
  exit(1);
}

# Read in parameters
my $seq = shift(@ARGV);
if (!-e "$seq") { die("Can not open $seq: $!"); }
my $primers = shift(@ARGV);
if (!-e"$primers") { die("Can not open $primers: $!"); }

# Set maximum number of placements to consider
my $MAX_PLACE = 50;

# Make sure they are sorted by UCSC ID number
open(TEMP, ">$$.header") or die "Can not write to $$.header: $!";
print TEMP "chr\tstart\tend\tidentNo\tacc\n";
print TEMP "25S\t10N\t10N\t10N\t25S\n";
close(TEMP);
`cat $$.header $seq | sorttbl identNo chr start | headchg -del > $$.seq`;
`cat $$.header $primers | sorttbl identNo chr start | headchg -del > $$.primers`;
open(SEQ, "<$$.seq") or die "Can not read from $$.seq: $!";
open(PRIMER, "<$$.primers") or die "Can not read from $$.primers: $!";

# Read in sequence and primers files and create initial output file
open(TEMP, ">$$.temp") or die "Can not write to $$.temp: $!";
print TEMP "chr\tstart\tend\tidentNo\tplace\tacc\totherAcc\n";
print TEMP "15S\t10N\t10N\t10N\t5N\t15S\t666S\n";
my $currid = 1;
my $seqid = 1;
my $schr = "";
my $primid = 1;
my $pchr = "";
my $done = 0;
my $sstart = 0;
my $send = 0;
my $sacc = 0;
while (!$done) {
  # Record sequence info, if matches current ID
  my @seqchr;
  my @seqstart;
  my @seqend;
  my @seqacc;
  if (($seqid == $currid) && ($schr)) {
    @seqchr = (@seqchr, $schr);
    @seqstart = (@seqstart, $sstart);
    @seqend = (@seqend, $send);
    @seqacc = (@seqacc, $sacc);
  }
  # Record primers info, if matches current ID
  my @primchr;
  my @primstart;
  my @primend;
  my @primacc;
  if (($primid == $currid) && ($pchr)) {
    @primchr = (@primchr, $pchr);
    @primstart = (@primstart, $pstart);
    @primend = (@primend, $pend);
    @primacc = (@primacc, $pacc);
  }

  # Get all sequence lines for this ID
  while ($seqid == $currid) {
    if (my $line = <SEQ>) {
      chomp($line);
      ($schr, $sstart, $send, $seqid, $scov, $sid, $sacc) = split("\t",$line);
      if ($seqid == $currid) {
	@seqchr = (@seqchr, $schr);
	@seqstart = (@seqstart, $sstart);
	@seqend = (@seqend, $send);
	@seqacc = (@seqacc, $sacc);
      }
    } else {
      $seqid = -1;
    }
  }

  # Get all primers lines for this ID
  while ($primid == $currid) {
    if (my $line = <PRIMER>) {
      chomp($line);
      ($pchr, $pstart, $pend, $primid, $pcov, $pid, $pacc) = split("\t",$line);
      if ($primid == $currid) {
	@primchr = (@primchr, $pchr);
	@primstart = (@primstart, $pstart);
	@primend = (@primend, $pend);
	@primacc = (@primacc, $pacc);
      }
    } else {
      $primid = -1;
    }
  }

  # Combine the lines for this ID from each file
  my @chr;
  my @start;
  my @end;
  my @acc;
  if (($#primchr >=0) || ($#seqchr >=0)) {
    my $sindex = 0;
    my $pindex = 0;
    # Process all lines
    while ((($sindex <= $#seqchr) || ($pindex <= $#primchr)) && ($#chr <= $MAX_PLACE)) {
      # If no sequence placements left
      if ($sindex > $#seqchr) {
	# Print out primer placements if no sequence placements
	if ($#seqchr < 0) {
	  while (($pindex <= $#primchr)  && ($#chr <= $MAX_PLACE)) {
	    @chr = (@chr, $primchr[$pindex]);
	    @start = (@start, $primstart[$pindex]);
	    @end = (@end, $primend[$pindex]);
	    @acc = (@acc, $primacc[$pindex]);
	    $pindex++;
	  }
	} else {
	  $pindex = $#primchr + 1;
	}

      # If no primer placements left, print out remaining sequence placements
      } elsif ($pindex > $#primchr) {
	while (($sindex <= $#seqchr)  && ($#chr <= $MAX_PLACE)) {
	  @chr = (@chr, $seqchr[$sindex]);
	  @start = (@start, $seqstart[$sindex]);
	  @end = (@end, $seqend[$sindex]);
	  @acc = (@acc, $seqacc[$sindex]);
	  $sindex ++;
	}
		
      # Combine placements if they overlap
      } else {
	if ($seqchr[$sindex] eq $primchr[$pindex]) {
	  if ($seqend[$sindex] < $primstart[$pindex]) {
	    @chr = (@chr, $seqchr[$sindex]);
	    @start = (@start, $seqstart[$sindex]);
	    @end = (@end, $seqend[$sindex]);
	    @acc = (@acc, $seqacc[$sindex]);
	    $sindex ++;
	  } elsif ($primend[$pindex] < $seqstart[$sindex]) {
	    $pindex++;
	  } else {
	    @chr = (@chr, $seqchr[$sindex]);
	    if ($seqstart[$sindex] < $primstart[$pindex]) {
	      @start = (@start, $seqstart[$sindex]);
	    } else {
	      @start = (@start, $primstart[$pindex]);
	    } 
	    if ($seqend[$sindex] > $primend[$pindex]) {
	      @end = (@end, $seqend[$sindex]);
	    } else {
	      @end = (@end, $primend[$pindex]);
	    }
	    @acc = (@acc, $seqacc[$sindex]);
	    $sindex++;
	    $pindex++;
	  }
	} else {
	  if ($seqchr[$sindex] lt $primchr[$pindex]) {
	    @chr = (@chr, $seqchr[$sindex]);
	    @start = (@start, $seqstart[$sindex]);
	    @end = (@end, $seqend[$sindex]);
	    @acc = (@acc, $seqacc[$sindex]);
	    $sindex ++;
	  } elsif ($primchr[$pindex] lt $seqchr[$sindex]) {
	    $pindex++;
	  }
	}
      }	
    }
    $place = $#chr + 1;
    # Compile a list of other accessions this marker is placed on
    if ($place <= $MAX_PLACE) {
      for (my $i = 0; $i < $place; $i++) {
	my $oacc = "";
	for (my $j = 0; $j <= $#acc; $j++) {
	  if ($acc[$i] ne $acc[$j]) {
	    if ($oacc) {
	      $oacc .= ";$acc[$j]";
	    } else {
	      $oacc = $acc[$j];
	    }
	  }
	}
	if (!$oacc) {
	  $oacc = "-";
	}
	print TEMP "$chr[$i]\t$start[$i]\t$end[$i]\t$currid\t$place\t$acc[$i]\t$oacc\n";
      }
    }
  }
  $currid++;
  if (($seqid == -1) && ($primid == -1)) {
    $done = 1;
  }
}
close(SEQ);
close(PRIMER);
close(TEMP);

# Sort into final output file
`sorttbl chr start identNo < $$.temp > stsMarkers_pos.rdb`;
`rm $$.*`;
