package Splice;

#################################################################	
# Purpose	: Setting splice boundary parameters		#
#								#
# Usage		: $splice = Splice->new($a, $b, $c, $d, $e)	#
#								#
# $a		: Number of steps for the inference process	#
# $b		: Length of the motifs used in the inference	#
# $c		: Program that performs inference 		#
# $d		: Directory where the genome is			#
# $e		: Number of matches around junction		#
#################################################################


use strict;
use warnings;
use IO::File;
use POSIX qw(tmpnam);
use Boundary;

sub new {
    	my $that = shift;
    	my $class = ref($that) || $that;
                                                                                                                               
    	my ($a, $b, $c, $d, $e) = @_;
                                                                                                                               
    	unless (-e $c) {
        	die "Inference program not found\n";
    	}
	unless (-d $d) {
		die "Genome directory not found\n";
	}

	my $self = {
		nSteps		=> $a,
		mLength		=> $b,
		program		=> $c,
		chromDir	=> $d,
		padding		=> $e
    	};

	bless($self, $class);
     	return $self;
}

sub load_chromosome {
	my $self = shift;
        my $chromN = shift;
	my $chromS = shift;

        my $cfile = $self->{chromDir} . $chromN . '.fa';
                                                                                                   
        open(CHR, $cfile) || die "could not open $cfile : $!\n";
        $_ = <CHR>; #name
        $$chromS = '';
        while(<CHR>) {
                $_ =~ s/\s//g;
                $$chromS .= $_;
        }
        close(CHR);
}


sub set_mapping_boundaries {
        my $key = shift;
	my $fileInfo =shift;
	my $chromS = shift;
	my $boundaryA = shift;
	my $boundaryNumber = shift; 
	my $log = shift;
	#my $pcount = shift;
        my ($l, $l0, $l1, $l2);
        my ($s1, $s2, $s3, $s4, $s5, $s6);
        my $boundary;
        my $seq;
	my $count = 0;
        my $last = $#{$fileInfo->{$key}[3]};
	#print "$last\n";
        if ($fileInfo->{$key}[2] eq '+') {
                $l0 = length($fileInfo->{$key}[3][0]); # $l0 is added at the beginning
                $l2 = length($fileInfo->{$key}[3][$last]); # $l2 is added at the end
                $l1 = $fileInfo->{$key}[4][0]-$l0; # $l1 is the offset
                $seq = lc(substr($$chromS, $l1-1, $fileInfo->{$key}[4][$last]+$l2-$l1+1));
                for (my $i = 0; $i <= $last-1; $i = $i+2) {
                        ($s1, $s2) = ($fileInfo->{$key}[3][$i], $fileInfo->{$key}[3][$i+1]);
			($s5, $s6) = ($fileInfo->{$key}[5][$i], $fileInfo->{$key}[5][$i+1]);
                        $l = $l0 = length($s1);
                        $l2 = length($s2);
                        if ($l0 < $l2) {
                                $l = $l0;
                                $s2 = substr($s2, 0, $l);
				$s6 = substr($s6, 0, $l);
                        }
                        elsif ($l2 < $l0) {
                                $l = $l2;
                                $s1 = substr($s1, -$l);
				$s5 = substr($s5, -$l);
                        }
                        $s3 = substr($seq, $fileInfo->{$key}[4][$i]-$l+1-$l1, 2*$l+1);
                        $s4 = substr($seq, $fileInfo->{$key}[4][$i+1]-$l-1-$l1, 2*$l+1);
                        #print "$l $$boundaryNumber $s1 $s2 $s3 $s4 $fileInfo->{$key}[$#{$fileInfo->{$key}}-1]\n";
                        $boundary = Boundary->new($l, $s1, $s2, $s3, $s4);
                        if (($boundary->check($log) > 0) && ($s1 eq $s5) && ($s2 eq $s6)) {
				$count++;
                                $boundary->set_tetramers();
                                $boundaryA->{$$boundaryNumber} = $boundary;
                                $$boundaryNumber++;
				#print $pcount "$fileInfo->{$key}[$#{$fileInfo->{$key}}]\t$fileInfo->{$key}[0]\t$$boundaryNumber\n";
                        }
                        else {
                                print $log "for $fileInfo->{$key}[0] $s5 $s6\n";
                        }
                }
        }
        else {
                $l0 = length($fileInfo->{$key}[3][0]); # $l0 is added at the beginning
                $l2 = length($fileInfo->{$key}[3][$last]); # $l2 is added at the end
                $l1 = $fileInfo->{$key}[4][$last]-$l2; # $l1 is the offset
                #print "$l0 $l2 $l1\n";
                $seq = lc(reverseC(substr($$chromS, $l1-1, $fileInfo->{$key}[4][0]+$l0+1-$l1)));
                my $offset = $fileInfo->{$key}[4][$last] + $l0 - $l2;
                #print "$seq\n";
                for (my $i = 0; $i <= $last-1; $i = $i+2) {
                        ($s1, $s2) = ($fileInfo->{$key}[3][$i], $fileInfo->{$key}[3][$i+1]);
			($s5, $s6) = ($fileInfo->{$key}[5][$i], $fileInfo->{$key}[5][$i+1]);
                        $l = $l0 = length($s1);
                        $l2 = length($s2);
                        if ($l0 < $l2) {
                                $l = $l0;
                                $s2 = substr($s2, 0, $l);
				$s6 = substr($s6, 0, $l);
                        }
                        elsif ($l2 < $l0) {
                                $l = $l2;
                                $s1 = substr($s1, -$l);
				$s5 = substr($s5, -$l);
                        }
                        $s3 = substr($seq, $offset-$l-$l1+1, 2*$l+1);
                        $offset += $fileInfo->{$key}[4][$i]-$fileInfo->{$key}[4][$i+1];
                        $s4 = substr($seq, $offset-$l-1-$l1, 2*$l+1);
                        if ($i <= $last-2) {
                                $offset += $fileInfo->{$key}[4][$i+1]-$fileInfo->{$key}[4][$i+2];
                        }
                        #print "$l $$boundaryNumber $s1 $s2 $s3 $s4\n";
			$boundary = Boundary->new($l, $s1, $s2, $s3, $s4);
                        if (($boundary->check($log) > 0) && ($s1 eq $s5) && ($s2 eq $s6)) {
				$count++;
                                $boundary->set_tetramers();
                                $boundaryA->{$$boundaryNumber} = $boundary;
                                $$boundaryNumber++;
				#print $pcount "$fileInfo->{$key}[$#{$fileInfo->{$key}}]\t$fileInfo->{$key}[0]\t$$boundaryNumber\n";
                        }
                        else {
                                print $log "in $fileInfo->{$key}[0] $s5 $s6\n";
                        }
                }
        }
	$boundaryA->{0} = $$boundaryNumber-1;
	print $log "Count : $count perfect junctions in $fileInfo->{$key}[0]\n";
	delete($fileInfo->{$key});
}


sub process_boundaries {
	my $self = shift;
	my $fileInfo = shift;
	my $boundaryA = shift;
	my $boundaryNumber = shift;
	my $log = "out1.log";
	#my $perfectSpliceCount = 'perfectSpliceCount_new.txt';
        my $pchromN = '';
        my $chromN;
        my $chromS = '';
        my $k;
	#open(PCOUNT, ">$perfectSpliceCount") || die "Cannot open $perfectSpliceCount : $!\n";
	open(LOG, ">$log") || die "Cannot open $log : $!\n";
        foreach $k (sort keys %$fileInfo) {
                $chromN = $fileInfo->{$k}[1];
                if ($chromN ne $pchromN) {
                        $pchromN = $chromN;
                        load_chromosome($self, $chromN, \$chromS);
                }
                #set_mapping_boundaries($k, $fileInfo, \$chromS, $boundaryA, $boundaryNumber, *LOG, *PCOUNT);
		set_mapping_boundaries($k, $fileInfo, \$chromS, $boundaryA, $boundaryNumber, *LOG);
        }
	close(LOG);
	#close(PCOUNT);
}

sub reverseC {
    my $sseq = shift;
    $sseq =~ tr/[ACGTNRYWSMKBDHVacgtnrywsmkbdhv]/[TGCANYRWSKMVDHBtgcanyrwskmvdhb]/;
    return reverse($sseq);
}


sub set {
	my $self = shift;
  	my $fileInfo = shift;
	my $chromS;
	my $numberOfRuns = 20;
	my ($k, $status);
	my $i = 1;
	my $boundaryNumber = 1;
	my %boundaryA = ();
	process_boundaries($self, $fileInfo, \%boundaryA, \$boundaryNumber);	
	my ($infile, $outfile, $ifh, $ofh);
	print "Starting inference :\n";	
	while ($i <= $numberOfRuns) {

		do { 
			$infile = tmpnam();
		} until $ifh = IO::File->new($infile, O_RDWR|O_CREAT|O_EXCL);
		END { if (defined($infile) && -e $infile) {unlink($infile);} }

		do {
                	$outfile = tmpnam();
        	} until $ofh = IO::File->new($outfile, O_RDWR|O_CREAT|O_EXCL);
        	END { if (defined($outfile) && -e $outfile) {unlink($outfile);}}
	
		foreach $k (sort {$a <=> $b} keys %boundaryA) {
	  		if ($k == 0) {
				print $ifh "$boundaryA{$k}\n";
			}
			else {	
                		$boundaryA{$k}->print($ifh);
			}
        	}
		$ifh->close(); 
        	$ofh->close();
	
		print "command : $self->{program} $infile $self->{nSteps} $self->{mLength} $outfile\n";
        	if (($status = system("$self->{program} $infile $self->{nSteps} $self->{mLength} $outfile >> out.log") >> 8) != 0) {
			die "Inference Program fails\n";
		}
		collect_splice($self, $outfile);
		$i++;
	}
	mean_splice($self, $numberOfRuns);
}	


sub collect_splice {
   	my $self = shift;
	my $file = shift;
	my @line;

	open(IN, $file) || die "Could not open $file : $!\n";
	while (<IN>) {
		if (/^\s*$/) {
			next;
		}
		chomp;
		@line = split(" ", $_);
		#print "Add : $line[0] $line[1]\n";
		if (exists($self->{lc($line[0])})) {
			$self->{lc($line[0])} += $line[1];
		}
		else {
			$self->{lc($line[0])} = $line[1];
		}
	}
	close(IN);
}

sub new_unbiased {
	my $that = shift;
	my $class = ref($that) || $that;

	my $gtag = shift;
	my $gcag = shift;
	my $atac = shift;

	if ($gtag <= 0.0 || $gcag <= 0.0 || $atac <= 0) {
		die "Splice::new_unbiased -> wrong input values : $gtag-$gcag-$atac\n";
	}
	
	my $self = {
	};

	my $rval = 1.0 - $gtag - $gcag - $atac;
	my $sval = $rval/253;
	my @tr = ('t', 'c', 'a', 'g');	
	my ($ro, $ri, $li, $lo);
	my $k;
        for (my $i = 0; $i < 256; $i++) {
		$ro = $i%4;
                $ri = (int($i/4)) % 4;
                $li = (int($i/16)) %4;
                $lo = int($i/64);
                $k = "$tr[$lo]$tr[$li]$tr[$ri]$tr[$ro]";
		if ($k eq 'gtag') {
			$self->{$k} = $gtag;
		}
		elsif ($k eq 'gcag') {
			$self->{$k} = $gcag;
		}
		elsif ($k eq 'atac') {
			$self->{$k} = $atac;
		}
		else {
			$self->{$k} = $sval;
		}
	}
	bless($self, $class);
	return $self;
}	
	 

sub mean_splice {
        my $self = shift;
        my $number = shift;
	# Sometimes the sum of all splice scores equals to 1
	# and then there is no room left to assign scores to
	# the other splice boundaries
	# To avoid that we normalize the scores by 1+min/2
	my $min = 1.0;
	my $sum = 0;
               
	if ($number == 0) {
		die "Bad number of runs\n";
	}
	my @tr = ('t', 'c', 'a', 'g');
	my ($ro, $ri, $li, $lo);
	my $k;
	for (my $i = 0; $i < 256; $i++) {
         	$ro = $i%4;
         	$ri = (int($i/4)) % 4;
         	$li = (int($i/16)) %4;
         	$lo = int($i/64);
         	$k = "$tr[$lo]$tr[$li]$tr[$ri]$tr[$ro]";
		if (exists($self->{$k})) {
			$self->{$k} = $self->{$k}/$number;
			$sum += $self->{$k};
			if ($self->{$k} < $min) {
				$min = $self->{$k};
			} 
         	}
        }
	if ($sum >= 1.0) {
		# Let's normalize
		$min = $min/2;
		for (my $i = 0; $i < 256; $i++) {
                	$ro = $i%4;
                	$ri = (int($i/4)) % 4;
                	$li = (int($i/16)) %4;
                	$lo = int($i/64);
                	$k = "$tr[$lo]$tr[$li]$tr[$ri]$tr[$ro]";
                	if (exists($self->{$k})) {
                        	$self->{$k} = $self->{$k}/($sum+$min);
                        }
                }
        }
}



sub print {
    my $self = shift;

    my @tr = ('t', 'c', 'a', 'g');
    my ($ro, $ri, $li, $lo);
    my $k;

    for (my $i = 0; $i < 256; $i++) {
         $ro = $i%4;
         $ri = (int($i/4)) % 4;
         $li = (int($i/16)) %4;
         $lo = int($i/64);
         $k = "$tr[$lo]$tr[$li]$tr[$ri]$tr[$ro]";
         print "$k : $self->{$k}\n";
    }
}

sub print_data {
    my $self = shift;
    my $out = shift;

    my @tr = ('t', 'c', 'a', 'g');
    my ($ro, $ri, $li, $lo);
    my $k;
                                                                                                                               
    for (my $i = 0; $i < 256; $i++) {
         $ro = $i%4;
         $ri = (int($i/4)) % 4;
         $li = (int($i/16)) %4;
         $lo = int($i/64);
         $k = "$tr[$lo]$tr[$li]$tr[$ri]$tr[$ro]";
	 if (exists($self->{$k})) {
         	printf $out "%-18s %.10f\n", "score_splice_$k", $self->{$k};
	 }
	 else {
		print $out "# score_splice_$k\n";
	 }
    }
    print $out "\n\n\n";
}


1;
