#!/usr/bin/perl

use FileHandle;
use Getopt::Long;

# Parse the command line. No destination variables are passed, so Getopt::Long
# stores each option in a package variable named $opt_<option> ($opt_html,
# $opt_chrom, $opt_out); the code further down reads those.
# GetOptions reports unknown or malformed options itself and then returns
# false; stop in that case instead of running with a partly parsed command line.
GetOptions("html", "chrom=s", "out=s")
    or die "Usage: $0 [-html] -chrom=<fasta file name> [-out=<output file name>]\n";

#################################################################################
# Perl script to read the files output by Stubb and extract the free energy peaks
# along with their dictionaries.
#
# The command-line options are 
# -html    : Output of this program is in html format (plaintext by default)
# -chrom=c : c is the name of the (fasta) file on which Stubb was run.
# -out=fnm : fnm is the name of the (html or text) file where the output will go. 
#            The file will be placed in the directory defined by $data_path below.
#
# Please make necessary changes to the file-names hard-coded in this script.
#
# Written by Saurabh Sinha, Oct 2003.
# Derived from a similar perl script written by N. Rajewsky, Feb 2002.
##################################################################################


######### The following variables may be changed by the user as needed #######################
##############################################################################################

# Installation directory of Stubb; the sample paths below and the helper
# tools used by this script are all located relative to it.
$ROOTDIR          = "/replace/this/with/installation/directory/"; 

# the wtmx file used by Stubb; it is passed to the profile parser as its
# -dictionary argument
$wtm_table	  = $ROOTDIR. "/sample/gap_wtmx";	
# where Stubb output resides: the <chrom>.fen, .dict, .prof and .corr files
# are read from this directory and the report file is written into it
$data_path	  = $ROOTDIR. "/sample/test_output/single_species/";
# where graphical files output by this script will reside (the per-peak
# .gff and .ps files and the <chrom>_mod.gff module list)
# this directory must exist
$grph_output_path = $ROOTDIR. "/sample/test_output/single_species/gr_output/";
# relative path of above directory; relative to where the output file will go ($data_path)
# (used only to build the links in the html report)
$rel_grph_output_path  = "gr_output/"; 

# set 1 if the Stubb output being processed was for stubbh01f (i.e., with correlation parameters)
# When set, the <chrom>.corr file is read and the correlation effect is
# printed for every reported peak.
$Correlations     = 0; 
# set 1 to report peaks in descending order of correlation effect
# (Acts only if $Correlations is also set to 1; otherwise peaks are
# reported in descending order of free energy)
$sort_by_correffect = 1;
# set 1 if the Stubb output being processed was for stubbms (i.e., two species Stubb)
$MultipleSpecies  = 0;

# Only those free energy peaks satisfying certain criteria on
# their dictionaries will be reported. 

# How many TFs (at least) must have sites in the peak
# (the background is counted as a factor)
$Min_number_factors        = 3; 
# a TF is counted as having sites in a module if the
# module's dictionary entry for that TF is at or above this
# threshold
$Factor_min_occ            = 0.5; 

# Only peaks at or above this threshold will be reported
$Free_energy_cutoff        = 10.0; 

# the window size and shift parameters used when Stubb was run
# $Window_size is also the extent of each reported module and the distance
# within which neighbouring peaks are merged into one local maximum.
# $Window_shift is used to find a window's line in the free energy file
# (line index = window position / $Window_shift).
$Window_size               = 500; 
$Window_shift              = 100; 
# cutoff for extracting gff entry when parsing profiles
# this must be greater than or equal to the "-ot " parameter 
# value used when Stubb was run
# (passed to the profile parser as its -cutoff argument)
$Binding_site_cutoff       = 0.3; 

# set 0 to not run gff2ps (no PostScript files are produced then)
$Graphics         = 1; 

######## No more variables need to be changed by user ########################################
##############################################################################################

# External helper tools used by this program. Each path is joined to $ROOTDIR
# with an explicit "/", exactly like the sample paths configured above, so the
# tools are still found when $ROOTDIR is set without a trailing slash (a doubled
# slash in a path is harmless).
$gff2ps_tool	  = $ROOTDIR. "/helpers/gff2ps";      # translate gff to ps
$gff_config_file  = $ROOTDIR. "/helpers/Config_file"; # used by gff2ps
$profile_parser   = $ROOTDIR. "/helpers/prof2gff.pl"; # parses profiles into gff

# read in the command-line arguments

# -html switches the report from plain text to html
$html = defined($opt_html) ? 1 : 0;

# -chrom is mandatory: it names the sequence file Stubb was run on and is the
# common prefix of all Stubb output files read below
if (defined($opt_chrom)) {
    $chrom = $opt_chrom;
}
else {
    die "Error: specify the name of the file on which Stubb was run\n";
}

# -out names the report file; "peaks.html" is the default (also in plain-text
# mode). The report is always placed in $data_path.
$outfile = defined($opt_out) ? $opt_out : "peaks.html";
$outfile = $data_path . $outfile;

# Three-argument open takes the file name literally (no mode characters or
# surrounding whitespace are interpreted); $! tells the user why it failed.
# The handle stays the bareword OUT because print_dict writes to it by name.
open(OUT, '>', $outfile) or die "Couldnt open $outfile for writing: $!\n";

# the Stubb output files that belong to $chrom
$energy         = $data_path . $chrom . ".fen";
$dictionary     = $data_path . $chrom . ".dict";
$profile        = $data_path . $chrom . ".prof";
$corr           = $data_path . $chrom . ".corr";

## print html header

if ($html){print OUT "<html><PRE>\n";}

## print params (as "#" comment lines at the top of the report)

print OUT "# Min_number_factors $Min_number_factors\n";
print OUT "# Free_energy_cutoff $Free_energy_cutoff Factor_min_occ $Factor_min_occ\n";
print OUT "# Window_shift $Window_shift Window_size $Window_size\n";


#### read free energy  #####################

# Slurp the whole free energy file into @all, one line per element. Later
# lookups take element int(position / $Window_shift) and split it on tabs
# into (window position, free energy).
# Three-argument open takes the path literally; $! reports why it failed.
open(IN, '<', $energy) or die "could not open $energy: $!\n";
warn  "# reading $energy\n";
@all = <IN>;
warn  "# finished reading $energy\n";
close IN;

#### read in dictionary #####################

# The dictionary file is read as a series of records. A record starts at a
# ">name" header line and runs up to the next line that begins with "<".
# Inside a record the "Position: N" line gives the window position; every
# other line is kept verbatim and split on whitespace into (motif, prob, occ).
# Lines whose occ is at least $Factor_min_occ are counted in $count.
#
# A window is stored (its lines in %hash, its free energy in %fe, both keyed by
# window position) when $count reaches $Min_number_factors and the window's
# free energy is at least $Free_energy_cutoff. Because a window is only
# complete once the next reference-species header is seen, it is evaluated
# there, and the last window is evaluated once more after the loop.
#
# With $MultipleSpecies set, the name in the first header identifies the
# reference species. Records of the other species do not start a new window:
# their lines are appended to the current window's lines and their factors are
# added to the same $count.

open(DICT, '<', $dictionary) or die "Could not open $dictionary: $!\n";
warn  "# reading $dictionary\n";

if ($MultipleSpecies==1) {
    $seq1name = '';
}
$position = -1;              # -1 means: no window has been read yet
$reading_ref_species = 1;

while (<DICT>){
    if (/^>(\S+)/){
        $chr = $1;
        if ($MultipleSpecies == 1) {
            if ($seq1name eq '') {
                # the very first record names the reference species
                $seq1name = $chr;
                $reading_ref_species = 1;
            }
            elsif ($seq1name =~ /\Q$chr\E/) {
                # \Q...\E makes the sequence name match literally. Unquoted,
                # a name such as "gi|123|ref" would be read as a regex
                # alternation and could classify the wrong species as the
                # reference, or abort on an invalid pattern.
                $reading_ref_species = 1;
            }
            else {
                $reading_ref_species = 0;
            }
        }
        # a new reference record begins: evaluate the window collected so far
        if ($reading_ref_species==1 and $position>=0) {
            if ($count >= $Min_number_factors){
                $all_index = int($position/$Window_shift);
                ($pos, $free_energy) = split(/\t/, $all[$all_index]);
                if ($pos ne $position) {die "# window_pos $position has no free energy? (pos = $pos)\n";}
                if ($free_energy >= $Free_energy_cutoff) {
                    push (@{$hash{$position}}, @x);
                    $fe{$position} = $free_energy;
                }
            }
        }
        # start collecting a fresh window (reference species only)
        if ($reading_ref_species==1) {
            @x = ();
            $count = 0;
        }
        # read the body of this record
        while(<DICT>){
            last if /^</;
            if (/Position: (\d+)/){
                if ($reading_ref_species==1) {
                    $position = $1;
                }
                else {
                    # for the other species drop everything from
                    # "Word_av_length" onwards on the position line
                    s/Word_av_length.*//g;
                }
                push (@x, "$chr\t$_");
                next;
            }
            push (@x, $_);
            ($motif, $prob, $occ) = split(/\s+/, $_);
            next if ($occ < $Factor_min_occ);
            $count ++;
        }
    }
}
# evaluate the last window of the file
if ($position >= 0) {
    if ($count >= $Min_number_factors){
        $all_index = int($position/$Window_shift);
        ($pos, $free_energy) = split(/\t/, $all[$all_index]);
        if ($pos ne $position) {die "# window_pos $position has no free energy? (pos = $pos)\n";}
        if ($free_energy >= $Free_energy_cutoff) {
            push (@{$hash{$position}}, @x);
            warn "found entry at $position\n";
            $fe{$position} = $free_energy;
        }
    }
}
close DICT;
	

#### read in correlations #####################

# Only done when $Correlations is set. Each record of the correlation file
# starts with a ">Sequence <name> ... Position <n>" line and ends at the next
# line containing "<". Lines containing ":" are skipped; a line of the form
# "<old> -> <new>" gives the two free energies whose difference (new - old) is
# stored in %fendiff under the window position.

if ($Correlations == 1) {
    open(CORR, '<', $corr) or die "Could not open $corr: $!\n";
    warn  "# reading $corr\n";
    while (<CORR>){
        next unless /^>Sequence \S+/;

        # All three must be declared inside parentheses: "my $a, $b, $c"
        # declares only $a and leaves the others as globals, which would keep
        # the values of the previous record.
        my ($position, $oldfen, $newfen);
        if (/Position (\d+)/) {
            $position = $1;
        }
        else {
            die "Position not found where expected in correlation file";
        }

        # scan the body of this record for the "old -> new" energies
        while (<CORR>){
            next if /:/;
            last if /</;
            if (/(\d+\.\d+) -> (\d+\.\d+)/) {
                $oldfen = $1;
                $newfen = $2;
            }
        }

        # the window must also be present in the free energy file
        $all_index = int($position/$Window_shift);
        ($pos, $free_energy) = split(/\t/, $all[$all_index]);
        if ($pos ne $position) {die "# window_pos $position has no free energy? (pos = $pos)\n";}

        # a record without an energy pair gets no correlation effect
        # (instead of silently inheriting the previous record's values)
        if (!defined($oldfen) or !defined($newfen)) {
            warn "# no free energy change found for position $position in $corr\n";
            next;
        }
        $fendiff{$position} = $newfen-$oldfen;
    }
    close CORR;
}
   
#############################################
# get local maxima
#
# Walk through the stored window positions in ascending order. Starting from
# a position, keep absorbing the following positions for as long as they lie
# less than $Window_size beyond the best position found so far; a later
# position with an equal or higher free energy becomes the new best. The best
# position of each such run is recorded, and the scan resumes at the first
# position that was not absorbed.

@po = sort { $a <=> $b } keys %hash;
@local_max_pos = ();

$i = 0;
while ($i <= $#po) {
    $max_pos   = $po[$i];
    $max_value = $fe{$max_pos};

    $j = $i + 1;
    while ($j <= $#po) {
        $curr_pos = $po[$j];
        if ($curr_pos - $max_pos >= $Window_size) {
            last;
        }
        if ($fe{$curr_pos} >= $max_value) {
            $max_value = $fe{$curr_pos};
            $max_pos   = $curr_pos;
        }
        $j++;
    }

    push (@local_max_pos, $max_pos);
    $i = $j;
}
warn "# found ", scalar(@local_max_pos), " local maxima over threshold $Free_energy_cutoff\n";

##############################################
# Report every local maximum: write its dictionary to the report, append it
# to the module gff file, extract its binding sites into a per-peak gff file
# and (optionally) render that file to PostScript.

if ($Correlations==0 or $sort_by_correffect==0) {
    # sort by free energy
    @sorted_local_max_pos = sort { $fe{$b} <=> $fe{$a} } @local_max_pos;
}
else {
    # sort by correlation effect
    @sorted_local_max_pos = sort { $fendiff{$b} <=> $fendiff{$a} } @local_max_pos;
}

$rk = 0;
foreach my $peak (@sorted_local_max_pos){
    $rk ++;

    # print info about hit
    print_dict($rk, $peak, $fe{$peak}, $fendiff{$peak}, @{$hash{$peak}});

    # The free energy still carries the newline of the line it was read from.
    # chomp() returns the number of characters it removed, not the string, so
    # strip the newline first and use the value afterwards.
    chomp($fe{$peak});

    #output module to module gff
    $end = $peak + $Window_size-1;
    $gff_modfile = $grph_output_path . "$chrom" . "_mod.gff";
    $mod = join("\t", ($chrom,"Stubb","module",$peak,$end,$fe{$peak},".",".","module"));
    warn "$gff_modfile\t$mod\n";
    # opened in append mode: one line is added per reported peak
    if (open(MODGFF, '>>', $gff_modfile)) {
        print MODGFF "$mod\n";
        close(MODGFF) or warn "Error while closing $gff_modfile: $!\n";
    }
    else {
        warn "Could not open $gff_modfile for appending: $!\n";
    }

    # make gff file
    $gff_outputfile = $grph_output_path . "$chrom" . "_$peak" . ".gff";
    system ("cat $profile | $profile_parser -window $peak -cutoff $Binding_site_cutoff -dictionary $wtm_table >$gff_outputfile") == 0
        or warn "Could not create $gff_outputfile (exit status $?)\n";

    # make postscript file
    if ($Graphics==1){
        $end = $peak + $Window_size;
        $title = "\"Chromosome $chrom, Position: $peak, Free energy: $fe{$peak}\"";
        $ps_out = $grph_output_path . "$chrom" . "_$peak" . ".ps";
        # gff2ps gets a copy of its config file with the zoom range of this
        # peak appended; _tmp_ (in the current directory) holds that extra line
        system("echo \"zoom=$peak..$end\" > _tmp_") == 0
            or warn "Could not write the zoom setting to _tmp_ (exit status $?)\n";
        system("cat $gff_config_file _tmp_ > $gff_config_file".".mod") == 0
            or warn "Could not create $gff_config_file.mod (exit status $?)\n";
        system ("cat $gff_outputfile | $gff2ps_tool -O -o -C $gff_config_file".".mod -p -s a5 -v  -T $title -N $Window_size >$ps_out") == 0
            or warn "Could not create $ps_out (exit status $?)\n";
        system("rm  $gff_config_file".".mod _tmp_") == 0
            or warn "Could not remove temporary files (exit status $?)\n";
    }
}

if ($html){print OUT "</PRE></html>";}

# closing explicitly surfaces buffered write errors on the report
close(OUT) or warn "Error while closing $outfile: $!\n";

##############################################
##############################################

sub print_dict
{
    # Write one peak record to the OUT filehandle.
    #
    # Arguments:
    #   $rk               - rank of the peak in the report
    #   $absolut_position - window position of the peak; used to build the
    #                       names of the linked .gff and .ps files (html only)
    #   $en               - score of the peak; a trailing newline is removed
    #   $fendiff          - correlation effect; printed only when the global
    #                       $Correlations is 1
    #   @x                - the dictionary lines of the peak, printed verbatim
    #
    # The record is framed by a ">" line and a "<" line. Reads the globals
    # $html, $chrom, $rel_grph_output_path and $Correlations. In html mode the
    # link targets are stored in the package variables $file and $psfile.
    my ($rk, $absolut_position, $en, $fendiff, @x) = @_;
    chomp $en;

    my $score_text = sprintf("%.4f", $en);
    my $banner;

    print OUT ">\n";

    if ($html) {
        # html mode: the rank links to the gff file, followed by a link to
        # the PostScript rendering
        $file   = $rel_grph_output_path . "$chrom" . "_$absolut_position" . ".gff";
        $psfile = $rel_grph_output_path . "$chrom" . "_$absolut_position" . ".ps";
        $banner = "*********************** <A HREF=\"$file\">Rank $rk</A> [Score $score_text] <A HREF=\"$psfile\">PostScript Profile</A>";
    }
    else {
        $banner = "*********************** Rank $rk [Score $score_text]";
    }

    if ($Correlations == 1) {
        $banner .= "[Correlation effect " . sprintf("%.4f", $fendiff) . "]";
    }

    print OUT $banner . " *******************\n";
    print OUT join('', @x);
    print OUT "<\n";

    return;
}

