#!/import/bc2/soft/bin/perl5/perl 

#########################################################################
# Purpose : generate spa_parameters file				#
#									#
# Usage : ./generateParameters.pl <config ini file> <output file>	#
#									#
#########################################################################

use Config::IniFiles;
use Insertion;
use Deletion;
use Intron;
use Splice;
use Match;
use Statistics;
use strict;
use warnings;

if (scalar(@ARGV) != 2) {
    die "Usage : generateParameters.pl <configIniFile> <output parameter file>\n";
}

unless ( -e $ARGV[0]) {
         die "Config file not found\n";
}

my $cfg = Config::IniFiles->new( -file => "$ARGV[0]");

my $Nbin		= 100;
my $splSpaD             = $cfg->val('dir', 'spaDir');
my $splchromD           = $cfg->val('dir', 'chromDir');

my $splSteps		= $cfg->val('splice', 'number_of_steps');
my $splMotif		= $cfg->val('splice', 'motif_length');
my $splProg		= $cfg->val('splice', 'program');
my $splPadding		= $cfg->val('splice', 'padding');
my $dradius		= $cfg->val('params', 'define_radius');
my $llimit		= $cfg->val('params', 'retile_genomic_gap_lower_limit');
my $ulimit		= $cfg->val('params', 'retile_genomic_gap_upper_limit');
my $poly_a		= $cfg->val('params', 'retile_poly_a_percentage');
my $rtilesize		= $cfg->val('params', 'retile_tilesize');
my $rtileoverlay	= $cfg->val('params', 'retile_tileoverlay');
my $rextension		= $cfg->val('params', 'retile_exon_extension');

my $init_cell_number_cutoff 		= $cfg->val('params', 'init_cell_number_cutoff');
my $extension_cell_number_cutoff 	= $cfg->val('params', 'extension_cell_number_cutoff');
my $inside_retiling_cell_number_cutoff	= $cfg->val('params', 'inside_retiling_cell_number_cutoff'); 
my $level = $cfg->val('options', 'executionLevel');

# Unbiased parameters
my $unbiased_mm;
my $gtag_score;
my $gcag_score;
my $atac_score;
my $eps_insertion;
my $eps_deletion;
my $eps_intron;
my $shortest_intron_length;
my $longest_intron_length;
my $mean_length_del_ins;
my $min_diff_misorientation;

# stat variables
my $match;
my $deletion;
my $insertion;
my $intron;
my $splice;
my $statistics;

if ($level == 0) {
	$unbiased_mm = $cfg->val('unbiased', 'mismatch');
        $gtag_score = $cfg->val('unbiased', 'gtag_score');
        $gcag_score = $cfg->val('unbiased', 'gcag_score');
        $atac_score = $cfg->val('unbiased', 'atac_score');
        $eps_insertion = $cfg->val('unbiased', 'eps_insertion');
        $eps_deletion = $cfg->val('unbiased', 'eps_deletion');
        $eps_intron = $cfg->val('unbiased', 'eps_intron');
        $mean_length_del_ins = $cfg->val('unbiased', 'mean_length_del_ins');
        $shortest_intron_length = $cfg->val('unbiased', 'shortest_intron_length');
        $longest_intron_length = $cfg->val('unbiased', 'longest_intron_length');
        $min_diff_misorientation = $cfg->val('unbiased', 'min_diff_misorientation');
	set_unbiased_parameters();	
}
else {
	set_parameters();
}

sub set_parameters {
	print "Loading Statistics ... \n";
	$statistics = Statistics->new();
	$statistics->load($splSpaD, $splPadding);
	$statistics->set_eRate();

	print "Loading Insertion ...\n";
	$insertion = Insertion->new($statistics->{insertion});
	$statistics->free_insertion();
                                                                                                                                                         
	print "Loading Deletion ...\n";
	$deletion = Deletion->new($statistics->{deletion});
	$statistics->free_deletion();
                                                                                                                                                         
	print "Loading Intron ...\n";
	$intron = Intron->new($statistics->{intron}, $Nbin);
	$statistics->free_intron();
                                                                                                                                                         
	print "Loading Junction ...\n";
	$splice = Splice->new($splSteps, $splMotif, $splProg, $splchromD, $splPadding);
	$splice->set($statistics->{fileInfo});
                                                                                                                                                         
	print "Loading Matches ...\n";
	$match  = Match->new($statistics->{lmismatchScore});
	print "Print data out ...\n";
                                                                                                                                                         
	open(OUT, ">$ARGV[1]") || die "Could not open $ARGV[1] : $!\n";       
	print_parameters(*OUT);
	close(OUT);
}


sub set_unbiased_parameters {
	open(OUT, ">$ARGV[1]") || die "Could not open $ARGV[1] : $!\n";
	print_unbiased_parameter(*OUT);
	close(OUT);
}

sub print_unbiased_parameter {
	my $out = shift;
	print_top($out);
	print_diagonal_extension_parameter($out);
	print_cutoff_parameters($out);
	print_unbiased_parameter_scores($out);
	print_end($out);
}

sub print_parameters {
	my $out = shift;
	print_top($out);
	print_diagonal_extension_parameter($out);
	print_cutoff_parameters($out);
	print_parameter_scores($out);
	print_end($out);
}

sub print_top {
    my $out = shift;
    print $out "######################################################################\n";
    print $out "# SPA PARAMETER FILE\n";
    print $out "######################################################################\n\n";
    print $out "######## GENERAL REMARKS ###############################\n";
    print $out "# Spa uses a probabilistic Bayesian models to score alignments\n"; 
    print $out "# Dynamic programming is used to find the optimal alignment under this model\n";
    print $out "# To keep running times feasible we use the BLAT gfserver to identify genomic loci\n";
    print $out "# and only try to align those loci.\n";
    print $out "# In addition, for each locus, we first find regions of homology between clone and genome\n";
    print $out "# and only do the dynamic programming for a subset of positions in the dynamic programming matrix\n";
    print $out "# that are in or near such regions of homology. We call these 'defined positions' of the dynamic\n";
    print $out "# programming matrix.\n";
    print $out "# The parameters in this file control\n";
    print $out "# 1. the specifics of the Bayesian model (i.e. likelihoods of various gene structures and sequencing errors)\n";
    print $out "# 2. The heuristics that determine defined positions in the dynamic programming matrix (which trade-off running\n";
    print $out "#    time against alignment accuracy.\n\n";
}

sub print_diagonal_extension_parameter {
    my $out = shift;
    print $out "######################################################################\n";
    print $out "# Diagonal Extension Parameters\n\n";
    print $out "define_radius $dradius\n";
    print $out "# This parameter controls how much 'fuzz' is added to the end of diagonals in the dynamic programming matrix\n";
    print $out "# Larger numbers mean longer running time and more accurate alignments.\n\n";
}

sub print_cutoff_parameters {
    my $out = shift;
    print $out "######################################################################\n";
    print $out "# Control the the starting tile size\n";
    print $out "# if number of cells > init_cell_number_cutoff\n";
    print $out "# the tile size will be increased to keep the number of cell\n";
    print $out "# less than this cutoff\n";
    print $out "init_cell_number_cutoff $init_cell_number_cutoff\n";
    print $out "#total number of positions in the dynamic programming matrix that are allowed.\n";
    print $out "#Larger number mean longer running time and more accurate alignments\n\n\n";
    print $out "# Control the number of cells occuring from an extension of the locus\n";
    print $out "extension_cell_number_cutoff $extension_cell_number_cutoff\n";
    print $out "#number of extra positions in the dynamic programming matrix that are allowed when doing extensions.\n";
    print $out "#Larger numbers mean longer running time and more accurate alignments\n\n";
    print $out "# Control the number of cells occuring during retiling\n";
    print $out "inside_retiling_cell_number_cutoff $inside_retiling_cell_number_cutoff\n";
    print $out "#Number of extra positions in the dynamic programming matrix that are allowed at each retiling step.\n\n";
}

sub print_unbiased_parameter_scores {
	my $out = shift;
	print $out "######################################################################\n";
    	print $out "# Scoring Parameters\n\n";
    	print $out "# score_match/score_mismatch - probability per base\n";
    	print $out "# for a mismatch to be introduced by sequencing error\n";
   	print $out "# (or SNP)\n";
    	print $out "# (if both defined, must sum to 1.0; if one defined, other will be\n";
    	print $out "# set to make both to sum to 1.0)\n";
	$match = Match->new_unbiased($unbiased_mm);
	$match->print_data($out);
	print $out "# score_splice_NNNN - relative probability of various splice juctions\n";
    	print $out "# (first two and last two bases of the intron GT-AG is the canonical boundary)\n";
    	print $out "# (if all are defined, must sum to 1.0; otherwise sum must not exceed\n";
    	print $out "# 1.0, and undefined terms will be set equal to each other so as to\n";
    	print $out "# yield a total sum of 1.0)\n";
  	my $splice = Splice->new_unbiased($gtag_score, $gcag_score, $atac_score);
	$splice->print_data($out);
	print $out "# This parameter defines misorientation\n";
     	print $out "min_diff_misorientation $min_diff_misorientation\n\n";
	print $out "# genome non-intron jump scoring curve parameters\n";
    	print $out "# (specified as log(probability))\n";
    	print $out "#These parameters control the probability of sequencing errors leading\n";
    	print $out "#to deletions (from the clone) of various lengths.\n\n";
	$deletion = Deletion->new_unbiased($eps_deletion, $mean_length_del_ins);
	$deletion->print_data($out);
	print $out "\n\n# clone non-intron jump scoring curve parameters\n";
    	print $out "# (specified as log(probability))\n";
    	print $out "#These parameters control the probability of sequencing errors leading\n";
    	print $out "#to insertions (into the clone) of various lengths.\n";	
	$insertion = Insertion->new_unbiased($eps_insertion, $mean_length_del_ins);
	$insertion->print_data($out);
	print $out "\n\n# Intron parameters\n\n";
	$intron = Intron->new_unbiased($eps_intron, $shortest_intron_length, $longest_intron_length, $Nbin);
	$intron->print_data($out);
}


sub print_parameter_scores {
    my $out = shift;
    print $out "######################################################################\n";
    print $out "# Scoring Parameters\n\n";
    print $out "# score_match/score_mismatch - probability per base\n";
    print $out "# for a mismatch to be introduced by sequencing error\n";
    print $out "# (or SNP)\n";
    print $out "# (if both defined, must sum to 1.0; if one defined, other will be\n";
    print $out "# set to make both to sum to 1.0)\n";
    $match->print_data($out);
    print $out "# score_splice_NNNN - relative probability of various splice juctions\n";
    print $out "# (first two and last two bases of the intron GT-AG is the canonical boundary)\n";
    print $out "# (if all are defined, must sum to 1.0; otherwise sum must not exceed\n";
    print $out "# 1.0, and undefined terms will be set equal to each other so as to\n";
    print $out "# yield a total sum of 1.0)\n";
    $splice->print_data($out);
    if ($statistics->{totNbMisClones} == 0) {
	$min_diff_misorientation = -5.0;
    } 
    else {
    	$min_diff_misorientation = log($statistics->{totNbMisClones}/$statistics->{totalNbClones});
    }
    print $out "# This parameter defines misorientation\n";
    print $out "min_diff_misorientation $min_diff_misorientation\n\n";
    print $out "# genome non-intron jump scoring curve parameters\n";
    print $out "# (specified as log(probability))\n";
    print $out "#These parameters control the probability of sequencing errors leading\n";
    print $out "#to deletions (from the clone) of various lengths.\n\n";
    $deletion->print_data($out);
    print $out "\n\n# clone non-intron jump scoring curve parameters\n";
    print $out "# (specified as log(probability))\n";
    print $out "#These parameters control the probability of sequencing errors leading\n";
    print $out "#to insertions (into the clone) of various lengths.\n";
    $insertion->print_data($out);
    print $out "\n\n# Intron parameters\n\n";
    $intron->print_data($out);
}

         
sub print_end {
    my $out = shift;
    print $out "######################################################################\n";
    print $out "# Dynamic Retiling Parameters\n\n";           
    print $out "# retile_genomic_gap_lower_limit (any genomic gaps shorter than this\n";
    print $out "#               will not be retiled in an effort to align them)\n";
    print $out "#gaps this small should have been covered by the 'fuzz' at the end of diagonals\n";
    print $out "retile_genomic_gap_lower_limit $llimit\n\n";
    print $out "# retile_genomic_gap_upper_limit (any genomic gaps longer than this\n";
    print $out "#               will not be retiled in an effort to align them)\n";
    print $out "retile_genomic_gap_upper_limit $ulimit\n\n";
    print $out "# retile_poly_a_percentage (the percentage of A (or T in complement)\n";
    print $out "#               bases in unaligned 5' tail (or 3' tail) to disqualify\n";
    print $out "#               tail from consideration with dynamic tile resizing)\n";
    print $out "#If the tail of the clone fails to map and contains more than this percentage of As\n";
    print $out "#we will assume that it is a poly-A tail that was not removed\n";
    print $out "retile_poly_a_percentage $poly_a\n\n";
    print $out "# retile_tilesize (finegrain tilesize to resolve genome gaps)\n";
    print $out "#               values outside of the range [1,15] yield error\n";
    print $out "#this is now superseded by a recursive lowering of tilesize in retiling\n";
    print $out "retile_tilesize $rtilesize\n\n";
    print $out "# retile_tileoverlay (tileoverlay used to resolve genome gaps)\n";
    print $out "#               values outside of [1,retile_tilesize] yield error\n";
    print $out "#how many overlapping tiles per sequence segment\n";
    print $out "#(2 means that a tile of size l is shifted by l/2)\n";
    print $out "retile_tileoverlay $rtileoverlay\n\n";
    print $out "# retile_exon_extension (when constructing exons for the retiling,\n";
    print $out "#               an exon will include all aligned genome positions\n";
    print $out "#               which are adjacent to another aligned genome position\n";
    print $out "#               in the exon, and where the difference in offset in\n";
    print $out "#               either clone offset or genome offset is no more than\n";
    print $out "#               this value + 1 (i.e. 0 means each offset must be\n";
    print $out "#               exactly adjacent to the adjacent aligned position)\n";
    print $out "#When deciding what region to retile we first construct a set of exons\n";
    print $out "#we consider neighboring aligned areas separated by less than retile_exon_extension\n";
    print $out "#bases as part of the same exon.\n";
    print $out "retile_exon_extension $rextension\n\n";
    print $out "######################################################################\n";
    print $out "# end SPA PARAMETER FILE\n";
}

