/*****************************************************************
        Copyright by Rockefeller University,
can not be reproduced or distributed without written permission of
copyright holder.  Version of October 2003.

Written by Saurabh Sinha (contact person), Erik van Nimwegen, and 
Eric Siggia.

The program stubb (and its relatives) implement an algorithm for
finding likely cis-regulatory modules, described in the following
paper:
"A Probabilistic Method to Detect Regulatory Modules"
by Saurabh Sinha, Erik van Nimwegen and Eric Siggia. 
Eleventh International Conference on Intelligent Systems for
Molecular Biology, Brisbane, Australia, July 2003, pg 292-301.

The file sample/gap_wtmx that comes with this distribution includes 
a sample set of transcription factor weight matrices (PWM's) that 
were reported in :
"Computational detection of genomic cis-regulatory modules applied
to body patterning in the early Drosophila embryo"
by N. Rajewsky, M. Vergassola, U. Gaul and E. Siggia.
BMC Bioinformatics 3 (30) 2002.
******************************************************************/

#include <stdio.h>
#include <assert.h>
#include <math.h>

#include "util.h"
#include "sequence.h"
#include "parameters.h"
#include "fastafile.h"

// Global integer reset to 0 at the start of main(); it is not otherwise read
// or written in this file.
// NOTE(review): presumably consumed by other translation units - confirm.
int globalid;
// Background weight matrix. main() sets it to NULL and, when a background
// file is supplied (opt->bkg_file != NULL), replaces it with the result of
// Parameters::TrainWtMx() run on a window covering that whole sequence.
WtMx *global_background;

// In main(), a pair of weight matrices (i,j) is recorded as correlated only
// when the magnitude of its z-score exceeds this value.
#define ZSCORETHRESHOLD 1.0
// In main(), a pair (i,j) whose expected average count is below this value
// is skipped before any variance or z-score is computed.
#define AC_IJ_THRESHOLD 1.0

main(int argc, char **argv)
{
  if (argc < 5) {
    printf("usage: %s <sequencefile> <wtmxfile> <windowsize> <shiftsize> [-od <output_dir>] [-b <background file>] [-ft <energy thresold for printing>] [-ot <motif occurrence threshold for printing]\n",argv[0]);
    exit(1);
  }
#ifdef _CYCLIC_WINDOWS
  printf("Error: stubbh01 does not support the _CYCLIC_WINDOWS option. Recompile the entire program with this option unset in Makefile.ss\n");
  exit(1);
#endif
  globalid = 0;
  global_background = NULL;

  // Read in the sequence  
  char *fastafile = argv[1];
  FastaFile sequences;
  sequences.ReadFasta(fastafile);

  // Read in the weight matrices
  char *wmcfile = argv[2];
  WtMxCollection wmc(wmcfile);

  // Read in the other two compulsory parameters
  int windowsize = atoi(argv[3]);
  int shiftsize = atoi(argv[4]);

  // Read in the optional arguments
  int argbase = 5;
  struct Options *opt = ReadOptionalArguments(argbase, argc, argv);

  if (opt->bkg_file != NULL) {
    Sequence *bkg_seq = new Sequence(opt->bkg_file);
    Window *bkg_window = new Window(bkg_seq,0,bkg_seq->Length()-1);
    global_background = Parameters::TrainWtMx(bkg_window);
    delete bkg_window;
    delete bkg_seq;
  }

  // Prepare for various printing
  FILE *prof = OpenProfile(fastafile, opt->output_dir);
  FILE *dict = OpenDictionary(fastafile, opt->output_dir);
  FILE *ener = OpenOutput (fastafile, opt->output_dir);
  FILE *corr = OpenCorrelation(fastafile, opt->output_dir);
  PrintParameters(fastafile,wmcfile,windowsize, shiftsize, opt); 
  
  for (int seqnum=0; seqnum < sequences.Size(); seqnum++) {
    Sequence *seq = sequences[seqnum];
    int seq_len = seq->Length();
    if (seq_len < 200) continue;

    // Declare that the matrices in wmc are not going to be modified, and cache the probabilities
#ifdef _OPTIMIZE_CACHESEQUENCEPROBABILITIES
    int cache_expires_at = -1;
#endif

    // now start iterating through the windows
    WindowIterator wi(seq);
    bool did_begin = false;
#ifdef _EXIST_SMALL_SEQUENCES
    for (did_begin = wi.Begin(min(windowsize,seq_len),shiftsize); did_begin && !wi.End(); wi.Next()) {
#else
    for (did_begin = wi.Begin(windowsize,shiftsize); did_begin && !wi.End(); wi.Next()) {
#endif
      vector<Window *> *wl = new vector<Window *>;
      wi.CurrentWindowList(wl);

      // verify that the windows are good
      if (wl->size() < 1) {
         delete wl;
         continue;
      }

      if (!IsValidWindowList(wl)) {
	int totalLen = 0;
	int startpos = (*wl)[0]->Start();
	for (int windex=0; windex<wl->size(); windex++) {
	  Window *win = (*wl)[windex];
	  totalLen += win->Length();
	  delete win;
	}
	delete wl;
	fprintf(ener,"%d\t0.000000\t0.000000\t%d\t0\n",startpos,totalLen);
	continue;
      }
	
#ifdef _OPTIMIZE_CACHESEQUENCEPROBABILITIES
      int current_position = (*wl)[0]->Start();
      if (current_position > cache_expires_at) {
	Parameters::DeleteCacheSubsequenceProbabilities(seq);
#ifdef _WTMX_BIAS
	Parameters::CacheSubsequenceProbabilities(seq,&wmc,current_position,windowsize);
	cache_expires_at = current_position + windowsize - 1;
#else 
	Parameters::CacheSubsequenceProbabilities(seq,&wmc,current_position,PROBABILITY_CACHE_SIZE);
	cache_expires_at = current_position + PROBABILITY_CACHE_SIZE - 1;
#endif
      }
#endif

      Parameters_H0 *param0 = new Parameters_H0;
      int bkgIndex = param0->BackgroundIndex(&wmc);
      param0->Initialize(wl,&wmc,bkgIndex);
      param0->Train();   

      Parameters_H1 *param1 = new Parameters_H1;
      param1->UpgradeInitialize(wl,&wmc,param0);
      param1->DoDynamicProgramming();
      bkgIndex = param1->BackgroundIndex();

      int param1_numWM = param1->NumWM();
      int *correlation = new int[2*param1_numWM*param1_numWM];
      int cpos = 0;
      DTYPE  *expectations;
      for (int i=0; i<param1_numWM; i++) {
	for (int j=0; j<param1_numWM; j++) {
	  if (i==bkgIndex || j==bkgIndex) continue;
	  // if (i==j) continue;
	  DTYPE  ac = param1->ComputeAverageCount(i,j);
	  DTYPE  eac = param1->ComputeExpectedAverageCount(i,j,param0,expectations);
	  if (eac < AC_IJ_THRESHOLD) {
	    delete [] expectations;	  
	    continue;
	  }
	  DTYPE  vac = param1->ComputeVarianceOfCount(i,j,param0,expectations);
	  delete [] expectations;
	  
	  if (vac <= 0) continue;
	  DTYPE  zsc = (ac-eac)/sqrt(vac);
	  if (abs(zsc) > ZSCORETHRESHOLD) {
	    char seqname[1024]; seq->Name(seqname);
	    fprintf(corr,">Sequence %s\tPosition %d\ncorrelation (%g) between %d and %d: %.2f %.2f %.2f %.2f\n",seqname,(*wl)[0]->Start(),zsc,i,j,ac,eac,vac,zsc);
	    param0->PrintAverageCounts(corr);
	    correlation[cpos++] = i;
	    correlation[cpos++] = j;
	  }
	}
      }

      if (cpos > 0) {
#ifdef _CYCLIC_WINDOWS
	float **initial = param0->GetLastMotifs();
#endif
	Parameters_H01 *p01_1 = new Parameters_H01;
	p01_1->UpgradeInitialize(wl,&wmc,param0);
	p01_1->EnableCorrelation(correlation, cpos/2);
#ifdef _CYCLIC_WINDOWS
	p01_1->SetInitial(initial);
#endif
	p01_1->Train();
#ifdef _CYCLIC_WINDOWS
	param0->DeleteSpaceForLastMotifs(initial);
	initial = NULL;
#endif
	p01_1->Print(ener);
	if (p01_1->Free_Energy_Differential() > opt->fen_threshold) 	
	  p01_1->PrintProfile(prof,dict,opt->motif_occurrence_threshold);
	fprintf(corr,"%.4f -> %.4f\n",param0->Free_Energy_Differential(),p01_1->Free_Energy_Differential());
	p01_1->PrintProbabilities(corr);
	fprintf(corr,"<\n");
	delete p01_1;
      }
      else {
	param0->Print(ener);
	if (param0->Free_Energy_Differential() > opt->fen_threshold) 
	  param0->PrintProfile(prof,dict,opt->motif_occurrence_threshold);
      }

      delete [] correlation;
      delete param1;
      
      delete param0;
      for (int windex=0; windex<wl->size(); windex++) {
	Window *win = (*wl)[windex];
	delete win;
      }
      delete wl;
    }
    
#ifdef _OPTIMIZE_CACHESEQUENCEPROBABILITIES
    Parameters::DeleteCacheSubsequenceProbabilities(seq);
#endif
  }
  
  fclose(corr);
  fclose(ener);
  fclose(dict);
  fclose(prof);
  delete opt;
}
