/*****************************************************************
        Copyright by Rockefeller University,
can not be reproduced or distributed without written permission of
copyright holder.  Version of October 2003.

Written by Saurabh Sinha (contact person), Erik van Nimwegen, and 
Eric Siggia.

The program stubb (and its relatives) implement an algorithm for
finding likely cis-regulatory modules, described in the following
paper:
"A Probabilistic Method to Detect Regulatory Modules"
by Saurabh Sinha, Erik van Nimwegen and Eric Siggia. 
Eleventh International Conference on Intelligent Systems for
Molecular Biology, Brisbane, Australia, July 2003, pg 292-301.

The file sample/gap_wtmx that comes with this distribution includes 
a sample set of transcription factor weight matrices (PWM's) that 
were reported in :
"Computational detection of genomic cis-regulatory modules applied
to body patterning in the early Drosophila embryo"
by N. Rajewsky, M. Vergassola, U. Gaul and E. Siggia.
BMC Bioinformatics 3 (30) 2002.
******************************************************************/

#include <stdio.h>
#include <assert.h>
#include <math.h>

#include "util.h"
#include "sequence.h"
#include "parameters.h"
#include "fastafile.h"

// Program-wide counter; main() resets it to 0 at startup.
// NOTE(review): it has external linkage, so it is presumably read or
// incremented by other translation units -- confirm before renaming.
int globalid;
// Background weight matrix shared across the program. main() sets it to NULL
// and, when a background file is supplied in the options, replaces it with
// the matrix returned by Parameters::TrainWtMx() on that background sequence.
WtMx * global_background;

// Defined in another translation unit. main() calls it with the sequence
// file name and the anchors (alignments) file name; it returns an array of
// Sequence pointers and reports the number of entries through numSequences.
// main() later deletes each element and then the array itself.
extern Sequence **ReadLaganOutput(char *fastaname, char *anchname, int &numSequences);
// Define the file to be used as the default phylogeny file.
#define PHYLOGENY_FILE "sample/phylogeny.txt"

main(int argc, char **argv)
{
  if (argc < 8) {
    printf("usage: %s <sequencefile> <wtmxfile> <windowsize> -pf <phylogeny file> -af <alignments file>\n",argv[0]);
    exit(1);
  }
  globalid = 0;
  global_background = NULL;

  // Read in the weight matrices
  char *wmcfile = argv[2];
  WtMxCollection wmc(wmcfile);
  int max_wm_len = wmc.MaxLength();
  int numWM = wmc.Size();

  // Read in the other two compulsory parameters
  int windowsize = atoi(argv[3]);
  int shiftsize = windowsize;

  // Read in the optional arguments
  int argbase = 4;
  struct Options *opt = ReadOptionalArguments(argbase, argc, argv);

  if (opt->bkg_file != NULL) {
    Sequence *bkg_seq = new Sequence(opt->bkg_file);
    Window *bkg_window = new Window(bkg_seq,0,bkg_seq->Length()-1);
    global_background = Parameters::TrainWtMx(bkg_window);
    delete bkg_window;
    delete bkg_seq;
  }

  // Read in the sequences
  int numSequences;
  char *sequencefile = argv[1];
  Sequence **seqs;
  if (opt->anchors_file == NULL) {
    printf("No anchors file read in\n");
    exit(1);
  }
  else {
    seqs = ReadLaganOutput(sequencefile, opt->anchors_file, numSequences);
  }
  if (numSequences < 1) {
    printf("No sequence read from Lagan output\n");
    exit(1);
  }

  // Enter the phylogeny information into the system
  int numSpecies = numSequences;
  float *mu = new float[numSpecies]; 
  FILE *fpp = fopen(opt->phylogeny_file,"r");
  for (int i=0; i<numSequences; i++) {
    float mut;
    if (fscanf(fpp,"%f ",&mut) < 1) {
      printf("Error reading phylogeny file\n");
      exit(1);
    }
    mu[i] = mut;
  }
  fclose(fpp);
  Parameters::SetPhylogeny(mu,numSpecies);
  delete [] mu;


  // Declare that the matrices in wmc are not going to be modified, and cache the probabilities
#ifdef _OPTIMIZE_CACHESEQUENCEPROBABILITIES
  Parameters::CacheSubsequenceProbabilities(seqs[0],&wmc,0,seqs[0]->Length()-1);
#endif

  // create a copy of the entire alignment list
  Alignment *al_copy = new Alignment(seqs[0]->_alignments);

  // now start iterating through the windows
#ifdef _FIXED_WIDTH
#ifdef _REF_ONLY 
  MSWindowIteratorFixedShift wi(seqs,numSequences,true);
#else 
  MSWindowIteratorFixedShift wi(seqs,numSequences,false);
#endif
#else
  MSWindowIteratorAlignmentPunctuated wi(seqs,numSequences);
#endif

  bool did_begin = false;
  for (did_begin = wi.Begin(windowsize,shiftsize); did_begin && !wi.End(); wi.Next()) {
    vector<Window *> *wl = new vector<Window *>;
    wi.CurrentWindowList(wl, max_wm_len+1);

    // verify that the windows are good
    if (wl->size() < 1) {
      delete wl;
      continue;
    }	

    int totalLen = 0;
    for (vector<Window *>::iterator iter = wl->begin(); iter < wl->end(); iter++) {
      Window *win = (Window *)(*iter);
      if (win->Length() > 2*windowsize) {
	wl->erase(iter);
	delete win;
      }
      else totalLen += win->Length();
      // printf("window fragment %d: %d to %d\n",windex,win->Start(),win->Stop());
    }

    if (!IsValidWindowList(wl)) {
      int startpos = (*wl)[0]->Start();
      for (int windex=0; windex<wl->size(); windex++) {
	Window *win = (*wl)[windex];
	delete win;
      }
      delete wl;
      continue;
    }	

    // train with all matrices
    Parameters_H0 *param0 = new Parameters_H0;
    int bkgIndex = param0->BackgroundIndex(&wmc);
    param0->Initialize(wl,&wmc,bkgIndex);
    param0->Train();   

    // for each alignment in this window, check if a motif overlaps it
    // retrieve the start point (or end point of such motifs)
    // modify the alignment
    Window *mainwindow = (*wl)[0];
    AlignmentNode *algnlist = al_copy->GetAlignmentNodeList(mainwindow->Seq(),mainwindow->Start(), mainwindow->Stop());
    for (AlignmentNode *curAlgn = algnlist; curAlgn != NULL; curAlgn = curAlgn->_next) {
      int left_start = param0->MaximumLeftOverlap(curAlgn);
      if (left_start > 0) {
	curAlgn->ExtendToLeft(left_start);
      }
      int right_start = param0->MaximumRightOverlap(curAlgn);
      if (right_start > 0) {
	curAlgn->ExtendToRight(right_start);
      }
    }

    // clean up
    for (int windex=0; windex<wl->size(); windex++) {
      Window *win =  (*wl)[windex];
      delete win;
    }
    delete wl;
  }

#ifdef _OPTIMIZE_CACHESEQUENCEPROBABILITIES
  Parameters::DeleteCacheSubsequenceProbabilities(seqs[0]);
#endif

  if (opt->anchors_file) {
    char filename[1024];
    strcpy(filename,opt->anchors_file);
    strcat(filename,".mod");
    FILE *almod = fopen(filename,"w");
    al_copy->PrintAnchs(almod);
    fclose(almod);
  }

  for (int i=0; i<numSequences; i++) delete seqs[i];
  delete [] seqs;

  delete opt;
}



