/*****************************************************************
        Copyright by Rockefeller University,
can not be reproduced or distributed without written permission of
copyright holder.  Version of October 2003.

Written by Saurabh Sinha (contact person), Erik van Nimwegen, and 
Eric Siggia.

The program stubb (and its relatives) implement an algorithm for
finding likely cis-regulatory modules, described in the following
paper:
"A Probabilistic Method to Detect Regulatory Modules"
by Saurabh Sinha, Erik van Nimwegen and Eric Siggia. 
Eleventh International Conference on Intelligent Systems for
Molecular Biology, Brisbane, Australia, July 2003, pg 292-301.

The file sample/gap_wtmx that comes with this distribution includes 
a sample set of transcription factor weight matrices (PWM's) that 
were reported in :
"Computational detection of genomic cis-regulatory modules applied
to body patterning in the early Drosophila embryo"
by N. Rajewsky, M. Vergassola, U. Gaul and E. Siggia.
BMC Bioinformatics 3 (30) 2002.
******************************************************************/

#include "util.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "parameters.h"

/*
 * Write a warning message to stderr exactly as given.
 *
 * The message is emitted with fputs rather than being used as a printf
 * format string, so a '%' inside the text is printed literally and can
 * never be interpreted as a conversion specifier.
 * A NULL message is ignored.
 */
void Warn(const char *str)
{
  if (str == NULL) return;
  fputs(str, stderr);
}

/*
 * Open the ".prof" output file associated with `file` for writing.
 *
 * If output_dir is non-NULL the file is created as
 * "<output_dir>/<last path component of file>.prof"; otherwise it is
 * created next to the input as "<file>.prof".
 *
 * Returns the opened stream, or NULL if file is NULL, if the resulting
 * path does not fit in the local buffer, or if fopen fails.
 */
FILE *OpenProfile(char *file, char *output_dir)
{
  char filename[1024];
  int written;

  if (file == NULL) return NULL;

  if (output_dir != NULL) {
    /* keep only what follows the last '/' of the input path */
    const char *slash = strrchr(file, '/');
    const char *base = (slash != NULL) ? slash + 1 : file;
    written = snprintf(filename, sizeof filename, "%s/%s.prof", output_dir, base);
  }
  else {
    written = snprintf(filename, sizeof filename, "%s.prof", file);
  }

  /* refuse a truncated name instead of overrunning the buffer or opening the wrong file */
  if (written < 0 || (size_t)written >= sizeof filename) return NULL;

  return fopen(filename, "w");
}

/*
 * Open the ".fen" output file associated with `file` for writing.
 *
 * If output_dir is non-NULL the file is created as
 * "<output_dir>/<last path component of file>.fen"; otherwise it is
 * created next to the input as "<file>.fen".
 *
 * Returns the opened stream, or NULL if file is NULL, if the resulting
 * path does not fit in the local buffer, or if fopen fails.
 */
FILE *OpenOutput(char *file, char *output_dir)
{
  char filename[1024];
  int written;

  if (file == NULL) return NULL;

  if (output_dir != NULL) {
    /* keep only what follows the last '/' of the input path */
    const char *slash = strrchr(file, '/');
    const char *base = (slash != NULL) ? slash + 1 : file;
    written = snprintf(filename, sizeof filename, "%s/%s.fen", output_dir, base);
  }
  else {
    written = snprintf(filename, sizeof filename, "%s.fen", file);
  }

  /* refuse a truncated name instead of overrunning the buffer or opening the wrong file */
  if (written < 0 || (size_t)written >= sizeof filename) return NULL;

  return fopen(filename, "w");
}

/*
 * Open the ".corr" output file associated with `file` for writing.
 *
 * If output_dir is non-NULL the file is created as
 * "<output_dir>/<last path component of file>.corr"; otherwise it is
 * created next to the input as "<file>.corr".
 *
 * Returns the opened stream, or NULL if file is NULL, if the resulting
 * path does not fit in the local buffer, or if fopen fails.
 */
FILE *OpenCorrelation(char *file, char *output_dir)
{
  char filename[1024];
  int written;

  if (file == NULL) return NULL;

  if (output_dir != NULL) {
    /* keep only what follows the last '/' of the input path */
    const char *slash = strrchr(file, '/');
    const char *base = (slash != NULL) ? slash + 1 : file;
    written = snprintf(filename, sizeof filename, "%s/%s.corr", output_dir, base);
  }
  else {
    written = snprintf(filename, sizeof filename, "%s.corr", file);
  }

  /* refuse a truncated name instead of overrunning the buffer or opening the wrong file */
  if (written < 0 || (size_t)written >= sizeof filename) return NULL;

  return fopen(filename, "w");
}

/*
 * Open the ".dict" output file associated with `file` for writing.
 *
 * If output_dir is non-NULL the file is created as
 * "<output_dir>/<last path component of file>.dict"; otherwise it is
 * created next to the input as "<file>.dict".
 *
 * Returns the opened stream, or NULL if file is NULL, if the resulting
 * path does not fit in the local buffer, or if fopen fails.
 */
FILE *OpenDictionary(char *file, char *output_dir)
{
  char filename[1024];
  int written;

  if (file == NULL) return NULL;

  if (output_dir != NULL) {
    /* keep only what follows the last '/' of the input path */
    const char *slash = strrchr(file, '/');
    const char *base = (slash != NULL) ? slash + 1 : file;
    written = snprintf(filename, sizeof filename, "%s/%s.dict", output_dir, base);
  }
  else {
    written = snprintf(filename, sizeof filename, "%s.dict", file);
  }

  /* refuse a truncated name instead of overrunning the buffer or opening the wrong file */
  if (written < 0 || (size_t)written >= sizeof filename) return NULL;

  return fopen(filename, "w");
}

/*
 * Open the ".align" output file associated with `file` for writing.
 *
 * If output_dir is non-NULL the file is created as
 * "<output_dir>/<last path component of file>.align"; otherwise it is
 * created next to the input as "<file>.align".
 *
 * Returns the opened stream, or NULL if file is NULL, if the resulting
 * path does not fit in the local buffer, or if fopen fails.
 */
FILE *OpenAlignments(char *file, char *output_dir)
{
  char filename[1024];
  int written;

  if (file == NULL) return NULL;

  if (output_dir != NULL) {
    /* keep only what follows the last '/' of the input path */
    const char *slash = strrchr(file, '/');
    const char *base = (slash != NULL) ? slash + 1 : file;
    written = snprintf(filename, sizeof filename, "%s/%s.align", output_dir, base);
  }
  else {
    written = snprintf(filename, sizeof filename, "%s.align", file);
  }

  /* refuse a truncated name instead of overrunning the buffer or opening the wrong file */
  if (written < 0 || (size_t)written >= sizeof filename) return NULL;

  return fopen(filename, "w");
}

/*
 * Open the ".fitprobs" output file associated with `file` for writing.
 *
 * If output_dir is non-NULL the file is created as
 * "<output_dir>/<last path component of file>.fitprobs"; otherwise it is
 * created next to the input as "<file>.fitprobs".
 *
 * Returns the opened stream, or NULL if file is NULL, if the resulting
 * path does not fit in the local buffer, or if fopen fails.
 */
FILE *OpenProbabilities(char *file, char *output_dir)
{
  char filename[1024];
  int written;

  if (file == NULL) return NULL;

  if (output_dir != NULL) {
    /* keep only what follows the last '/' of the input path */
    const char *slash = strrchr(file, '/');
    const char *base = (slash != NULL) ? slash + 1 : file;
    written = snprintf(filename, sizeof filename, "%s/%s.fitprobs", output_dir, base);
  }
  else {
    written = snprintf(filename, sizeof filename, "%s.fitprobs", file);
  }

  /* refuse a truncated name instead of overrunning the buffer or opening the wrong file */
  if (written < 0 || (size_t)written >= sizeof filename) return NULL;

  return fopen(filename, "w");
}

/*
 * Write a plain-text record of the run's settings to a ".parameters" file.
 *
 * The file is "<opt->output_dir>/<last path component of seqfile>.parameters"
 * when an output directory is set, otherwise "<seqfile>.parameters".
 * It lists the external parameters passed in (sequence file, weight matrix
 * file, window width/shift and the values held in opt), followed by the
 * internal parameters obtained from the Parameters classes and a line for
 * each compile-time switch that is defined.
 *
 * If the file name does not fit in the local buffer or the file cannot be
 * opened, a message is printed to stderr and nothing is written.
 */
void PrintParameters(char *seqfile, char *wmcfile, int windowsize, int shiftsize, struct Options *opt)
{
  char filename[1024];
  char *output_dir = opt->output_dir;
  int written;

  if (output_dir != NULL) {
    /* keep only what follows the last '/' of the sequence file path */
    const char *slash = strrchr(seqfile, '/');
    const char *base = (slash != NULL) ? slash + 1 : seqfile;
    written = snprintf(filename, sizeof filename, "%s/%s.parameters", output_dir, base);
  }
  else {
    written = snprintf(filename, sizeof filename, "%s.parameters", seqfile);
  }
  if (written < 0 || (size_t)written >= sizeof filename) {
    fprintf(stderr, "Error: name of parameters file is too long\n");
    return;
  }

  FILE *par = fopen(filename,"w");
  if (par == NULL) {
    /* without this check every fprintf below would be handed a NULL stream */
    fprintf(stderr, "Error: could not open %s for writing\n", filename);
    return;
  }

  /* Passing NULL for %s is undefined; substitute the text glibc prints in
     that case so the report looks the same where it previously worked. */
  const char *bkg_file = (opt->bkg_file != NULL) ? opt->bkg_file : "(null)";

  fprintf(par,"External Parameters\n\nsequence file: %s\nweight matrix file: %s\nwindow width: %d\nwindow shift: %d\nbackground file: %s\nfree energy threshold for profiling: %.2f\nMotif count threshold for profiling: %.2f\n",seqfile,wmcfile,windowsize,shiftsize,bkg_file,opt->fen_threshold, opt->motif_occurrence_threshold);
  if (opt->corr_file != NULL) {
    fprintf(par,"Forced correlations file: %s\n",opt->corr_file);
  }
  if (opt->corr_list != NULL) {
    /* corr_list stores corr_list_size pairs as consecutive ints */
    fprintf(par,"Correlation list: \n");
    for (int i=0; i<opt->corr_list_size; i++) {
      fprintf(par,"%d\t%d\n",opt->corr_list[2*i],opt->corr_list[2*i+1]);
    }
  }
  fprintf(par,"\nInternal Parameters\n\nMARKOV_ORDER: %d (%d+1)-mers are counted\nCONTEXT_SIZE: %.2f\nBKG_FORWARD_ONLY: %d\nALMOST_ONE: %.2f\nParameters_H0 convergence threshold: %f\nParameters_H1 convergence threshold: %f\n",MARKOV_ORDER, MARKOV_ORDER, Parameters::GetContextWidthFactor(), Parameters::GetBackgroundOrientation(), Parameters::GetAlmostOne(), Parameters_H0::GetThreshold(), Parameters_H1::GetThreshold());
#ifdef _MARKOV
  fprintf(par,"Markov used\n");
#endif
#ifdef _MULTIPLE_SEQUENCES
  fprintf(par,"Multiple Sequences considered\n");
  fprintf(par,"Phylogeny file: %s\n",(opt->phylogeny_file != NULL) ? opt->phylogeny_file : "(null)");
  int numSpecies;
  const float *mu = Parameters::GetPhylogeny(numSpecies);
  fprintf(par,"mutation rates (mu): ");
  for (int i=0; i<numSpecies; i++) fprintf(par,"%.2f ",mu[i]);
  fprintf(par,"\n");
  if (opt->anchors_file != NULL) {
    fprintf(par,"Anchors file (LAGAN output): %s\n",opt->anchors_file);
  }
#endif
#ifdef _FIXED_WIDTH
  fprintf(par,"Fixed Width windows\n");
#endif
#ifdef _REF_ONLY
  fprintf(par,"Reference Sequence only\n");
#endif
#ifdef ZSCORETHRESHOLD
  fprintf(par,"Z-score threshold for correlation detection: %.2f\n",ZSCORETHRESHOLD);
#endif
#ifdef AC_IJ_THRESHOLD
  fprintf(par,"AC_IJ threshold: %.2f\n", AC_IJ_THRESHOLD);
#endif
#ifdef _ONE_WINDOW
  fprintf(par,"entire sequence considered as one window\n");
#endif

  fclose(par);
}

struct Options *ReadOptionalArguments(int &argbase, int argc, char **argv)
{
  struct Options *opt = new struct Options;
  while (argbase < argc && argv[argbase][0]=='-') {
    if (strstr(argv[argbase],"-od")) {
      argbase++;
      if (argbase >= argc) {
	printf("Error: -od must be followed by a path\n");
	exit(1);
      }
      opt->output_dir = argv[argbase];
      argbase++;
      continue;
    }    
    if (strstr(argv[argbase],"-b")) {
      argbase++;
      if (argbase >= argc) {
	printf("Error: -b must be followed by a file name\n");
	exit(1);
      }
      opt->bkg_file = argv[argbase];
      argbase++;
      continue;
    }
    if (strstr(argv[argbase],"-ft")) {
      argbase++;
      if (argbase >= argc) {
	printf("Error: -ft must be followed by a number\n");
	exit(1);
      }
      opt->fen_threshold = atof(argv[argbase]);
      argbase++;
      continue;
    }
    if (strstr(argv[argbase],"-ot")) {
      argbase++;
      if (argbase >= argc) {
	printf("Error: -ot must be followed by a number\n");
	exit(1);
      }
      opt->motif_occurrence_threshold = atof(argv[argbase]);
      argbase++;
      continue;
    }
    if (strstr(argv[argbase],"-pf")) {
      argbase++;
      if (argbase >= argc) {
	printf("Error: -pf must be followed by a file name\n");
	exit(1);
      }
      opt->phylogeny_file = argv[argbase];
      argbase++;
      continue;
    }
    if (strstr(argv[argbase],"-af")) {
      argbase++;
      if (argbase >= argc) {
	printf("Error: -af must be followed by a file name\n");
	exit(1);
      }
      opt->anchors_file = argv[argbase];
      argbase++;
      continue;
    }
    if (strstr(argv[argbase],"-cf")) {
      printf("Error: this argument is not supported yet\n");
      exit(1);
      // opt->corr_file = argv[++argbase];
      // argbase++;
      // continue;
    }
    if (strstr(argv[argbase],"-cl")) {
      argbase++;
      if (argbase >= argc) {
	printf("Error: -cl must be followed by a number\n");
	exit(1);
      }
      int numpairs = atoi(argv[argbase]);
      if (numpairs < 1) {
	printf("Error: correlation list size must be positive\n");
	exit(1);
      }
      opt->corr_list_size = numpairs;
      opt->corr_list = new int[2*numpairs];
      int j = 0;
      for (int i=0; i<numpairs; i++) {
	argbase++;
	if (argbase >= argc) {
	  printf("Error: Invalid correlation list -- fewer pairs than expected\n");
	  exit(1);
	}
	opt->corr_list[j++] = atoi(argv[argbase]);
	argbase++;
	if (argbase >= argc) {
	  printf("Error: Invalid correlation list -- incomplete pair\n");
	  exit(1);
	}
	opt->corr_list[j++] = atoi(argv[argbase]);
      }
      argbase++;
      continue;
    }
    printf("Error: option %s not recognized\n",argv[argbase]);
    exit(1);
  }

  return opt;
}

/*
 * Decide whether a list of windows is usable.
 *
 * Sums NumSpecificCharacters() and Length() over every window in the list.
 * The list is valid when the total length exceeds MIN_WINDOW_LENGTH and the
 * number of specific characters exceeds MIN_SPEC_FRACTION times that total.
 */
bool IsValidWindowList(vector<Window *> *wl)
{
  int specificChars = 0;
  int totalChars = 0;

  for (vector<Window *>::iterator it = wl->begin(); it != wl->end(); ++it) {
    Window *current = *it;
    specificChars += current->NumSpecificCharacters();
    totalChars += current->Length();
  }

  return (totalChars > MIN_WINDOW_LENGTH && float(specificChars) > MIN_SPEC_FRACTION*float(totalChars));
}

