/*                        PhyloGibbs                                  */

/*   Algorithm developed by Rahul Siddharthan, Erik van Nimwegen      * 
 *   and Eric D. Siggia at The Rockefeller University, New York       * 
 *                                                                    *
 *   This code copyright (C) 2004 Rahul Siddharthan <rsidd@online.fr> * 
 *   Licensed under the GNU General Public License (see COPYING)      */ 

/* $Id: initroutines.c,v 1.9 2005/05/23 15:07:09 rsidd Exp $ */


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <sys/time.h>
#include <gsl/gsl_sf_gamma.h>
#include "interspecies.h"
#include "fasta.h"
#include "readfasta.h"
#include "readDialign.h"
#include "setbgcount.h"
#include "seq2nums.h"
#include "initwindows.h"
#include "initpermblocked.h"
#include "initbins.h"
#include "initbinconfig.h"
#include "initbinnumconfig.h"
#include "inittrackedbins.h"
#include "binbasecount.h"
#include "commonroutines.h"
#include "estimate_prior_params.h"
#include "tree_routines.h"
#include "readmotiffile.h"

/*
 * initvalues: put every field of the parameter block *v into its
 * start-up state, before any options are applied.
 *
 * Several negative values act as "not set" markers that are tested
 * later in this file (for example bgpscount, deswin, descol, Nwinm,
 * Nshiftm and the iteration counts).
 */
void initvalues(params *v)
{
    /* general switches */
    v->gethelp = 0;
    v->quiet = 0;
    v->verbose = 0;
    v->writeatend = 1;
    v->printreverse = 0;
    v->seed = 0;

    /* file names: empty string means "none given" */
    v->seqfile[0] = '\0';
    v->posfile[0] = '\0';
    v->blockedfile[0] = '\0';
    v->trackedbinfile[0] = '\0';
    v->bgfile[0] = '\0';
    v->motiffile[0] = '\0';
    strcpy(v->trackedprintfile, "tracked_output");
    strcpy(v->outfile, "output");

    /* motif model and strand handling */
    v->wwidth = 10;
    v->pseudocount = 1.0;
    v->usedialign = -1;
    v->usedir = 1;
    v->rcsymm = 0;
    v->nomutprob = -1.0;

    /* background model */
    v->nbgc = 0;
    v->bgpscount = -100.0;

    /* temperature and chemical potentials */
    v->beta = 1.0;
    v->betaincr = -1;
    v->mu = 0.0;
    v->lambda = 0.0;

    /* desired windows / colours and moves per cycle */
    v->deswin = -1;
    v->descol = -1;
    v->numtfs = -1;
    v->Ncolm = 0;
    v->Nwinm = -1;
    v->Nshiftm = -1;

    /* iteration counts */
    v->totaliters = -1;
    v->transientiters = -1;
    v->annealiters = -1;
    v->deepquenchiters = -1;

    /* tracking */
    v->autotrack = 1;
    v->trackingcutoff = 0.05;
    v->iterthres = 0.001;
    v->tcsteps = 0;
    v->twsteps = 0;
    v->tssteps = 0;

    /* dynamically allocated members start out empty */
    v->bin = NULL;
    v->fseqs = NULL;
    v->bgseqs = NULL;
    v->occset = NULL;
    v->seqmuset = NULL;
    v->trackedbins = NULL;
    v->givenbgcounts = NULL;
    v->binnums = NULL;
    v->labels = NULL;
    v->labeltree = NULL;
}



/*
 * processvalues: post-process the option values in *v and load the
 * input data.
 *
 * In order, this function
 *   - switches autotracking off when betaincr is 0.0 (no annealing);
 *   - replaces the "unset" bgpscount marker by 1.0 (no background file)
 *     or 0.0 (background file given);
 *   - when v->labeltree is set, rebuilds v->labels and v->seqmuset from
 *     the tree leaves and appends a heap-allocated species index to each
 *     leaf's data array;
 *   - allocates the GSL random number generator and records the seed;
 *   - fills v->priors from v->pseudocount;
 *   - reads the sequence file with readDialign() and builds v->bgcount,
 *     either from v->givenbgcounts or with setbgcount();
 *   - reads the motif file, or creates empty priorbinbase/priorbinname
 *     arrays when no motif file was given.
 *
 * Returns 0 on success and 1 on error (a message is written to stderr).
 */
int processvalues(params *v) 
{
    int n,*speciesnum;
    double fnorm,ctr,mu;
    char *label;
    const char *seedenv;
    const gsl_rng_type *T;
    struct timeval tp;
    fastaseq *oneseq;
    GArray *newseqnum,*onebgcount;
    GPtrArray *leaves;
    GNode *oneleaf;

    /* no anneal = no autotracking */
    if (v->betaincr==0.0)
        v->autotrack=0;

    /* if bgpscount is not set, use 1.0 if no bg file specified, 0.0 otherwise */
    if (v->bgpscount<-1.0) {
        if (strlen(v->bgfile)==0)
            v->bgpscount=1.0;
        else
            v->bgpscount=0.0;
    }

    if (v->labeltree != NULL) {
        /* construct labels and seqmuset from the leaves of the tree, and
         * add an integer (species index) to each leaf->data */
        if (v->usedialign == 0) {
            fprintf(stderr,"Error: Inconsistent parameters\nYou specified that the sequences are not aligned (-D 0)\nbut at the same time specified a phylogenetic tree with -L\nExiting\n");
            return 1;
        }

        leaves=g_ptr_array_new();
        get_gtree_leaves(v->labeltree,&leaves);
        if (v->labels != NULL) {
            fprintf(stderr,"Warning: ignoring -l option since -L was specified\n");
            g_ptr_array_free(v->labels,TRUE);
        }
        if (v->seqmuset != NULL) {
            fprintf(stderr,"Warning: ignoring -H option since -L was specified\n");
            g_array_free(v->seqmuset,TRUE);
        }
        if (v->nomutprob > -0.0000001) {
            fprintf(stderr,"Warning: -G has no effect when -L is specified\n");
            v->nomutprob= -1;
        }
        v->labels=g_ptr_array_new();
        v->seqmuset=g_array_new(TRUE,TRUE,sizeof(double));
        for (n=0; n<(int)leaves->len; n++) {
            oneleaf=g_ptr_array_index(leaves,n);
            /* leaf data is a pointer array: [0] -> double mu, [1] -> label */
            mu= *((double *)g_ptr_array_index((GPtrArray *)(oneleaf->data),0));
            label= g_ptr_array_index((GPtrArray *)(oneleaf->data),1);
            g_ptr_array_add(v->labels,label);
            g_array_append_val(v->seqmuset,mu);
            speciesnum = malloc(sizeof *speciesnum);
            if (speciesnum == NULL) {
                fprintf(stderr,"Error: out of memory while indexing tree leaves\n");
                g_ptr_array_free(leaves,TRUE);
                return 1;
            }
            *speciesnum=n;
            g_ptr_array_add((GPtrArray *)(oneleaf->data),speciesnum);
        }
        /* only the temporary list is released; the leaf nodes are not
         * freed here */
        g_ptr_array_free(leaves,TRUE);
    }

    if (v->rcsymm) { /* no separate rc moves */
        v->usedir=0;
    }

    gsl_rng_env_setup();
    T=gsl_rng_default;
    v->gslrand=gsl_rng_alloc(T);
    if (v->gslrand == NULL) {
        fprintf(stderr,"Error: could not allocate random number generator\n");
        return 1;
    }

    seedenv=getenv("GSL_RNG_SEED");
    if (seedenv==NULL) {
        if (gettimeofday(&tp, (struct timezone *)0) == -1){
            if (!v->quiet) fprintf(stderr, "Could not gettimeofday to initialize seed, using 0.\n");
            gsl_rng_set(v->gslrand,0);
        } else {
            /* seed is the millisecond part of the current time */
            v->seed=tp.tv_usec/1000;
            gsl_rng_set(v->gslrand,v->seed);
            if (!v->quiet) fprintf(stdout, "Using GSL seed: %ld\n",v->seed);
        }
    }
    else {
        /* the generator is not re-seeded on this path; the value is only
         * recorded in v->seed */
        sscanf(seedenv,"%ld",&v->seed);
    }

    /* pseudocount stuff: four copies of the pseudocount, their sum, and
     * lngamma(4*pc) - 4*lngamma(pc) */
    v->priors=g_array_new(TRUE, TRUE, sizeof(double));
    fnorm=v->pseudocount;
    g_array_append_val(v->priors,fnorm);
    g_array_append_val(v->priors,fnorm);
    g_array_append_val(v->priors,fnorm);
    g_array_append_val(v->priors,fnorm);
    fnorm=4.0*(v->pseudocount);
    g_array_append_val(v->priors,fnorm);
    fnorm=gsl_sf_lngamma(4.0*(v->pseudocount));
    fnorm=fnorm-4.0*gsl_sf_lngamma(v->pseudocount);
    g_array_append_val(v->priors,fnorm);

    if (strlen(v->seqfile)==0) {
        fprintf(stderr,"Must supply sequence file via -f\n");
        return 1;
    }
    if (!v->quiet)
        printf("reading input file\n");
    if (readDialign(v)) {
        fprintf(stderr,"Error reading sequence file %s: exiting\n",v->seqfile);
        return 1;
    }

    if (!v->quiet)
        printf("initializing background model\n");
    if (strlen(v->bgfile)==0) {
        /* no bg file: use the input sequences as background sequences */
        v->bgseqs=v->fseqs;
    }
    else {
        if (readfasta(v->bgfile,&(v->bgseqs))) {
            /* bgseqs may still be NULL if nothing could be read */
            fprintf(stderr,"Error reading background seqs from %s: %d seqs read\n",
                    v->bgfile,(v->bgseqs != NULL) ? (int)(v->bgseqs)->len : 0);
        }
        /* clear the input sequences (no longer needed) */
        for (n=0; n<(int)(v->fseqs)->len; n++) {
            oneseq=&g_array_index(v->fseqs,fastaseq,n);
            g_string_free(oneseq->header,TRUE);
            g_string_free(oneseq->seq,TRUE);
        }
        g_array_free((v->fseqs),TRUE);
    }
    if (v->bgseqs == NULL || (v->bgseqs)->len==0) {
        fprintf(stderr,"Unable to read background sequences: exiting\n");
        return 1;
    }
    /* remove dashes from bg sequences */
    v->bgseqs=stripdash(v->bgseqs);
    /* replace symbols with numbers */
    for (n=0; n<(int)(v->bgseqs)->len; n++) {
        oneseq=&g_array_index((v->bgseqs),fastaseq,n);
        newseqnum=NULL;
        seq2nums(oneseq->seq,&newseqnum);
        oneseq->seqnums=newseqnum;
    }

    if (!v->quiet)
        printf("setting background counts\n");
    if ((v->givenbgcounts)!=NULL) {
        /* counts given explicitly: one single-element array per base */
        if ((v->givenbgcounts)->len < 4) {
            fprintf(stderr,"Error: four background counts are required, got %d\n",
                    (int)(v->givenbgcounts)->len);
            return 1;
        }
        v->bgcount=g_ptr_array_new();
        for (n=0; n<4; n++) {
            ctr=g_array_index((v->givenbgcounts),double,n);
            onebgcount=g_array_new(TRUE,TRUE,sizeof(double));
            g_array_append_val(onebgcount,ctr);
            g_ptr_array_add((v->bgcount),onebgcount);
        }
    }
    else {
        /* counts taken from the background sequences */
        setbgcount(v->nbgc, v->bgseqs,&(v->bgcount),v->bgpscount);
    }

    /* now the background sequences can be removed */
    for (n=0; n<(int)(v->bgseqs)->len; n++) {
        oneseq=&g_array_index(v->bgseqs,fastaseq,n);
        g_string_free(oneseq->header,TRUE);
        g_string_free(oneseq->seq,TRUE);
        g_array_free(oneseq->seqnums,TRUE);
    }
    g_array_free((v->bgseqs),TRUE);

    /* now treat nbgc= -1, -2 same as 0 */
    if (v->nbgc<=-1)
        v->nbgc=0;

    if (v->seqmuset==NULL)
        v->seqmuset=g_array_new(TRUE,TRUE,sizeof(double));

    /* read in the motif file if one was given */
    if (strlen(v->motiffile)==0) {
        /* no motif file: make lists of length zero */
        v->priorbinbase = g_ptr_array_new();
        v->priorbinname = g_ptr_array_new();
    }
    else if (readmotiffile(v)) {
        fprintf(stderr,"Error reading motif file %s: exiting\n",v->motiffile);
        return 1;
    }

    return 0;
}


/* Sum the number of windows held by every bin of v->bin except bin 0. */
static int count_binned_windows(params *v)
{
    int n, total;

    total=0;
    for (n=1; n<(int)(v->bin)->len; n++)
        total += (int)((GPtrArray *)g_ptr_array_index(v->bin,n))->len;
    return total;
}

/*
 * setupwindows: create the candidate windows, settle the desired number
 * of windows (deswin) and colours (descol), build the initial bin
 * configuration and fill in the remaining move and iteration defaults.
 *
 * deswin/descol, when not both given, are taken from the init file
 * (posfile), else from the occupancies in v->occset, else guessed.
 * The initial configuration comes from the same three sources in the
 * same order of preference.
 *
 * Returns 0 on success and 1 on error (a message is written to stderr).
 */
int setupwindows(params *v) 
{
    int n,nwin,nbin,npercol,nextra,nprior;

    if (initwindows(v))
        return 1;

    /* If number of desired windows and colors not given, set them from
     * the input file or the -I option */
    if (v->deswin < 0 || v->descol < 0) {
        if (strlen(v->posfile)>0) {
            /* init file given: get the numbers from the file */
            initwincolnum(v);
        }
        else if ((v->occset)!=NULL) {
            /* occupancies given with -I: fill in the missing values */
            if (v->descol < 0)
                v->descol = (v->occset)->len;
            if (v->deswin < 0) {
                v->deswin = 0;
                for (n=0; n<(int)((v->occset)->len); ++n)
                    v->deswin += g_array_index((v->occset),int,n);
            }
            /* more colors than windows: clamp colors to windows */
            if (v->descol > v->deswin) {
                fprintf(stderr,"Warning: you specified more colors than windows.\n");
                fprintf(stderr,"Will set number of colors to number of windows\n");
                v->descol = v->deswin;
            }
        }
        else {
            /* nothing given: make a guess */
            if (v->deswin < 0)
                v->deswin = (int) ((v->win)->len)/(4*(v->wwidth));
            if ((v->deswin)<2)
                v->deswin=2;
            /* completely arbitrarily set number of colors to 3 when
             * nothing is specified */
            if (v->descol < 0)
                v->descol = 3;
            if (v->descol > v->deswin)
                v->descol = v->deswin;
        }
    }

    /* some sanity check */
    if (v->descol > v->deswin) {
        fprintf(stderr,"Warning you desired more colors than windows.\nWill set number of colors equal to number of windows\n");
        v->descol = v->deswin;
    }

    /* double check that things are working */
    if (v->deswin <= 0 || v->descol <= 0 || v->deswin >= (int)(v->win)->len) {
        fprintf(stderr,"something wrong with window number %d and color number %d after initializing\n",v->deswin,v->descol);
        return 1;
    }

    /* if using WM file check sanity of the parameters */
    nprior = (int)(v->priorbinbase)->len;
    if (nprior > 0) {
        /* signed arithmetic, so an unset (negative) numtfs cannot wrap
         * around to a large unsigned value */
        if ((v->numtfs + nprior) < v->descol) {
            fprintf(stderr,"Warning: you want more colors than you specified TFs existing\n");
            fprintf(stderr,"Will assume that beyond the %d TFs in the WM file there are %d other WMs\n", nprior, v->descol-nprior);
            v->numtfs = v->descol-nprior;
        }
    }

    /* if doing color moves we estimate chemical potentials, otherwise
     * set them zero */
    if (v->Ncolm == 0) {
        v->lambda = 0;
        v->mu = 0;
    }
    else if (estimatepriorparams(v)) {
        fprintf(stderr,"it is not guaranteed that the number of desired windows of specified length can be fitted to the input sequences, choose a smaller number of sequences or smaller windows\n");
        return 1;
    }

    if (strlen(v->blockedfile)>0)
        initpermblocked(v);

    /* this seems to be an extra call that is not needed */
    initbins(v);

    if (strlen(v->posfile)>0) {
        /* initfile supplied for starting window config */
        initbinconfig(v);
        nbin=(v->bin)->len-1;
        if (nbin==0)
            fprintf(stderr,"Error with init file %s: no initial windows selected\n",v->posfile);
        nwin=count_binned_windows(v);
    }
    else if ((v->occset)!=NULL) {
        /* initial window occs supplied with -I */
        if (initbinnumconfig(v)) {
            fprintf(stderr,"Error with init option -I: failed to assign all necessary windows\n");
            return 1;
        }
        nbin=(v->bin)->len-1;
        if (nbin==0)
            fprintf(stderr,"Error with init option -I: no initial windows selected\n");
        nwin=count_binned_windows(v);
    }
    else {
        /* set from the desired number of colors and windows; the
         * fallbacks are 3 colors and one window per 4*wwidth candidate
         * windows (at least 2) */
        if (v->descol > 0)
            nbin = v->descol;
        else
            nbin = 3;
        if (v->deswin > 0) {
            nwin = v->deswin;
        }
        else {
            nwin=((v->win)->len)/(4*(v->wwidth));
            if (nwin<2) nwin=2;
        }
        if (nbin > nwin) nbin = nwin;
        /* spread the windows evenly; the first color takes the remainder */
        nextra = (nwin % nbin);
        npercol = (int) (nwin/nbin);
        v->occset=g_array_new(TRUE,TRUE,sizeof(int));
        nextra = npercol+nextra;
        g_array_append_val(v->occset,nextra);
        for (n=1; n<nbin; n++)
            g_array_append_val(v->occset,npercol);
        if (initbinnumconfig(v)) {
            fprintf(stderr,"Error with guessed number of windows: failed to assign all necessary windows\n");
            return 1;
        }
    }

    /* only bin 0 present means nothing was assigned */
    if (v->bin->len == 1) {
        fprintf(stderr,"Error: no windows initialised\n");
        return 1;
    }

    if (strlen(v->trackedbinfile)>0) {
        inittrackedbins(v);
    }

    /* moves per cycle that were left on "auto" */
    if (v->Ncolm < 0)
        v->Ncolm = nwin;

    if (v->Nwinm==-1)
        v->Nwinm=nwin;

    if (v->Nshiftm==-1)
        v->Nshiftm=(v->descol)*2;

    /* iteration counts: derive unset ones from annealiters, or
     * annealiters itself from totaliters, else fall back to fixed values */
    if ((v->deepquenchiters == -1) &&(v->annealiters >= 0))
        v->deepquenchiters = (long int) (v->annealiters/30);

    if ((v->transientiters == -1) && (v->annealiters>=0))
        v->transientiters = (long int) (v->annealiters/10);

    if (v->annealiters == -1) {
        if (v->totaliters > 0) {
            v->annealiters = v->totaliters;
            v->transientiters = (long int) (v->annealiters/10);
            v->deepquenchiters = (long int) (v->annealiters/30);
        } else {
            v->totaliters = 100;
            v->annealiters = 100;
            v->transientiters = 10;
            v->deepquenchiters = 3;
        }
    }

    if (v->transientiters < 2)
        v->transientiters = 2;
    if (v->deepquenchiters < 2)
        v->deepquenchiters = 2;

    v->binbase=NULL;
    makebinbasecount(v);

    return 0;
}



/*
 * initprint: write a human-readable summary of the run settings held in
 * *v to stdout, one setting per line.  Nothing in *v is modified.
 */
void initprint(params *v) 
{
    int colour;
    GPtrArray *members;

    /* input and motif model */
    printf("Using sequence file (default seq.fna)                : %s\n",
           v->seqfile);
    printf("Using motif width (default 10)                       : %d\n",
           v->wwidth);

    if (v->usedialign==1) {
        printf("Using dialign alignment in input (default no)        : yes, \"soft constraint\"\n");
    } else if (v->usedialign==2) {
        printf("Using dialign alignment in input (default no)        : yes, \"hard constraint\"\n");
    }

    if (v->rcsymm) {
        printf("Searching with rev comp symmetry (default no)        : yes\n");
    } else if (v->usedir) {
        printf("Searching on both strands (rev comp, default yes)    : yes\n");
    } else {
        printf("Searching on both strands (rev comp, default yes)    : no\n");
    }

    printf("Using pseudocount in scoring function (default 1.0)  : %f\n",
           v->pseudocount);

    /* background model: the correlation line does not depend on where
     * the counts come from */
    if (v->nbgc==0) {
        printf("Using bg correl : singlesite (default)\n");
    } else {
        printf("Using bg correl (default: singlesite)  : %d nghbr correl\n",v->nbgc);
    }
    if (v->bgfile[0]=='\0') {
        printf("Using pseudocount for background counts (default 1.0): %f\n",
               v->bgpscount);
        printf("Using input sequence file for background counts\n");
    } else {
        printf("Using pseudocount for background counts (default 0.0): %f\n",v->bgpscount);
        printf("Using auxiliary file for bg counts                   : %s\n",v->bgfile);
    }

    /* moves per cycle */
    if (v->Ncolm==0) {
        printf("No colour change moves; number of colours stays fixed\n");
    } else {
        printf("Number of colour change moves/cycle (default auto)   : %d\n",
               v->Ncolm);
    }
    printf("Number of single window moves/cycle (default auto)   : %d\n",
           v->Nwinm);
    printf("Number of global shift moves/cycle (default auto)    : %d\n",
           v->Nshiftm);

    /* phases of the run; a phase with no cycles is not mentioned */
    if (v->transientiters>0) {
        printf("Transient for %ld cycles\n",v->transientiters);
    }
    if (v->annealiters>0) {
        printf("Anneal for %ld cycles\n",v->annealiters);
    }
    if (v->deepquenchiters>0) {
        printf("Deep quench for %ld cycles\n",v->deepquenchiters);
    }
    if (v->totaliters!=0) {
        printf("Tracking for %ld cycles\n", v->totaliters);
    }

    /* temperature and chemical potentials */
    printf("Starting inverse temperature beta (default 1.0)      : %f\n",
           v->beta);
    if (v->trackedbins!=NULL) {
        printf("Tracked windows listed in file (default none)        : %s\n",
               v->trackedbinfile);
    }
    if (v->betaincr==0.0) {
        printf("Not annealing (temperature fixed)\n");
    } else {
        printf("Using beta increment to anneal                       : %f\n",
               v->betaincr);
    }
    if (v->mu>0.0) {
        printf("Using chemical pot mu (default 0.0)                  : %f\n",
               v->mu);
    }
    if (v->lambda>0.0) {
        printf("Using color chem pot lambda (default 0.0)            : %f\n", v->lambda);
    }

    /* initial configuration; bin 0 is not counted as a colour */
    if (v->posfile[0]!='\0') {
        printf("Reading initial window conf from file (default none) : %s\n",
               v->posfile);
    }
    printf("Initial number of colours (nonzero) : %d\n",(int)((v->bin)->len)-1);
    if ((v->bin)->len > 1) {
        printf("Init number of windows in each colour (default auto) :");
        for (colour=1; colour<(int)(v->bin)->len; colour++) {
            members=g_ptr_array_index(v->bin,colour);
            printf(" %d",(int)members->len);
        }
        printf("\n");
    }

    if (v->blockedfile[0]!='\0') {
        printf("Reading blocked window conf from file (default none) : %s\n",
               v->blockedfile);
    }

    /* output and random number generator */
    if (v->outfile[0]=='\0') {
        printf("Sending output to file (default output)              : none\n");
    } else {
        printf("Sending output to file (default output)              : %s\n",
               v->outfile);
    }
    printf("Using GSL random number generator (default mt19937)  : %s\n",
           gsl_rng_name(v->gslrand));
    printf("Number of possible windows initialised               : %d\n",(int)v->win->len);

}
