/*                        PhyloGibbs                                  */

/*   Algorithm developed by Rahul Siddharthan, Erik van Nimwegen      * 
 *   and Eric D. Siggia at The Rockefeller University, New York       * 
 *                                                                    *
 *   This code copyright (C) 2004 Rahul Siddharthan <rsidd@online.fr> * 
 *   Licensed under the GNU General Public License (see COPYING)      */ 

/* 
 * $Author: rsidd $  
 * $Date: 2005/05/22 13:29:54 $ 
 * $Id: trackedbinprint.c,v 1.5 2005/05/22 13:29:54 rsidd Exp $ 
 */

#include <stdio.h>
#include <math.h>
#include <ctype.h>
#include <string.h>
#include <gsl/gsl_permutation.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_sort.h>
#include <gsl/gsl_sort_vector.h>
#include "interspecies.h"
#include "commonroutines.h"

/* Numerically ln(2) (0.693147...). Not referenced by the functions in the
 * visible part of this file; presumably used for nats<->bits conversion
 * elsewhere -- TODO confirm against callers. */
double LTWO =  0.693147181;

/*
 * getseqshortstr: extract a short label from a sequence name.
 *
 * Skips any leading '>' and whitespace characters in seq, then copies the
 * following run of non-whitespace characters into outseq, stopping after at
 * most `count` characters or at the end of the string, whichever comes first.
 * outseq is always NUL-terminated.
 *
 *   seq     NUL-terminated input string (not modified).
 *   outseq  destination buffer; must hold at least count+1 bytes.
 *   count   maximum number of characters to copy; a value <= 0 yields "".
 *
 * Exactly len+1 bytes (len <= count) are written to outseq, so a buffer of
 * count+1 bytes can never be overrun, and the whitespace that terminates the
 * token is never copied.
 */
void getseqshortstr(char *seq, char *outseq, int count) {
    int start = 0;
    int len = 0;

    /* The NUL terminator is neither '>' nor whitespace, so this stops at
     * the end of the string at the latest. */
    while (seq[start] == '>' || isspace((unsigned char) seq[start]))
        start++;

    /* Test for the terminator before looking at the character class so we
     * never read past the end of seq. */
    while (len < count
           && seq[start + len] != '\0'
           && !isspace((unsigned char) seq[start + len]))
        len++;

    memcpy(outseq, seq + start, (size_t) len);
    outseq[len] = '\0';
}

/*
 * trackedbinprint: write the tracking report to v->trackedprintfile
 * (or to stdout when that name is the literal string "stdout").
 *
 * The report consists of:
 *   - a header: command-line arguments, one line per input sequence, the
 *     random seed, move counts and, when v->countent > 0, the average
 *     log-posterior (v->meanent / v->countent);
 *   - for every tracked bin that has at least one window whose tracked
 *     occupancy (trackedocc[bin] / Niter) exceeds v->trackingcutoff:
 *       * the windows of that bin, highest occupancy first, each printed
 *         with wwidth/2 flanking characters (lower case) on either side,
 *         its orientation, sequence index, short sequence name, position
 *         and (first line of the window only) occupancy;
 *       * a weight matrix of base counts in column order A, C, G, T,
 *         accumulated from the windows (forward counts weighted by
 *         thisocc - thisrev, reverse-complemented counts weighted by
 *         thisrev), plus the prior counts of the reference motif if the
 *         bin has one.
 *     Bins are printed in order of decreasing best window occupancy.
 *
 *   v      run parameters and tracking state (read only here).
 *   Niter  number of tracking iterations; occupancy counts are divided by it.
 *
 * Returns 0 on success, 1 if the output file could not be opened or could
 * not be closed cleanly (a diagnostic is written to stderr via perror).
 */
int trackedbinprint(params *v, long int Niter)
{
    int l,m,m1,n,n1,n2, currpos,thisdir,motifnumber,wmpos,thisstart,refnumber;
    int hasref, inflank, status;
    double thisocc,thisrev,bestthres,nA,nC,nG,nT;
    /* Label buffer for getseqshortstr(..., 10); a few spare bytes beyond
     * count+1 keep the terminator in bounds. */
    char *dataptr, outchar, tempstr[16];
    GPtrArray *binwin1;
    GArray *thisbasecountset;
    dialignseq tempseq;

    FILE *outputfile;
    gsl_matrix *basecounts;
    GPtrArray *trackedbinset;   /* one GPtrArray of window* per tracked bin */
    GString *thisname;
    window *thiswin;
    GPtrArray *thispriorbinbase; /* base counts for a reference WM */
    GArray *onebinbase;          /* base counts of one WM column */

    gsl_permutation *win_permut, *bin_permut;
    gsl_vector *winscorevec, *binscorevec;

    thispriorbinbase = NULL;
    thisname = NULL;
    win_permut = NULL;
    winscorevec = NULL;
    status = 0;

    binscorevec=gsl_vector_alloc((v->trackedbins)->len);
    bin_permut=gsl_permutation_alloc((v->trackedbins)->len);
    basecounts=gsl_matrix_alloc(v->wwidth,4);

    if (strcmp(v->trackedprintfile,"stdout")==0)
        outputfile=stdout;
    else
        outputfile=fopen(v->trackedprintfile,"wt");

    /* Without this check every fprintf below would dereference NULL. */
    if (outputfile == NULL) {
        perror(v->trackedprintfile);
        gsl_vector_free(binscorevec);
        gsl_permutation_free(bin_permut);
        gsl_matrix_free(basecounts);
        return 1;
    }

    /* ---- report header ---- */
    fprintf(outputfile,"Command-line arguments: %s\n",v->arguments);
    for (n=0; n<(v->seqarray)->len; n++) {
        tempseq=g_array_index((v->seqarray),dialignseq,n);
        fprintf(outputfile,"Seq %3d: %s Length %d\n",n,tempseq.name->str,tempseq.barelength);
    }
    fprintf(outputfile,"\n");
    fprintf(outputfile,"GSL random number seed %ld\n",v->seed);
    fprintf(outputfile,"No. of moves: colour %ld, single window %ld, shift %ld, total %ld\n",v->tcsteps,v->twsteps,v->tssteps,v->tcsteps+v->twsteps+v->tssteps);
    if (v->countent > 0) {
        fprintf(outputfile,"Average log-posterior probability of sampled configurations: %lf\n\n",(v->meanent)/((double) v->countent));
    }
    fprintf(outputfile,"\n");

    fprintf(outputfile,"== Posterior probabilities obtained through tracking the reference state. ==\n");

    /* ---- for each tracked bin, collect the windows above the cutoff and
     *      remember the best occupancy seen in that bin ---- */
    trackedbinset=g_ptr_array_new();
    for (n=0; n<(v->trackedbins)->len; n++) {
        bestthres=0;
        binwin1=g_ptr_array_new();
        for (m=0; m<(v->win)->len; m++) {
            thiswin=&g_array_index((v->win),window,m);

            if (thiswin->trackedocc != NULL) {
                if (thiswin->trackedocc->len > n) {
                    thisocc=g_array_index(thiswin->trackedocc,int,n)/(Niter*1.0);
                    if (thisocc > v->trackingcutoff) {
                        g_ptr_array_add(binwin1,thiswin);
                        if (thisocc > bestthres)
                            bestthres=thisocc;
                    }
                }
            }
        }
        g_ptr_array_add(trackedbinset,binwin1);
        gsl_vector_set(binscorevec,n,bestthres);
    }
    assert(trackedbinset->len==(v->trackedbins)->len);

    /* Ascending sort; the loop below walks the permutation from the end so
     * the bin with the highest best-occupancy is printed first. */
    gsl_sort_vector_index(bin_permut,binscorevec);

    for (n=0; n<trackedbinset->len; n++) {
        motifnumber=gsl_permutation_get(bin_permut,trackedbinset->len-n-1);
        binwin1=g_ptr_array_index(trackedbinset,motifnumber);

        /* only print a motif when it has a nonzero number of windows */
        if (binwin1->len == 0)
            continue;

        /* Reference number of this bin (1-based index into the prior
         * arrays).  Values outside 1..len mean "no reference motif"; the
         * explicit lower bound keeps refnumber-1 from indexing at -1. */
        refnumber=g_array_index(v->binnums,int,motifnumber);
        hasref = (refnumber >= 1) && ((guint) refnumber <= (v->priorbinbase)->len);

        if (!hasref) {
            fprintf(outputfile, "\nTracking stats motif %d\n--------------\n",motifnumber+1);
        } else {
            thisname = g_ptr_array_index(v->priorbinname,refnumber-1);
            fprintf(outputfile, "\nTracking stats motif %d, Reference %d %s\n---------------\n",motifnumber+1,refnumber,thisname->str);
            thispriorbinbase = g_ptr_array_index(v->priorbinbase,refnumber-1);
        }

        /* occupancy of every window in this bin, used for ordering */
        winscorevec=gsl_vector_alloc(binwin1->len);
        win_permut=gsl_permutation_alloc(binwin1->len);
        for (m=0; m<binwin1->len; m++) {
            thiswin=g_ptr_array_index(binwin1,m);
            if (thiswin->trackedocc!=NULL)
                gsl_vector_set(winscorevec,m,g_array_index(thiswin->trackedocc,int,motifnumber)/(Niter*1.0));
        }
        if (binwin1->len > 1)
            gsl_sort_vector_index(win_permut,winscorevec);
        gsl_matrix_set_zero(basecounts);

        /* go over all windows in the bin, highest occupancy first */
        for (m=0; m<binwin1->len; m++) {
            if (binwin1->len>1)
                m1=gsl_permutation_get(win_permut,binwin1->len-m-1);
            else
                m1=m;

            /* current window, its occupancy and its reverse-strand share */
            thiswin=g_ptr_array_index(binwin1,m1);
            thisocc=gsl_vector_get(winscorevec,m1);
            thisrev = (double) (g_array_index(thiswin->trackedrev,int,motifnumber)/(Niter*1.0));
            if (!v->usedir) {
                thisdir=0;
            } else {
                /* choose the direction that was most common in tracking */
                if ((thisocc-thisrev) >= thisrev)
                    thisdir = 0;
                else
                    thisdir = 1;
            }

            /* Accumulate base counts from the window in both directions:
             * the reverse complement (mirrored column, complemented base)
             * weighted by thisrev, the forward counts by the remainder. */
            for (n1=0; n1<v->wwidth; n1++) {
                thisbasecountset=g_ptr_array_index(thiswin->basecount,v->wwidth-1-n1);
                for (n2=0; n2<4; n2++)
                    gsl_matrix_set(basecounts,n1,n2,
                                   gsl_matrix_get(basecounts,n1,n2)
                                   +g_array_index(thisbasecountset,double,3-n2)*thisrev);

                thisbasecountset=g_ptr_array_index(thiswin->basecount,n1);
                for (n2=0; n2<4; n2++)
                    gsl_matrix_set(basecounts,n1,n2,
                                   gsl_matrix_get(basecounts,n1,n2)
                                   +g_array_index(thisbasecountset,double,n2)*(thisocc-thisrev));
            }

            /* one output line per sequence covered by this window */
            for (n1=0; n1<thiswin->seq->len; n1++) {
                if (thisdir) {
                    /* reverse strand: walk right to left and complement */
                    for (l=v->wwidth-1+v->wwidth/2; l>=-v->wwidth/2; l--) {
                        inflank = !((l<=v->wwidth-1)&&(l>=0));
                        outchar='x';
                        if (inflank) {
                            /* flanks falling off the sequence print as blanks */
                            currpos=g_array_index(thiswin->start,int,n1)+l;
                            tempseq=g_array_index((v->seqarray),dialignseq,
                                                  g_array_index(thiswin->seq,int,n1));
                            if ((currpos<0) || (currpos>=tempseq.bareseq->len))
                                outchar=' ';
                        }
                        if (outchar != ' ') {
                            dataptr=g_array_index(thiswin->data,char*,n1);
                            switch (*(dataptr+l)) {
                            case 'a': case 'A': outchar='T'; break;
                            case 'c': case 'C': outchar='G'; break;
                            case 'g': case 'G': outchar='C'; break;
                            case 't': case 'T': outchar='A'; break;
                            case 'r': case 'R': outchar='Y'; break;
                            case 'y': case 'Y': outchar='R'; break;
                            case 'm': case 'M': outchar='K'; break;
                            case 'k': case 'K': outchar='M'; break;
                            case 's': case 'S': outchar='S'; break;
                            case 'w': case 'W': outchar='W'; break;
                            case 'b': case 'B': outchar='V'; break;
                            case 'v': case 'V': outchar='B'; break;
                            case 'd': case 'D': outchar='H'; break;
                            case 'h': case 'H': outchar='D'; break;
                            case 'n': case 'N': outchar='N'; break;
                            default: break; /* unknown symbol stays 'x' */
                            }

                            if (inflank)
                                outchar=tolower((unsigned char) outchar);
                        }
                        fputc(outchar,outputfile);
                    }

                    if (n1==0)
                        fprintf(outputfile," -- [rev] ");
                    else if (n1==thiswin->seq->len-1)
                        fprintf(outputfile," `- [rev] ");
                    else
                        fprintf(outputfile," |- [rev] ");
                } else {
                    /* forward strand: site in upper case, flanks in lower */
                    for (l=-v->wwidth/2; l<v->wwidth+v->wwidth/2; l++) {
                        if ((l>=0)&&(l<v->wwidth)) {
                            dataptr=g_array_index(thiswin->data,char*,n1);
                            outchar=toupper((unsigned char) *(dataptr+l));
                            fputc(outchar,outputfile);
                        } else {
                            currpos=g_array_index(thiswin->start,int,n1)+l;
                            tempseq=g_array_index((v->seqarray),dialignseq,
                                                  g_array_index(thiswin->seq,int,n1));
                            if ((currpos>=0) && (currpos<tempseq.bareseq->len)) {
                                dataptr=g_array_index(thiswin->data,char*,n1);
                                outchar=tolower((unsigned char) *(dataptr+l));
                                fputc(outchar,outputfile);
                            } else {
                                outchar=' ';
                                fputc(outchar,outputfile);
                            }
                        }
                    }

                    if (n1==0)
                        fprintf(outputfile," -- [fwd] ");
                    else if (n1==thiswin->seq->len-1)
                        fprintf(outputfile," `- [fwd] ");
                    else
                        fprintf(outputfile," |- [fwd] ");
                }

                tempseq=g_array_index((v->seqarray),dialignseq,g_array_index(thiswin->seq,int,n1));
                getseqshortstr(tempseq.name->str,tempstr,10);
                thisstart= g_array_index(thiswin->start,int,n1);
                /* with printreverse, report the position as a negative
                 * offset from the end of the sequence */
                if (v->printreverse)
                    thisstart=-(tempseq.barelength-thisstart);
                if (n1==0)
                    fprintf(outputfile," seq  %3d %11s  pos  %4d  prob %4.2f",
                            g_array_index(thiswin->seq,int,0), tempstr,
                            thisstart,thisocc);
                else
                    fprintf(outputfile," seq  %3d %11s  ",
                            g_array_index(thiswin->seq,int,n1), tempstr);

                if (n1!=0)
                    fprintf(outputfile, "pos  %4d  ",thisstart);

                fprintf(outputfile,"\n");
            }
        }

        /* ---- weight matrix for this bin ---- */
        fprintf(outputfile,"-------- Weight matrix for this motif (absolute base counts)---------\n");
        fprintf(outputfile,"//\n");
        if (!hasref)
            fprintf(outputfile, "NA Motif_%d\n",motifnumber+1);
        else
            fprintf(outputfile, "NA Motif_%d, Reference %d %s\n",motifnumber+1,refnumber,thisname->str);

        fprintf(outputfile,"%2s    %6c     %6c     %6c     %6c     %6s      %6s\n","PO",'A','C','G','T',"cons","inf");
        for (wmpos=0; wmpos<v->wwidth; wmpos++) {
            nA=gsl_matrix_get(basecounts,wmpos,0);
            nC=gsl_matrix_get(basecounts,wmpos,1);
            nG=gsl_matrix_get(basecounts,wmpos,2);
            nT=gsl_matrix_get(basecounts,wmpos,3);
            if (hasref) {
                /* add the prior counts of the reference motif */
                onebinbase = g_ptr_array_index(thispriorbinbase,wmpos);
                nA += g_array_index(onebinbase,double,0);
                nC += g_array_index(onebinbase,double,1);
                nG += g_array_index(onebinbase,double,2);
                nT += g_array_index(onebinbase,double,3);
            }
            fprintf(outputfile,
                    "%2s    %6.2f     %6.2f     %6.2f     %6.2f     %6c      %6.2f\n",
                    twodigitstr(wmpos+1), nA,nC,nG,nT,
                    consensus(nA,nC,nG,nT),
                    infscore(nA,nC,nG,nT));
        }
        fprintf(outputfile,"//\n");
        gsl_vector_free(winscorevec);
        gsl_permutation_free(win_permut);

        fprintf(outputfile,"==============================\n");
    }

    if (strcmp(v->trackedprintfile,"stdout")!=0) {
        /* fclose flushes; a failure here means the report is incomplete */
        if (fclose(outputfile) != 0) {
            perror(v->trackedprintfile);
            status = 1;
        }
    }

    /* Release the per-bin window lists.  Only the pointer arrays are freed;
     * the window structs they point to live in v->win. */
    for (n=0; n<trackedbinset->len; n++)
        g_ptr_array_free(g_ptr_array_index(trackedbinset,n), TRUE);
    g_ptr_array_free(trackedbinset, TRUE);

    gsl_vector_free(binscorevec);
    gsl_permutation_free(bin_permut);
    gsl_matrix_free(basecounts);

    return status;
}
