/*                        PhyloGibbs                                  */

/*   Algorithm developed by Rahul Siddharthan, Erik van Nimwegen      * 
 *   and Eric D. Siggia at The Rockefeller University, New York       * 
 *                                                                    *
 *   This code copyright (C) 2004 Rahul Siddharthan <rsidd@online.fr> * 
 *   Licensed under the GNU General Public License (see COPYING)      */ 

/* 
 * $Author: rsidd $  
 * $Date: 2005/05/05 17:21:10 $ 
 * $Id: readDialign.c,v 1.3 2005/05/05 17:21:10 rsidd Exp $ 
 */

#include <glib.h>       
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "fasta.h"
#include "readfasta.h"
#include "interspecies.h"


/*
 * Tell whether c is an accepted sequence symbol.
 *
 *   n != 0 : restricted alphabet -- A, C, G, T, X in either case, plus '-'.
 *   n == 0 : extended alphabet -- the IUPAC nucleotide/ambiguity codes
 *            (A C G T R Y M K S W B D H V N) and X in either case, plus '-'.
 *
 * Returns 1 when c belongs to the selected alphabet, 0 otherwise.
 */
int legal(char c, int n)
{
    static const char restricted_symbols[] = "ACGTXacgtx-";
    static const char extended_symbols[] = "ACGTRYMKSWBDHVNXacgtrymkswbdhvnx-";
    const char *alphabet = n ? restricted_symbols : extended_symbols;

    /* strchr() also matches the terminating NUL, which is never a legal symbol */
    if (c == '\0')
        return 0;

    return strchr(alphabet, c) != NULL;
}
            

/*
 * readDialign: read the FASTA file named by v->seqfile (possibly the
 * output of dialign) and fill in the sequence tables held in v.
 *
 * On return:
 *   v->fseqs      the GArray of fastaseq produced by readfasta()
 *   v->seqgroups  GArray of int, one entry per sequence: the index of the
 *                 group that sequence belongs to.  A new group starts at
 *                 every sequence (other than the first) whose stored
 *                 header begins with '>'.
 *   v->seqarray   GArray of dialignseq, one entry per sequence, holding the
 *                 aligned sequence, the gap-free ("bare") sequence and the
 *                 coordinate maps between the two.
 *   v->usedialign changed from -1 to 1 if a '-' is found in the input.
 *
 * Returns 0 on success, 1 if no sequences were read or if a symbol that
 * is not recognized at all is found in the input.
 *
 * NOTE: on the "unrecognized symbol" error returns the arrays already
 * allocated for the sequence being processed are not released.
 */
int readDialign(params *v) 
{
    fastaseq fsequence;
    GArray *sequenceset;
    int n,nn,m,seqindex;
    int illegalchar,warned;
    char nnn;
    dialignseq presentseq;

    seqindex=0;
    illegalchar=0;
    warned=0;
    sequenceset=NULL;

    if (readfasta(v->seqfile, &sequenceset)) {
        /* sequenceset may still be NULL if readfasta failed before
         * allocating it, so do not dereference it unconditionally */
        fprintf(stderr, "Warning: error reading sequences, %u sequences read\n",
                sequenceset ? sequenceset->len : 0u);
    }

    /* All sequences are now in one long list; each entry carries its
     * header line (the comments in this file indicate that the leading
     * '>' has been removed by the reader). */
    v->fseqs = sequenceset;
    if (sequenceset==NULL || sequenceset->len==0)
        return 1;

    /* assume uninitialised */
    v->seqgroups=g_array_new(TRUE,TRUE,sizeof(int));
    v->seqarray=g_array_new(FALSE,FALSE,sizeof(dialignseq));

    /* If alignment mode was not requested, look for gap symbols anyway */
    if (v->usedialign <= 0) {
        int founddash=0;

        for (nn=0; nn<sequenceset->len && !founddash; nn++) {
            fsequence=g_array_index(sequenceset,fastaseq,nn);
            if (memchr(fsequence.seq->str,'-',fsequence.seq->len)!=NULL)
                founddash=1;
        }

        /* Only the first dash matters: warn once and stop looking */
        if (founddash) {
            if (v->usedialign==-1) {
                v->usedialign=1;
                fprintf(stderr,"Warning: looks like aligned sequence, assuming -D 1 (override if needed)\n");
            } else {
                fprintf(stderr,"Warning: -D 0 specified, but dashes \"-\" found in input.  Ignoring dashes\n");
            }
        }
    }

    /* Build one dialignseq per input sequence */
    for (nn=0; nn<sequenceset->len; nn++) {
        fsequence=g_array_index(sequenceset,fastaseq,nn);
        presentseq.name=g_string_new(fsequence.header->str);

        /* A header that itself starts with '>' opens a new group, except
         * for the very first sequence: a file that never uses ">>" is a
         * single group in which all sequences are assumed aligned. */
        if (fsequence.header->str[0]=='>' && nn>0)
            seqindex++;

        /* Record the group index of this sequence */
        g_array_append_val(v->seqgroups,seqindex);

        presentseq.dialignlength=fsequence.seq->len;
        presentseq.dialignseq=g_array_new(TRUE,TRUE,sizeof(char));
        presentseq.bareseq=g_array_new(TRUE,TRUE,sizeof(char));
        presentseq.bare2di=g_array_new(TRUE,TRUE,sizeof(int));
        presentseq.di2bare=g_array_new(TRUE,TRUE,sizeof(int));

        for (n=0; n<fsequence.seq->len; n++) {
            nnn=fsequence.seq->str[n];

            if (v->usedialign>0) {
                /* Aligned input: only -, A, C, G, T, X are kept as is;
                 * other IUPAC symbols are replaced by X. */
                if (!legal(nnn,1)) {
                    if (!legal(nnn,0)) {
                        fprintf(stderr,"Character \"%c\" found in input!\nThis is not a recognized base, exiting\n",nnn);
                        return 1;
                    }
                    illegalchar=1;
                    nnn='X';
                }
                g_array_append_val(presentseq.dialignseq,nnn);
            } else {
                /* Unaligned input: fold upper case to lower case, then
                 * accept any symbol of the extended alphabet.  Note that
                 * legal(c,0) accepts '-', so dashes are stored here and
                 * only dropped when the bare sequence is built below. */
                if (nnn>='A' && nnn<='Z')
                    nnn=(char)(nnn+('a'-'A'));
                if (!legal(nnn,0)) {
                    fprintf(stderr,"Error: illegal symbol \"%c\" found in input!\nWith -D 0, only IUPAC nucleotide or ambiguous symbols or \"X\" allowed\n",nnn);
                    return 1;
                }
                g_array_append_val(presentseq.dialignseq,nnn);
            }
        }

        /* Report symbol replacement once, not once per later sequence */
        if (illegalchar && !warned) {
            fprintf(stderr,"Warning with -D 1 or -D 2, only -, A, C, G, T, X are allowed symbols\nReplaced other symbols with X\n");
            warned=1;
        }

        /* Single pass over the aligned sequence building:
         *   bareseq  the sequence with every '-' removed
         *   di2bare  for each aligned position, the number of non-gap
         *            symbols that precede it
         *   bare2di  for each bare position, its aligned position
         */
        m=0;
        for (n=0; n<presentseq.dialignlength; n++) {
            g_array_append_val(presentseq.di2bare,m);
            if (g_array_index(presentseq.dialignseq,char,n)!='-') {
                g_array_append_val(presentseq.bareseq,g_array_index(presentseq.dialignseq,char,n));
                g_array_append_val(presentseq.bare2di,n);
                m++;
            }
        }
        presentseq.barelength=m;

        g_array_append_val(v->seqarray,presentseq);
    }
    return 0;
}
