/*                        PhyloGibbs                                  */

/*   Algorithm developed by Rahul Siddharthan, Erik van Nimwegen      * 
 *   and Eric D. Siggia at The Rockefeller University, New York       * 
 *                                                                    *
 *   This code copyright (C) 2004 Rahul Siddharthan <rsidd@online.fr> * 
 *   Licensed under the GNU General Public License (see COPYING)      */ 

/*
 * $Id: readfasta.c,v 1.2 2005/05/05 14:04:20 rsidd Exp $
 */

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <glib.h>
#include <assert.h>
#include "fasta.h"
#include "seq2nums.h"

/*
 * chomp: strip a trailing line terminator from s, in place.
 *
 * Removes one trailing '\n' if present, and then one trailing '\r' if
 * present, so that "text\n", "text\r\n" and "text\r" all become "text".
 * An empty string is left untouched; the length is checked before each
 * access so the byte preceding the buffer is never read or written.
 *
 * s must be a valid, writable, NUL-terminated string.
 */
void chomp(char *s) {
    size_t len = strlen(s);

    if (len > 0 && s[len-1]=='\n')
        s[--len]='\0';
    /* len may have dropped to zero above, so it is re-checked here */
    if (len > 0 && s[len-1]=='\r')
        s[--len]='\0';
}

/*
 * readfasta: read the records of a FASTA file into *fastaseqs.
 *
 * filename   path of the file to read.
 * fastaseqs  in/out.  If *fastaseqs is non-NULL on entry, the GString
 *            members of every element and the array itself are freed
 *            first.  *fastaseqs is then always replaced by a newly
 *            allocated GArray of fastaseq, to which one element is
 *            appended per record read.  The caller owns the array and
 *            the header/seq GStrings of its elements.
 *
 * Input format as parsed here:
 *   - a line whose first character is '>' starts a record; the rest of
 *     the line (line terminator stripped) is the header;
 *   - the following lines are concatenated, terminators stripped, to form
 *     the sequence;
 *   - a line whose first character is whitespace, ';' or '#' is skipped;
 *     if the current record already has sequence data, such a line also
 *     ends that record, and the next line read must be a header;
 *   - a '>' line arriving while the current record has no sequence data
 *     yet replaces that record's header.
 *
 * Returns 0 on success and 1 on failure.  Failure means: the file could
 * not be opened, a header was expected but the line did not start with
 * '>', or end of file was reached while skipping lines after a header
 * that had no sequence data.  On failure *fastaseqs still holds whatever
 * records were completed before the error.
 *
 * NOTE: lines are read with fgets() into a fixed 131072-byte buffer, so a
 * longer physical line arrives in several pieces and each piece is
 * treated as a line of its own.
 */
int readfasta(const char *filename, GArray **fastaseqs) {

    FILE *seqfile;
    fastaseq oneseq;
    guint n;
    int headernext;
    int status;
    char s[131072];

    /* release whatever the caller passed in from a previous read */
    if (*fastaseqs!=NULL) {
        for (n=0; n<(*fastaseqs)->len; n++) {
            oneseq=g_array_index((*fastaseqs),fastaseq,n);
            g_string_free(oneseq.header,TRUE);
            g_string_free(oneseq.seq,TRUE);
        }
        g_array_free((*fastaseqs),TRUE);
    }

    /* new, empty set of sequences */
    *fastaseqs=g_array_new(TRUE,TRUE,sizeof(fastaseq));

    /* open before allocating the pending record, so that a failed open
       has nothing to clean up */
    seqfile=fopen(filename,"rt");
    if (seqfile==NULL)
        return 1;

    /*
     * oneseq is the record under construction.  Invariant: oneseq.seq is
     * always a GString not yet owned by the array; oneseq.header is
     * non-NULL exactly when headernext is 0.
     */
    headernext=1;
    status=0;
    oneseq.header=NULL;
    oneseq.seq=g_string_new("");

    while (fgets(s,sizeof s,seqfile)) {

        /* skip lines that start with whitespace, ';' or '#'; the cast
           keeps isspace() defined for bytes >= 0x80 where char is signed */
        while (isspace((unsigned char)s[0])||(s[0]==';')||(s[0]=='#')) {
            if (!fgets(s,sizeof s,seqfile)) {
                /* end of file: a header with no sequence data is an
                   error, anything else is a normal end of input */
                if (!headernext && oneseq.seq->len==0)
                    status=1;
                goto done;
            }
            /* the skipped line ends a record that already has data */
            if (oneseq.seq->len > 0) {
                g_array_append_val((*fastaseqs),oneseq);
                oneseq.header=NULL;
                oneseq.seq=g_string_new("");
                headernext=1;
            }
        }

        if (headernext) {
            /* a header is required here */
            if (s[0]!='>') {
                status=1;
                goto done;
            }
            chomp(s);
            oneseq.header=g_string_new(s+1);
            headernext=0;
        } else {
            chomp(s);
            if (s[0]!='>') {
                /* sequence data for the current record */
                g_string_append(oneseq.seq,s);
            } else if (oneseq.seq->len==0) {
                /* new header before any data: replace the old header */
                g_string_free(oneseq.header,TRUE);
                oneseq.header=g_string_new(s+1);
            } else {
                /* new header after data: store the finished record and
                   start the next one */
                g_array_append_val((*fastaseqs),oneseq);
                oneseq.header=g_string_new(s+1);
                oneseq.seq=g_string_new("");
            }
        }
    }

done:
    fclose(seqfile);

    if (status==0 && !headernext) {
        /* the record in progress is complete: the array takes ownership */
        g_array_append_val((*fastaseqs),oneseq);
    } else {
        /* nothing to store: free the strings still owned by oneseq */
        if (oneseq.header!=NULL)
            g_string_free(oneseq.header,TRUE);
        g_string_free(oneseq.seq,TRUE);
    }

    return status;
}

