/*                        PhyloGibbs                                  */

/*   Algorithm developed by Rahul Siddharthan, Erik van Nimwegen      * 
 *   and Eric D. Siggia at The Rockefeller University, New York, USA  *
 *   and at The Institute of Mathematical Sciences, Chennai, India    *
 *                                                                    *
 *   This code copyright (C) 2004-2006 Rahul Siddharthan              *
 *   Licensed under the GNU General Public License (see COPYING)      *
 *   For support and contact information, see the webpage:            *
 *             http://www.imsc.res.in/~rsidd/phylogibbs/              */

/* 
 * $Author: rsidd $  
 * $Date: 2006/03/27 18:44:20 $ 
 * $Id: splitwindow.c,v 1.3 2006/03/27 18:44:20 rsidd Exp $ 
 */


#include "interspecies.h"
#include <ctype.h>
#include <assert.h>

/* Return the position of window row `row` inside `removed`, or -1 when
 * that row has not been removed. */
static int sw_removed_slot(GArray *removed, int row)
{
	unsigned int k;

	for (k = 0; k < removed->len; k++)
		if (g_array_index(removed, int, k) == row)
			return (int) k;
	return -1;
}

/* Record that window row `row` leaves the current window.  `drop` is 1
 * when the row is discarded outright and 0 when it must be re-examined
 * in a recursive call.  The two arrays are kept index-parallel. */
static void sw_mark_removed(GArray *removed, GArray *dropflags,
							int row, int drop)
{
	g_array_append_val(removed, row);
	g_array_append_val(dropflags, drop);
}

/*
 *   splitwindow(): check whether a window satisfies the dialign
 *   constraints (basically, capital letters should line up properly)
 *   and, if it does not, split it into windows that do.
 *
 *   Strategy: for each pair of sequences, at each position, if both
 *   sequences have capital letters, check that the alignment columns
 *   are consistent.  Rows inconsistent with an earlier accepted row are
 *   moved to a second "test" window, on which splitwindow() is run
 *   recursively; the rows that remain form one "legal" window.
 *
 *   Rows are discarded completely (they appear in no output window)
 *   when their start is negative, when stop != start+wwidth-1, when
 *   their window text contains 'x' or 'X', or when a position falls
 *   outside the bare sequence while being compared.
 *
 *   Parameters:
 *     wwidth        window width, in characters
 *     win           the window to check; it is only read
 *     seq           pointer to the GArray of dialignseq that the
 *                   entries of win->seq index into
 *     splitwinset   output: *splitwinset is overwritten with a newly
 *                   allocated GArray of `window` (possibly empty).
 *                   The data pointers stored in those windows are the
 *                   same char* values found in win->data; the strings
 *                   themselves are not copied.
 *     recurselevel  current recursion depth; incremented around the
 *                   recursive call and restored afterwards
 *
 *   Returns 0 in all cases.
 */
int splitwindow(int wwidth, window *win, GArray **seq, GArray **splitwinset,
				int *recurselevel)
{
	int nseqs, l, l2, m, slot, hasx;
	int currseq, currseq2, currpos, currpos2, n2di, n2di2;
	int currstart, currstop;
	char *data, *data2, *currdata;
	const dialignseq *ref, *other;
	window thiswin, thiswin2;
	GArray *removedseqs, *dropthis;

	nseqs=win->seq->len;
	/* the recursion depth can never exceed the number of sequences */
	assert(*recurselevel <= (*seq)->len);

	removedseqs=g_array_new(TRUE,TRUE,sizeof(int));
	dropthis=g_array_new(TRUE,TRUE,sizeof(int));

	/* Pass 1: decide, row by row, which rows stay in this window.
	   Every row still present is compared with every other row still
	   present; rows that conflict are "removed".  The remaining set can
	   be assumed consistent, while splitwindow() will be called
	   recursively on the removed set. */
	for (l=0; l<nseqs; l++) {

		/* already removed by an earlier row: nothing to check */
		if (sw_removed_slot(removedseqs,l) >= 0)
			continue;

		if (g_array_index(win->start,int,l) < 0) {
			sw_mark_removed(removedseqs,dropthis,l,1);
			continue;
		}

		if (g_array_index(win->stop,int,l)
			!= (g_array_index(win->start,int,l)+wwidth-1)) {
			sw_mark_removed(removedseqs,dropthis,l,1);
			continue;
		}

		/* drop if this sequence contains an X */
		data=g_array_index(win->data,char*,l);
		hasx=0;
		for (m=0; m<wwidth; m++)
			if ((data[m]=='x')||(data[m]=='X')) {
				hasx=1;
				break;
			}
		if (hasx) {
			sw_mark_removed(removedseqs,dropthis,l,1);
			continue;
		}

		/* a single-row window has nothing to be compared against */
		if (nseqs <= 1)
			continue;

		/* these do not depend on the column, so look them up once */
		currseq=g_array_index(win->seq,int,l);
		ref=&g_array_index(*seq,dialignseq,currseq);

		for (m=0; m<wwidth; m++) {
			/* NOTE(review): currpos is not checked against
			   ref->barelength here, unlike currpos2 below; this
			   relies on the caller supplying a valid stop. */
			currpos=g_array_index(win->start,int,l)+m;
			n2di=g_array_index(ref->bare2di,int,currpos);

			for (l2=0; l2<nseqs; l2++) {
				if (l2==l || sw_removed_slot(removedseqs,l2) >= 0)
					continue;

				currseq2=g_array_index(win->seq,int,l2);
				currpos2=g_array_index(win->start,int,l2)+m;
				other=&g_array_index(*seq,dialignseq,currseq2);

				if (currpos2 < 0 || (currpos2 >= other->barelength)) {
					/* position outside the bare sequence: discard */
					sw_mark_removed(removedseqs,dropthis,l2,1);
					continue;
				}

				data2=g_array_index(win->data,char*,l2);
				n2di2=g_array_index(other->bare2di,int,currpos2);

				/* Conflict if (a) both window letters are capitals
				   but sit in different alignment columns, or (b)/(c)
				   a capital in one row sits in a column where the
				   other sequence has a capital that maps to a
				   different bare position.  The <ctype.h> arguments
				   are cast to unsigned char because passing a
				   negative char to isupper() is undefined. */
				if (((n2di2!=n2di)
					 && isupper((unsigned char) data2[m])
					 && isupper((unsigned char) data[m]))
					|| ((g_array_index(ref->di2bare,int,n2di2)!=currpos)
						&& isupper((unsigned char) data2[m])
						&& isupper((unsigned char)
								   g_array_index(ref->dialignseq,char,n2di2)))
					|| ((g_array_index(other->di2bare,int,n2di)!=currpos2)
						&& isupper((unsigned char) data[m])
						&& isupper((unsigned char)
								   g_array_index(other->dialignseq,char,n2di)))) {
					/* inconsistent with row l: re-test it later */
					sw_mark_removed(removedseqs,dropthis,l2,0);
				}
			}
		}
	}

	/* Pass 2: split into thiswin = confirmed good window, and
	   thiswin2 = to be checked again recursively */

	thiswin.seq=g_array_new(TRUE,TRUE,sizeof(int));
	thiswin.start=g_array_new(TRUE,TRUE,sizeof(int));
	thiswin.stop=g_array_new(TRUE,TRUE,sizeof(int));
	thiswin.data=g_array_new(TRUE,TRUE,sizeof(char*));

	/* thiswin2 is only needed (and only allocated) if something was
	   removed */
	if (removedseqs->len > 0) {
		thiswin2.seq=g_array_new(TRUE,TRUE,sizeof(int));
		thiswin2.start=g_array_new(TRUE,TRUE,sizeof(int));
		thiswin2.stop=g_array_new(TRUE,TRUE,sizeof(int));
		thiswin2.data=g_array_new(TRUE,TRUE,sizeof(char*));
	}

	for (l=0; l<nseqs; l++) {
		currseq=g_array_index(win->seq,int,l);
		currstart=g_array_index(win->start,int,l);
		currstop=g_array_index(win->stop,int,l);
		currdata=g_array_index(win->data,char*,l);

		slot=sw_removed_slot(removedseqs,l);
		if (slot < 0) {
			g_array_append_val(thiswin.seq,currseq);
			g_array_append_val(thiswin.start,currstart);
			g_array_append_val(thiswin.stop,currstop);
			g_array_append_val(thiswin.data,currdata);
		} else if (g_array_index(dropthis,int,slot)!=1) {
			/* removed but not dropped: goes to the re-test window */
			g_array_append_val(thiswin2.seq,currseq);
			g_array_append_val(thiswin2.start,currstart);
			g_array_append_val(thiswin2.stop,currstop);
			g_array_append_val(thiswin2.data,currdata);
		}
	}

	/* Either recurse on the re-test window (the recursive call creates
	   *splitwinset) or start a fresh, empty result set here. */
	if (removedseqs->len > 0 && thiswin2.seq->len > 0) {
		*splitwinset=NULL;
		*recurselevel += 1;
		splitwindow(wwidth,&thiswin2,seq,splitwinset,recurselevel);
		*recurselevel -= 1;
	} else
		*splitwinset=g_array_new(TRUE,TRUE,sizeof(window));

	/* The recursive call copies every entry it keeps into arrays of its
	   own, so the temporary containers of thiswin2 can be released.
	   Only the containers are freed; the char* strings they point to
	   are shared with win->data and are left alone. */
	if (removedseqs->len > 0) {
		g_array_free(thiswin2.seq,TRUE);
		g_array_free(thiswin2.start,TRUE);
		g_array_free(thiswin2.stop,TRUE);
		g_array_free(thiswin2.data,TRUE);
	}

	/* an empty "good" window is not reported */
	if (thiswin.seq->len > 0)
		g_array_append_val(*splitwinset,thiswin);
	else {
		g_array_free(thiswin.seq,TRUE);
		g_array_free(thiswin.start,TRUE);
		g_array_free(thiswin.stop,TRUE);
		g_array_free(thiswin.data,TRUE);
	}

	g_array_free(removedseqs,TRUE);
	g_array_free(dropthis,TRUE);
	return 0;
}
