#include "procse.h"
#include "procse_State.h"
#include "procse_Record.h"
#include "sym_eigen.h"

/* local type and compare function used for qsort() */

/* An integer id paired with the value it is ranked by; element type of the
   scratch buffers that are handed to qsort(). */
struct SortEl
{
	int id;
	double sig;
};

/* qsort() comparator that orders SortEl records by DESCENDING sig.
   Returns 1 when b's sig exceeds a's, -1 when a's exceeds b's, and 0 when
   the difference is neither positive nor negative. */
static int compare_sig(const void *a, const void *b)
{
	const SortEl *lhs = static_cast<const SortEl *>(a);
	const SortEl *rhs = static_cast<const SortEl *>(b);
	const double diff = rhs->sig - lhs->sig;
	return (diff > 0) - (diff < 0);
}

/* Create a detached list node for the TGA with id a_tga_id: it has no
   neighbour on either side and its join_time starts at -1. */
TGAClusterNode::TGAClusterNode(int a_tga_id) :
		tga_id(a_tga_id), join_time(-1), previous(0), next(0)
{
}

/**********************************************************************
Record implementation
**********************************************************************/

/* Store the log stream and the log dump period supplied by the caller and
   stamp last_log_time with the current time. */
Record::Record(FILE *a_log_file, long int a_log_dump_period)
	: log_file(a_log_file),
	  log_dump_period(a_log_dump_period)
{
	last_log_time = time(0);
}

/**********************************************************************
RecordWithWeightMatrix implementation
**********************************************************************/
/* Allocate the buffers shared by the weight-matrix based records.
   a_log_file and a_log_dump_period are forwarded to Record; a_q, a_num_tga
   and a_cluster_member_sig_cutoff are stored as q, num_tga and
   cluster_member_sig_cutoff; a_filename_prefix is kept by pointer (the text
   is not copied).  Terminates the process with exit(1) if any buffer
   pointer is null after allocation. */
RecordWithWeightMatrix::RecordWithWeightMatrix(FILE *a_log_file,
		long int a_log_dump_period, double a_q, int a_num_tga,
		double a_cluster_member_sig_cutoff,
		const char *a_filename_prefix) :
		Record(a_log_file, a_log_dump_period), q(a_q),
		num_tga(a_num_tga),
		cluster_member_sig_cutoff(a_cluster_member_sig_cutoff)
{
	filename_prefix = a_filename_prefix;

	/* tables holding num_tga entries for each of num_tga clusters */
	sorted_tga_id = new int[num_tga * num_tga];
	sorted_tga_sig = new double[num_tga * num_tga];
	tga_aw_left = new int[num_tga * num_tga];
	tga_aw_sense = new bool[num_tga * num_tga];
	/* one block of aw_len * NUM_BASES doubles per cluster */
	weight_matrix = new double[num_tga * TGA::get_aw_len() * TGA_Raw::NUM_BASES];
	/* one value per cluster */
	final_entropy = new double[num_tga];
	final_information_score = new double[num_tga];
	final_information_rank = new int[num_tga];

	const bool have_pair_tables = sorted_tga_id && sorted_tga_sig &&
			tga_aw_left && tga_aw_sense;
	const bool have_cluster_tables = weight_matrix && final_entropy &&
			final_information_score && final_information_rank;
	if (!(have_pair_tables && have_cluster_tables)) {
		fprintf(stderr,"Error : out of memory in RecordWithWeightMatrix constructor\n");
		exit(1);
	}

	/* A final_entropy that is still -DBL_MAX later on means the cluster was
	   skipped (too few members or too little significance). */
	for (int c = num_tga - 1; c >= 0; --c) {
		final_entropy[c] = -DBL_MAX;
	}
}

/* Release every array allocated by the constructor. */
RecordWithWeightMatrix::~RecordWithWeightMatrix()
{
	/* per (cluster, tga) tables */
	delete[] sorted_tga_id;
	delete[] sorted_tga_sig;
	delete[] tga_aw_left;
	delete[] tga_aw_sense;
	/* per cluster data */
	delete[] weight_matrix;
	delete[] final_entropy;
	delete[] final_information_score;
	delete[] final_information_rank;
}

/* Fill final_information_score for every cluster.
   A cluster whose final_entropy is negative gets the sentinel -DBL_MAX.
   Otherwise the score is the sum of w * log(4 * w) over all non-zero
   entries w of that cluster's weight-matrix block. */
void RecordWithWeightMatrix::calculate_information_scores()
{
	for (int c = 0; c < num_tga; ++c) {
		double &score = final_information_score[c];
		if (final_entropy[c] < 0.0) {
			score = -DBL_MAX;
			continue;
		}
		score = 0.0;
		/* first entry of this cluster's block inside weight_matrix */
		const double *block = weight_matrix + c*TGA::get_aw_len()*TGA_Raw::NUM_BASES;
		for (int cell = 0; cell < TGA::get_aw_len()*TGA_Raw::NUM_BASES; ++cell) {
			const double w = block[cell];
			if (w == 0.0) continue;	/* zero entries contribute nothing */
			score += w * log(4.0 * w);
		}
	}
}

void RecordWithWeightMatrix::calculate_information_ranks()
{
	SortEl *sortbuffer = new SortEl[num_tga];
	if (!sortbuffer) {
		fprintf(stderr,"Error : out of memory in RecordWithWeightMatrix::calculate_information_ranks\n");
		exit(1);
	}
	int cluster;
	for (cluster = 0; cluster < num_tga; cluster++) {
		sortbuffer[cluster].id = cluster;
		sortbuffer[cluster].sig = final_information_score[cluster];
	}
	qsort(sortbuffer,num_tga,sizeof(SortEl),&compare_sig);
	for (cluster = 0; cluster < num_tga; cluster++) {
		final_information_rank[sortbuffer[cluster].id] = cluster;
	}
	delete[] sortbuffer;
}

/* Write the cluster / information score / rank table to the file
   "<filename_prefix>information_ranks".
   Clusters whose score equals the -DBL_MAX sentinel are left out.
   Exits with status 1 if the file name buffer pointer is null or the file
   cannot be opened for writing. */
void RecordWithWeightMatrix::output_information_ranks()
{
	static const char suffix[] = "information_ranks";
	/* sizeof(suffix) includes the terminating NUL */
	char *filename = new char[strlen(filename_prefix) + sizeof(suffix)];
	if (!filename) {
		fprintf(stderr,"Error : out of memory in RecordWithWeightMatrix::output_information_ranks\n");
		exit(1);
	}
	strcpy(filename,filename_prefix);
	strcat(filename,suffix);
	FILE *out = fopen(filename,"w");
	if (!out) {
		fprintf(stderr, "Error: could not open %s file for writing\n",filename);
		fflush(0);
		exit(1);
	}
	fprintf(out,"cluster         score    rank\n");
	for (int cluster = 0; cluster < num_tga; cluster++) {
		if (final_information_score[cluster] == -DBL_MAX) continue;	/* no score was computed */
		fprintf(out, "%7d    %10g  %6d\n",
				cluster,
				final_information_score[cluster],
				final_information_rank[cluster]);
	}
	fclose(out);
	delete[] filename;
}

/**********************************************************************
RecordReference implementation
**********************************************************************/

/* Number of entries in each per-cluster purity histogram (every row of
   cluster_sig_purity_hist is allocated with this many doubles). */
const int RecordReference::PURITY_HIST_BINS = 50;

/* Construct a record that measures cluster significance against a reference
   state.  `state` is only queried for its number of TGAs; the remaining
   arguments are forwarded to RecordWithWeightMatrix or stored as
   significance_sample_period and sig_min_pair_presence.  No reference state
   and no histogram storage exist until finalize_reference() creates them. */
RecordReference::RecordReference(FILE *a_log_file, long int a_log_dump_period,
		double a_q, State *state, double a_cluster_member_sig_cutoff,
		const char *a_filename_prefix, double a_significance_sample_period,
		double a_sig_min_pair_presence) :
			RecordWithWeightMatrix(a_log_file, a_log_dump_period,
			a_q, state->get_num_tga(), a_cluster_member_sig_cutoff,
			a_filename_prefix),
			significance_sample_period(a_significance_sample_period),
			sig_min_pair_presence(a_sig_min_pair_presence)
{
	/* no reference and no observed state yet */
	ref_state = 0;
	num_ref_clusters = 0;
	last_sample_state = 0;
	/* sampling bookkeeping */
	num_samples = 0.0;
	next_sample_time = -1.0;
	/* histogram storage is not allocated here */
	cluster_sig_tga = 0;
	cluster_sig_purity_hist = 0;
	cluster_sig_presence_hist = 0;
}

/* Free the histogram rows and tables, the per-TGA significance table and
   the cloned reference state.  Each of them may still be null when
   finalize_reference() was never run. */
RecordReference::~RecordReference()
{
	if (cluster_sig_presence_hist != 0) {
		for (int row = 0; row < num_tga; ++row) {
			delete[] cluster_sig_presence_hist[row];
		}
		delete[] cluster_sig_presence_hist;
	}
	if (cluster_sig_purity_hist != 0) {
		for (int row = 0; row < num_tga; ++row) {
			delete[] cluster_sig_purity_hist[row];
		}
		delete[] cluster_sig_purity_hist;
	}
	/* deleting a null pointer is a no-op, so no guard is needed */
	delete[] cluster_sig_tga;
	delete ref_state;
}

/* Per-timestep hook.  Runs the log dump check and remembers `state` as the
   most recent state.  Once a reference state exists and `timestep` is not
   earlier than next_sample_time, the significance metrics of `state` are
   accumulated against the reference, the sample count is increased by one
   and the next sample is scheduled significance_sample_period later. */
void RecordReference::update(State *state, double timestep)
{
	log_dump_check(state,timestep);
	last_sample_state = state;

	const bool sample_due = ref_state && !(timestep < next_sample_time);
	if (sample_due) {
		state->accumulate_significance_metrics(ref_state,
				cluster_sig_presence_hist,
				cluster_sig_purity_hist,
				cluster_sig_tga);
		num_samples += 1.0;
		next_sample_time = timestep + significance_sample_period;
	}
}

/* First call: clone the most recently seen state as the reference state,
   allocate and zero the significance histograms, and schedule the first
   sample at significance_sample_period.
   Later calls (a reference already exists): stop sampling by moving
   next_sample_time to DBL_MAX.
   Exits with status 1 when no state has been seen yet, when the clone is
   null, or when an allocation yields a null pointer. */
void RecordReference::finalize_reference()
{
	if (ref_state) {
		/* end of significance run */
		next_sample_time = DBL_MAX;
		return;
	}
	/* update() stores the state to clone; without it there is nothing to copy */
	if (!last_sample_state) {
		fprintf(stderr,"Error : no state recorded before RecordReference::finalize_reference\n");
		exit(1);
	}
	next_sample_time = significance_sample_period;
	num_ref_clusters = last_sample_state->get_num_cluster();
	/* create reference state */
	ref_state = last_sample_state->create_clone();
	if (!ref_state) {
		fprintf(stderr,"Error : could not clone state in RecordReference::finalize_reference\n");
		exit(1);
	}
	cluster_sig_presence_hist = new double *[num_tga];
	cluster_sig_purity_hist = new double *[num_tga];
	cluster_sig_tga = new double[num_tga * num_tga];

	if (!cluster_sig_presence_hist || !cluster_sig_purity_hist || !cluster_sig_tga) {
		fprintf(stderr,"Error : out of memory in RecordReference::finalize_reference\n");
		exit(1);
	}
	for (int cluster_id = 0; cluster_id < num_tga; cluster_id++) {
		int cluster_size = ref_state->get_cluster_size(cluster_id);
		/* presence histogram has one bin per count 0..cluster_size */
		double *presence = new double[cluster_size + 1];
		double *purity = new double[PURITY_HIST_BINS];
		cluster_sig_presence_hist[cluster_id] = presence;
		cluster_sig_purity_hist[cluster_id] = purity;
		if (!presence || !purity) {
			fprintf(stderr,"Error : out of memory in RecordReference::finalize_reference\n");
			exit(1);
		}
		for (int bin = 0; bin <= cluster_size; bin++) presence[bin] = 0.0;
		for (int bin = 0; bin < PURITY_HIST_BINS; bin++) purity[bin] = 0.0;
		for (int tga_id = 0; tga_id < num_tga; tga_id++) {
			cluster_sig_tga[cluster_id * num_tga + tga_id] = 0.0;
		}
	}
}

/* Produce the end-of-run output of the reference analysis.
   `state` is written (with `verbose` passed through) to the file
   "<filename_prefix>reference_state".  Afterwards the TGA significance
   tables, the cluster weight matrices, the information scores and the
   information ranks are computed, and the per-cluster tables are written.
   The information-rank table and the cluster significance tables are only
   written when `verbose` is true.  Exits with status 1 if the file name
   buffer pointer is null or the file cannot be opened. */
void RecordReference::write_output(State *state, bool verbose)
{
	static const char leaf[] = "reference_state";
	/* sizeof(leaf) includes the terminating NUL */
	char *path = new char[strlen(filename_prefix) + sizeof(leaf)];
	if (path == 0) {
		fprintf(stderr,"Error : out of memory in RecordReference::write_output\n");
		exit(1);
	}
	strcpy(path, filename_prefix);
	strcat(path, leaf);
	FILE *state_file = fopen(path, "w");
	if (state_file == 0) {
		fprintf(stderr, "Error: could not open %s file for writing\n",path);
		fflush(0);
		exit(1);
	}
	state->write(state_file, verbose);
	fclose(state_file);
	delete[] path;

	/* all calculations run before any per-cluster file is written */
	calculate_tga_significance_statistics();
	calculate_cluster_weight_matricies();
	calculate_information_scores();
	calculate_information_ranks();

	if (verbose) output_information_ranks();
	if (verbose) output_cluster_significance_statistics();
	output_tga_significance_statistics();
	output_cluster_weight_matricies(verbose);
}

/* Write one "<filename_prefix>cluster<rank>_presencepurity" file for every
   reference cluster that passes the significance filter; <rank> is the
   cluster's final_information_rank.
   A cluster is skipped when it has fewer than two members, or when the sum
   of its first two presence-histogram bins divided by num_samples, plus
   sig_min_pair_presence, exceeds 1.
   Each file lists the presence histogram normalised by num_samples and the
   cumulative purity (starting at 1 and decreasing by each purity bin).
   Exits with status 1 if the file name buffer pointer is null or a file
   cannot be opened. */
void RecordReference::output_cluster_significance_statistics()
{
	fprintf(log_file,"Generating Cluster Significance Tables (Total samples taken: %20f)\n\n",num_samples);
	/* prefix + "cluster" (7) + up to 10 rank digits + "_presencepurity" (15) + NUL */
	char *filename = new char[strlen(filename_prefix)+33];
	if (!filename) {
		fprintf(stderr,"Error : out of memory in RecordReference::output_cluster_significance_statistics\n");
		exit(1);
	}
	for (int cluster_id = 0; cluster_id < num_tga; cluster_id++) {
		int cluster_size = ref_state->get_cluster_size(cluster_id);
		/* this also covers empty clusters */
		if (cluster_size < 2) continue;
		const double *presence = cluster_sig_presence_hist[cluster_id];
		const double *purity = cluster_sig_purity_hist[cluster_id];
		if ((presence[0] + presence[1])/num_samples +
				sig_min_pair_presence > 1.0) continue;
		strcpy(filename,filename_prefix);
		strcat(filename,"cluster");
		/* append the rank in place instead of going through a small temporary */
		sprintf(filename + strlen(filename),"%d",final_information_rank[cluster_id]);
		strcat(filename,"_presencepurity");
		FILE *out = fopen(filename,"w");
		if (!out) {
			fprintf(stderr, "Error: could not open %s file for writing\n",filename);
			fflush(0);
			exit(1);
		}
		fprintf(out,"Reference Cluster %d (size: %d):\n",cluster_id,cluster_size);
		fprintf(out,"Cluster presence in nearest matching current cluster:\n");
		for (int count = 0; count <= cluster_size; count++) {
			fprintf(out,"%d : %.6f\n",count,presence[count]/num_samples);
		}
		double cumulative_purity = num_samples;
		fprintf(out,"Cumulative cluster purity of nearest matching current cluster:\n");
		for (int bin = 0; bin < PURITY_HIST_BINS; bin++) {
			/* label is the lower edge of the bin in percent (2% steps for 50 bins) */
			fprintf(out,"%d%% : %.6f\n",bin * 100 / PURITY_HIST_BINS,
					cumulative_purity/num_samples);
			cumulative_purity -= purity[bin];
		}
		fprintf(out,"\n");
		fclose(out);
	}
	delete[] filename;
}

void RecordReference::calculate_tga_significance_statistics()
{
	fprintf(log_file,"Generating Cluster TGA Probability Tables (Total samples taken: %20f)\n\n",
			num_samples);
	SortEl *sortbuffer = new SortEl[num_tga];
	if (!sortbuffer) {
		fprintf(stderr,"Error : out of memory in RecordReference::calculate_tga_significance_statistics\n");
		exit(1);
	}
	int cluster_id;
	for (cluster_id = 0; cluster_id < num_tga; cluster_id++) {
		/* filter out clusters which do not have at least
			a minimum presence of 2 over a percentage threshold */
		int cluster_size = ref_state->get_cluster_size(cluster_id);
		if (cluster_size < 2) continue;
		if ((*cluster_sig_presence_hist[cluster_id] +
				*(cluster_sig_presence_hist[cluster_id] + 1))/num_samples +
				sig_min_pair_presence > 1.0) continue;
		/* sort tga significances */
		int tga_id;
		for (tga_id = 0; tga_id < num_tga; tga_id++) {
			sortbuffer[tga_id].id = tga_id;
			sortbuffer[tga_id].sig = cluster_sig_tga[cluster_id * num_tga + tga_id];
		}
		qsort(sortbuffer,num_tga,sizeof(SortEl),&compare_sig);
		/* store significances into sorted array */
		for (tga_id = 0; tga_id < num_tga; tga_id++) {
			sorted_tga_id[cluster_id*num_tga+tga_id] = sortbuffer[tga_id].id;
			sorted_tga_sig[cluster_id*num_tga+tga_id] =
					(double)sortbuffer[tga_id].sig / num_samples;
		}
	}
	delete[] sortbuffer;
}

/* Write one "<filename_prefix>cluster<rank>_membership" file for every
   reference cluster that passes the significance filter; <rank> is the
   cluster's final_information_rank.  Each file lists, in sorted order, the
   TGAs of that cluster's sorted table and stops at the first entry whose
   probability is below cluster_member_sig_cutoff.
   A cluster is skipped when it has fewer than two members, or when the sum
   of its first two presence-histogram bins divided by num_samples, plus
   sig_min_pair_presence, exceeds 1.
   Exits with status 1 if the file name buffer pointer is null or a file
   cannot be opened. */
void RecordReference::output_tga_significance_statistics()
{
	fprintf(log_file,"Writing Cluster TGA Probability Tables (Total samples taken: %20f)\n\n",
			num_samples);
	/* prefix + "cluster" (7) + up to 10 rank digits + "_membership" (11) + NUL */
	char *path = new char[strlen(filename_prefix)+29];
	if (path == 0) {
		fprintf(stderr,"Error : out of memory in RecordReference::output_tga_significance_statistics\n");
		exit(1);
	}
	for (int c = 0; c < num_tga; ++c) {
		int cluster_size = ref_state->get_cluster_size(c);
		if (cluster_size < 2) continue;
		const double *presence = cluster_sig_presence_hist[c];
		if ((presence[0] + presence[1])/num_samples +
				sig_min_pair_presence > 1.0) continue;

		strcpy(path, filename_prefix);
		strcat(path, "cluster");
		sprintf(path + strlen(path), "%d", final_information_rank[c]);
		strcat(path, "_membership");
		FILE *table = fopen(path, "w");
		if (table == 0) {
			fprintf(stderr, "Error: could not open %s file for writing\n",path);
			fflush(0);
			exit(1);
		}
		fprintf(table,"Reference Cluster %d:\n",c);
		fprintf(table,"tga#  : prob      name\n");
		const int row = c * num_tga;
		for (int pos = 0; pos < num_tga; ++pos) {
			const double prob = sorted_tga_sig[row + pos];
			if (prob < cluster_member_sig_cutoff) break;	/* remaining entries are not larger */
			const int member = sorted_tga_id[row + pos];
			fprintf(table,"%4d : %.6f  (%s)\n",
				member, prob, ref_state->get_tga_name(member));
		}
		fclose(table);
	}
	delete[] path;
}

/* Ask the reference state to compute the weight matrix of every reference
   cluster that passes the significance filter.  Each call receives that
   cluster's slices of weight_matrix, tga_aw_left, tga_aw_sense,
   final_entropy, sorted_tga_id and sorted_tga_sig, together with q.
   A cluster is skipped when it has fewer than two members, or when the sum
   of its first two presence-histogram bins divided by num_samples, plus
   sig_min_pair_presence, exceeds 1. */
void RecordReference::calculate_cluster_weight_matricies()
{
	fprintf(log_file,"Generating Cluster Weight Matricies\n\n");
	for (int c = 0; c < num_tga; ++c) {
		int cluster_size = ref_state->get_cluster_size(c);
		if (cluster_size < 2) continue;
		const double *presence = cluster_sig_presence_hist[c];
		if ((presence[0] + presence[1])/num_samples +
				sig_min_pair_presence > 1.0) continue;

		const int row = c * num_tga;	/* offset into the per (cluster, tga) tables */
		double *matrix = weight_matrix + c*TGA::get_aw_len()*TGA_Raw::NUM_BASES;
		ref_state->calculate_cluster_weight_matrix(matrix,
				tga_aw_left + row, tga_aw_sense + row, final_entropy + c,
				sorted_tga_id + row, sorted_tga_sig + row, q);
	}
}

/* Write one "<filename_prefix>cluster<rank>_weight_matrix" file for every
   reference cluster that passes the significance filter; <rank> is the
   cluster's final_information_rank.  The reference state writes the matrix
   itself; `verbose` is passed through to it.
   A cluster is skipped when it has fewer than two members, or when the sum
   of its first two presence-histogram bins divided by num_samples, plus
   sig_min_pair_presence, exceeds 1.
   Exits with status 1 if the file name buffer pointer is null or a file
   cannot be opened. */
void RecordReference::output_cluster_weight_matricies(bool verbose)
{
	fprintf(log_file,"Writing Cluster Weight Matricies\n\n");
	/* prefix + "cluster" (7) + up to 10 rank digits + "_weight_matrix" (14) + NUL */
	char *path = new char[strlen(filename_prefix)+32];
	if (path == 0) {
		fprintf(stderr,"Error : out of memory in RecordReference::output_cluster_weight_matricies\n");
		exit(1);
	}
	for (int c = 0; c < num_tga; ++c) {
		int cluster_size = ref_state->get_cluster_size(c);
		if (cluster_size < 2) continue;
		const double *presence = cluster_sig_presence_hist[c];
		if ((presence[0] + presence[1])/num_samples +
				sig_min_pair_presence > 1.0) continue;

		strcpy(path, filename_prefix);
		strcat(path, "cluster");
		sprintf(path + strlen(path), "%d", final_information_rank[c]);
		strcat(path, "_weight_matrix");
		FILE *matrix_file = fopen(path, "w");
		if (matrix_file == 0) {
			fprintf(stderr, "Error: could not open %s file for writing\n",path);
			fflush(0);
			exit(1);
		}
		fprintf(matrix_file,"Reference Cluster %d:\n",c);
		const int row = c * num_tga;	/* offset into the per (cluster, tga) tables */
		double *matrix = weight_matrix + c*TGA::get_aw_len()*TGA_Raw::NUM_BASES;
		ref_state->write_cluster_weight_matrix(matrix,
				tga_aw_left + row, tga_aw_sense + row,
				final_entropy + c, matrix_file, verbose,
				sorted_tga_id + row, sorted_tga_sig + row, q);
		fclose(matrix_file);
	}
	delete[] path;
}

/**********************************************************************
RecordPairwise implementation
**********************************************************************/

/* Construct a record that tracks how long every pair of TGAs shares a
   cluster.
   One TGAClusterNode is created per TGA and linked at the head of the list
   for the cluster that `state` currently reports for it.  pair_duration
   (num_tga x num_tga, zero-initialised) accumulates the shared time; the
   pseudocluster arrays used by the end-of-run post-processing are allocated
   here as well.
   Exits with status 1 if an allocation yields a null pointer or if `state`
   reports a cluster id outside 0..num_tga-1. */
RecordPairwise::RecordPairwise(FILE *a_log_file, long int a_log_dump_period,
		double a_q, State *state, double a_cluster_member_sig_cutoff,
		const char *a_filename_prefix, double a_pseudocluster_cutoff) :
		RecordWithWeightMatrix(a_log_file, a_log_dump_period, a_q, state->get_num_tga(),
			a_cluster_member_sig_cutoff, a_filename_prefix),
		pseudocluster_cutoff(a_pseudocluster_cutoff)
{
	last_timestep = -1;
	tga_list = new TGAClusterNode*[num_tga];
	cluster_list = new TGAClusterNode*[num_tga];
	if (!cluster_list || !tga_list) {
		fprintf(stderr,"Error allocating memory during RecordPairwise::RecordPairwise\n");
		exit(1);
	}
	int t;
	for (t=0; t<num_tga; t++) {
		tga_list[t] = new TGAClusterNode(t);
		if (!tga_list[t]) {
			fprintf(stderr,"Error allocating memory during RecordPairwise::RecordPairwise\n");
			exit(1);
		}
		cluster_list[t] = 0;
	}
	/* add tga to its cluster */
	for (t=0; t<num_tga; t++) {
		TGAClusterNode *tga = tga_list[t];
		int start_cluster_id = state->get_tga_cluster(t);
		/* cluster_list has exactly num_tga slots; refuse to index outside it */
		if (start_cluster_id < 0 || start_cluster_id >= num_tga) {
			fprintf(stderr,"Error: tga %d has out-of-range cluster id %d in RecordPairwise::RecordPairwise\n",
					t, start_cluster_id);
			exit(1);
		}
		/* push the node onto the head of its cluster's list */
		TGAClusterNode *start_cluster = cluster_list[start_cluster_id];
		tga->next = start_cluster;
		tga->previous = 0;
		if (start_cluster) start_cluster->previous = tga;
		cluster_list[start_cluster_id] = tga;
	}
	/* TODO : the pair_duration counters can be done with 1/2 the memory since
		the first of the pair is always the min of the two */
	pair_duration = new double[num_tga * num_tga];
	if (!pair_duration) {
		fprintf(stderr,"Error allocating memory during RecordPairwise::RecordPairwise\n");
		exit(1);
	}
	for (t = 0; t < num_tga * num_tga; t++) pair_duration[t] = 0;
	/* prepare variables for end-of-run postprocessing */
	tga_pseudocluster = new int[num_tga];
	pseudocluster_size = new int[num_tga];
	anchor_prob = new double[num_tga*num_tga];	/* chance tga is anchor of pseudocluster */
	assoc_prob = new double[num_tga*num_tga];	/* chance tga is member via association */
	if (!tga_pseudocluster || !pseudocluster_size || !anchor_prob || !assoc_prob) {
		fprintf(stderr,"Error: out of memory during RecordPairwise::RecordPairwise\n");
		exit(1);
	}
}

/* Destroy every TGA node, then free the node tables, the pair duration
   matrix and the pseudocluster post-processing arrays. */
RecordPairwise::~RecordPairwise()
{
	/* the nodes are owned through tga_list; cluster_list only links them */
	for (int t = num_tga - 1; t >= 0; --t) {
		delete tga_list[t];
	}
	delete[] tga_list;
	delete[] cluster_list;
	delete[] pair_duration;
	delete[] tga_pseudocluster;
	delete[] pseudocluster_size;
	delete[] anchor_prob;
	delete[] assoc_prob;
}

/* Per-timestep hook.  Runs the log dump check and records `timestep` as the
   latest timestep.  When `state` reports that a move was made, the moved
   TGA is unlinked from its donor cluster list, the time it shared with each
   TGA remaining in the donor cluster is added to pair_duration, and the TGA
   is pushed onto the head of the acceptor cluster list with its join_time
   set to `timestep`.
   The pair_duration slot of a pair is (smaller id) * num_tga + (larger id). */
void RecordPairwise::update(State *state, double timestep)
{
	log_dump_check(state, timestep);
	last_timestep = timestep;
	if (!state->move_made()) return;
#if 0
	fprintf(log_file,"%ld: tga %d from %d to %d\n",
			timestep,state->moved_tga(),
			state->moved_tga_donor(),
			state->moved_tga_acceptor());
#endif
	const int mover_id = state->moved_tga();
	TGAClusterNode *mover = tga_list[mover_id];

	/* unlink the mover from the donor list */
	TGAClusterNode *before = mover->previous;
	TGAClusterNode *after = mover->next;
	if (after) after->previous = before;
	if (before) before->next = after;
	else cluster_list[state->moved_tga_donor()] = after;	/* mover was the head */

	/* credit the time shared with every TGA left behind in the donor cluster */
	for (TGAClusterNode *peer = cluster_list[state->moved_tga_donor()];
			peer != 0; peer = peer->next) {
		const double mover_since = mover->join_time;
		const int peer_id = peer->tga_id;
		const double peer_since = tga_list[peer_id]->join_time;

		int low_id = mover_id;
		int high_id = peer_id;
		if (mover_id > peer_id) {
			low_id = peer_id;
			high_id = mover_id;
		}
		/* the pair has been together since the later of the two joins */
		double together_since = peer_since;
		if (mover_since > peer_since) together_since = mover_since;

		pair_duration[low_id * num_tga + high_id] += timestep - together_since;
	}

	/* push the mover onto the head of the acceptor list */
	const int acceptor_id = state->moved_tga_acceptor();
	TGAClusterNode *old_head = cluster_list[acceptor_id];
	mover->next = old_head;
	mover->previous = 0;
	if (old_head != 0) old_head->previous = mover;
	cluster_list[acceptor_id] = mover;
	mover->join_time = timestep;
}

void RecordPairwise::finalize_reference()
{
	int cluster_id;
	for (cluster_id = 0; cluster_id < num_tga; cluster_id++) {
		TGAClusterNode *clusterscan_left;
		for (clusterscan_left = cluster_list[cluster_id]; clusterscan_left;
				clusterscan_left = clusterscan_left->next) {
			TGAClusterNode *clusterscan_right;
			for (clusterscan_right = clusterscan_left->next; clusterscan_right;
					clusterscan_right = clusterscan_right->next) {
				int left_id = clusterscan_left->tga_id;
				int right_id = clusterscan_right->tga_id;
				double left_time = clusterscan_left->join_time;
				double right_time = clusterscan_right->join_time;
				int pair_index = (left_id > right_id ?
							right_id * num_tga + left_id :
							left_id * num_tga + right_id);
				pair_duration[pair_index] += last_timestep -
						(left_time > right_time ? left_time : right_time);
			}
		}
	}
}

/* Produce the end-of-run output of the pairwise analysis.
   Writes the raw pairwise statistics and the cutoff table, then the file
   "<filename_prefix>pseudoclusters": with `verbose` it holds the
   pseudoclusters for the whole series of cutoffs, otherwise only those for
   pseudocluster_cutoff.  Afterwards the pseudocluster assignment for
   pseudocluster_cutoff is recomputed and used for the membership
   statistics, weight matrices, information scores and ranks, which are then
   written out (the rank table only with `verbose`).
   Exits with status 1 if the file name buffer pointer is null or the file
   cannot be opened. */
void RecordPairwise::write_output(State *state, bool verbose)
{
	output_pairwise_statistics();
	output_pairwise_cutoff_table();

	static const char leaf[] = "pseudoclusters";
	/* sizeof(leaf) includes the terminating NUL */
	char *path = new char[strlen(filename_prefix) + sizeof(leaf)];
	if (path == 0) {
		fprintf(stderr,"Error : out of memory in RecordPairwise::write_output\n");
		exit(1);
	}
	strcpy(path, filename_prefix);
	strcat(path, leaf);
	FILE *summary = fopen(path, "w");
	if (summary == 0) {
		fprintf(stderr, "Error: could not open %s file for writing\n",path);
		fflush(0);
		exit(1);
	}
	if (verbose) {
		output_pseudoclusters(summary, state);
	} else {
		output_pseudocluster(summary, state, pseudocluster_cutoff);
	}
	fclose(summary);
	delete[] path;

	/* the calls above leave tga_pseudocluster set for whichever cutoff ran
	   last, so fix the assignment for the configured cutoff before using it */
	assign_to_pseudoclusters(pseudocluster_cutoff);
	calculate_pseudocluster_statistics(state);
	calculate_pseudocluster_weight_matricies(state);
	calculate_information_scores();
	calculate_information_ranks();

	if (verbose) output_information_ranks();
	output_pseudocluster_statistics(state, verbose);
	output_pseudocluster_weight_matricies(state, verbose);
}

/* Write the file "<filename_prefix>pairwise_raw_stats".
   For every TGA the other TGAs are listed in order of decreasing shared
   time, each with that time divided by (last_timestep + 1); the list stops
   at the first entry whose shared time is zero.
   Exits with status 1 if a buffer pointer is null or the file cannot be
   opened. */
void RecordPairwise::output_pairwise_statistics()
{
	static const char suffix[] = "pairwise_raw_stats";
	/* sizeof(suffix) includes the terminating NUL */
	char *filename = new char[strlen(filename_prefix) + sizeof(suffix)];
	if (!filename) {
		fprintf(stderr,"Error : out of memory in RecordPairwise::output_pairwise_statistics\n");
		exit(1);
	}
	strcpy(filename,filename_prefix);
	strcat(filename,suffix);
	FILE *out = fopen(filename,"w");
	if (!out) {
		fprintf(stderr, "Error: could not open %s file for writing\n",filename);
		fflush(0);
		exit(1);
	}
	fprintf(out,"Co-occurrence Raw Pairwise counts after %20f timesteps of MCM process\n",
			last_timestep + 1.0);
	SortEl *sortbuffer = new SortEl[num_tga];
	if (!sortbuffer) {
		fprintf(stderr,"Error : out of memory in RecordPairwise::output_pairwise_statistics\n");
		exit(1);
	}
	for (int tga_id = 0; tga_id < num_tga; tga_id++) {
		for (int co_tga_id = 0; co_tga_id < num_tga; co_tga_id++) {
			/* a pair is stored at (smaller id) * num_tga + (larger id) */
			int pair_index = (tga_id > co_tga_id ?
						co_tga_id * num_tga + tga_id :
						tga_id * num_tga + co_tga_id);
			sortbuffer[co_tga_id].id = co_tga_id;
			sortbuffer[co_tga_id].sig = pair_duration[pair_index];
		}
		qsort(sortbuffer,num_tga,sizeof(SortEl),&compare_sig);
		fprintf(out,"Co-occurrence for tga #%d:\n",tga_id);
		for (int bufferscan = 0; bufferscan < num_tga; bufferscan++) {
			/* sorted in descending order: once an entry is zero, stop listing */
			if (sortbuffer[bufferscan].sig == 0) break;
			/* output data using percentages */
			fprintf(out,"%d (%.4f) ",
					sortbuffer[bufferscan].id,
					(double)(sortbuffer[bufferscan].sig) /
					(double)(last_timestep+1));
		}
		fprintf(out,"\n");
	}
	delete[] sortbuffer;
	fclose(out);
	delete[] filename;
}

/* Assign `tga_id`, and every TGA reachable from it through pairs whose
   pair_duration is at least (last_timestep + 1) * cutoff, to pseudocluster
   `cluster` in tga_pseudocluster.
   The name is historical: the traversal uses an explicit worklist rather
   than recursion, so its stack use does not grow with the size of the
   pseudocluster.
   Exits with status 1 if a TGA that already belongs to a different
   pseudocluster would be reassigned (this includes `tga_id` itself being
   assigned on entry), or if the worklist pointer is null. */
void RecordPairwise::recursive_assign(int tga_id, int cluster, double cutoff)
{
	if (tga_pseudocluster[tga_id] != -1) {
		fprintf(stderr,"Error in recursive assign, while producing pairwise cutoff table:\n");
		fprintf(stderr,"Attempt to reassign previously assigned tga\n");
		exit(1);
	}
	/* every TGA is pushed at most once, so num_tga slots always suffice */
	int *pending = new int[num_tga];
	if (!pending) {
		fprintf(stderr,"Error : out of memory in RecordPairwise::recursive_assign\n");
		exit(1);
	}
	int pending_count = 0;
	tga_pseudocluster[tga_id] = cluster;
	pending[pending_count++] = tga_id;
	while (pending_count > 0) {
		const int current = pending[--pending_count];
		for (int co_tga_id = 0; co_tga_id < num_tga; co_tga_id++) {
			if (current == co_tga_id) continue;
			/* a pair is stored at (smaller id) * num_tga + (larger id) */
			int pair_index = (current > co_tga_id ?
						co_tga_id * num_tga + current :
						current * num_tga + co_tga_id);
			if (!(pair_duration[pair_index] * 1.0 >= (last_timestep + 1) * cutoff)) continue;
			if (tga_pseudocluster[co_tga_id] == cluster) continue;	/* already collected */
			if (tga_pseudocluster[co_tga_id] != -1) {
				fprintf(stderr,"Error in recursive assign, while producing pairwise cutoff table:\n");
				fprintf(stderr,"Attempt to reassign previously assigned tga\n");
				exit(1);
			}
			/* mark on push so the same TGA is never queued twice */
			tga_pseudocluster[co_tga_id] = cluster;
			pending[pending_count++] = co_tga_id;
		}
	}
	delete[] pending;
}

/* Recompute tga_pseudocluster and pseudocluster_size for `cutoff`.
   All TGAs start unassigned (-1); walking the TGAs in id order, each one
   that is still unassigned seeds the next pseudocluster number, and
   recursive_assign() fills in the TGAs linked to it.  Finally the members
   of every pseudocluster are counted. */
void RecordPairwise::assign_to_pseudoclusters(double cutoff)
{
	for (int t = 0; t < num_tga; ++t) {
		pseudocluster_size[t] = 0;
		tga_pseudocluster[t] = -1;
	}
	int clusters_made = 0;
	for (int t = 0; t < num_tga; ++t) {
		if (tga_pseudocluster[t] < 0) {
			recursive_assign(t, clusters_made, cutoff);
			++clusters_made;
		}
	}
	/* tally the members of each pseudocluster */
	for (int t = 0; t < num_tga; ++t) {
		++pseudocluster_size[tga_pseudocluster[t]];
	}
}

/* Recompute the pseudoclusters for `cutoff` and append one row to `out`:
   the cutoff, the number of pseudoclusters with more than one member, and
   one minus the fraction of TGAs that sit in a pseudocluster of size one. */
void RecordPairwise::output_cutoff_cluster_count(FILE *out, double cutoff)
{
	assign_to_pseudoclusters(cutoff);
	int singles = 0;
	int multis = 0;
	for (int c = 0; c < num_tga; ++c) {
		const int members = pseudocluster_size[c];
		if (members == 1) ++singles;
		else if (members > 1) ++multis;
	}
	const double included = 1.0 - static_cast<double>(singles) / static_cast<double>(num_tga);
	fprintf(out,"%01.3f\t\t%d\t\t\t%.4f\n", cutoff, multis, included);
}

/* Write the file "<filename_prefix>pseudocluster_cutoff_size_table": a
   header followed by one row per cutoff from 0.000 to 1.000 in steps of
   0.025, each produced by output_cutoff_cluster_count().
   Exits with status 1 if the file name buffer pointer is null or the file
   cannot be opened. */
void RecordPairwise::output_pairwise_cutoff_table()
{
	static const char leaf[] = "pseudocluster_cutoff_size_table";
	/* sizeof(leaf) includes the terminating NUL */
	char *path = new char[strlen(filename_prefix) + sizeof(leaf)];
	if (path == 0) {
		fprintf(stderr,"Error : out of memory in RecordPairwise::output_pairwise_cutoff_table\n");
		exit(1);
	}
	strcpy(path, filename_prefix);
	strcat(path, leaf);
	FILE *table = fopen(path, "w");
	if (table == 0) {
		fprintf(stderr, "Error: could not open %s file for writing\n",path);
		fflush(0);
		exit(1);
	}
	fprintf(table,"Pairwise clustering cutoff table\n\n");
	fprintf(table,"cooccurrence_\tnumber_of_non-empty_\tpercent_tga_inclusion_\n");
	fprintf(table,"cutoff\t\tanchor_pseudoclusters\tin_some_pseudocluster\n");
	/* the cutoff is stepped in integer thousandths */
	int permille = 0;
	while (permille <= 1000) {
		output_cutoff_cluster_count(table, permille/1000.0);
		permille += 25;
	}
	fprintf(table,"\n");
	fclose(table);
	delete[] path;
}

/* Recompute the pseudocluster assignment for `cutoff`, then hand the
   resulting tga_pseudocluster table to `state` so that it rearranges itself
   to that assignment. */
void RecordPairwise::arrange_reference_from_pairwise_data(State *state, double cutoff)
{
	/* step 1: refresh tga_pseudocluster for this cutoff */
	assign_to_pseudoclusters(cutoff);

	/* step 2: apply it to the given state */
	state->rearrange_to_pairwise_assignment(tga_pseudocluster);
}

void RecordPairwise::output_pseudocluster(FILE *out, State *state, double cutoff)
{
	State *pseudo_state = state->create_clone();
	arrange_reference_from_pairwise_data(pseudo_state,cutoff);
	fprintf(out,"ANCHOR PSEUDOCLUSTERS FROM PAIRWISE DATA (cutoff: %.3f)\n\n",cutoff);
	pseudo_state->write_membership_summary(out);
	delete pseudo_state;
}

/* Write the pseudoclusters for every cutoff from 0.000 to 1.000 in steps of
   0.025 to `out`, one output_pseudocluster() section per cutoff. */
void RecordPairwise::output_pseudoclusters(FILE *out, State *state)
{
	/* the cutoff is stepped in integer thousandths */
	int permille = 0;
	while (permille <= 1000) {
		output_pseudocluster(out, state, permille/1000.0);
		permille += 25;
	}
}

/* Compute the membership probabilities of every pseudocluster that has at
   least two members, using the current tga_pseudocluster / pseudocluster_size
   assignment.
   For each such cluster:
     - anchor_prob of a member is its component in the last column of the
       eigenvector matrix returned for the cluster's co-occurrence matrix,
       divided by the sum of that column; non-members keep 0;
     - assoc_prob of any tga is the sum, over the members other than itself,
       of the member's anchor_prob times the pair's co-occurrence fraction;
     - the tgas are sorted by anchor_prob + assoc_prob (largest first) and
       stored in this cluster's rows of sorted_tga_id and sorted_tga_sig.
   The `state` parameter is not used in this function.
   Exits with status 1 on a null allocation, on an inconsistent member count,
   or when calc_eigenstructure() returns non-zero. */
void RecordPairwise::calculate_pseudocluster_statistics(State *state)
{
	fprintf(log_file,"GENERATING MEMBERSHIP PROBABILITIES FOR PSEUDOCLUSTERS (%.3f cutoff)\n\n",pseudocluster_cutoff);
	/* get cluster assignment from pairwise data */
	int *cluster_tga = new int[num_tga];	/* what tgas belong to current cluster */
	SortEl *tga_order = new SortEl[num_tga];	/* for sorting */
	if (!cluster_tga || !tga_order) {
		fprintf(stderr,"Error: out of memory during RecordPairwise::calculate_pseudocluster_statistics1\n");
		exit(1);
	}
	int cluster_id;
	for (cluster_id = 0; cluster_id < num_tga; cluster_id++) {
		if (pseudocluster_size[cluster_id] < 2) continue; /* skip small clusters */
		int cluster_pos = 0;
		int tga_id;
		/* reset this cluster's probability rows and collect its members;
		   members are appended in increasing tga id order */
		for (tga_id = 0; tga_id < num_tga; tga_id++) {
			anchor_prob[cluster_id*num_tga + tga_id] = 0.0;
			assoc_prob[cluster_id*num_tga + tga_id] = 0.0;
			if (tga_pseudocluster[tga_id] == cluster_id) {
				cluster_tga[cluster_pos++] = tga_id;
			}
		}
		/* the number of members found must match the recorded size */
		if (cluster_pos != pseudocluster_size[cluster_id]) {
			fprintf(stderr,"Error: inconsistancy counting cluster members in\n");
			fprintf(stderr,"RecordPairwise::calculate_pseudocluster_statistics\n");
			exit(1);
		}
		const int LEN = cluster_pos;
		/* create coocurrence matrix for eigenvector */
		double *pseudo_base = new double[LEN*LEN];
		double *pseudo_eigen = new double[LEN*LEN];
		double *pseudo_d = new double[LEN];
		double *pseudo_e = new double[LEN];
		if (!pseudo_base || !pseudo_eigen || !pseudo_d || !pseudo_e) {
			fprintf(stderr,"Error: out of memory in\n"); 
			fprintf(stderr,"RecordPairwise::calculate_pseudocluster_statistics\n");
			exit(1);
		}
		int i,j;
		/* symmetric LEN x LEN matrix with a zero diagonal; the off-diagonal
		   entries are pair_duration / (last_timestep + 1) */
		for (i = 0; i<LEN; i++) {
			pseudo_base[i*LEN+i] = 0.0;
			for (j = 0; j<i; j++) {
				/* j < i and the members are in increasing id order, so
				   cluster_tga[j] is the smaller id of the pair */
				double cooccur = (double)pair_duration[cluster_tga[j] *
						num_tga + cluster_tga[i]] /
						(double)(last_timestep + 1);
				pseudo_base[i*LEN+j] = cooccur;
				pseudo_base[j*LEN+i] = cooccur;
			}
		}
		/* find eigenvector */
		/* NOTE(review): tri_diagonalize and calc_eigenstructure are defined
		   outside this file (presumably declared in sym_eigen.h); the meaning
		   of their arguments and of the two tolerances should be confirmed
		   there */
		tri_diagonalize(pseudo_base,pseudo_d,pseudo_e,pseudo_eigen,LEN,1.0e-6);
		if (calc_eigenstructure(pseudo_d,pseudo_e,pseudo_eigen,LEN,1.0e-16) != 0) {
			fprintf(stderr,"Error: eigenvector calculation did not converge in\n");
			fprintf(stderr,"RecordPairwise::calculate_pseudocluster_statistics\n");
			exit(1);
		}
		/* copy elements of eigenvector into anchor probabilities */
		/* column LEN-1 of pseudo_eigen is used, normalised to sum to 1;
		   presumably this is the eigenvector of the largest eigenvalue --
		   TODO confirm the column ordering produced by the solver */
		double component_total = 0.0;
		for (i = 0; i<LEN; i++) component_total += pseudo_eigen[i*LEN+LEN-1];
		for (i = 0; i<LEN; i++) {
			anchor_prob[cluster_id*num_tga + cluster_tga[i]] = pseudo_eigen[i*LEN+LEN-1] / component_total;
		}
		delete[] pseudo_e;
		delete[] pseudo_d;
		delete[] pseudo_eigen;
		delete[] pseudo_base;
		/* calculate member_probabilities */
		for (tga_id = 0; tga_id < num_tga; tga_id++) {
			for (i = 0; i<LEN; i++) {
				int anchor_id = cluster_tga[i];
				if (anchor_id == tga_id) continue; /* skip self cooccurence */
				/* a pair is stored at (smaller id) * num_tga + (larger id) */
				int pair_index = (tga_id > anchor_id ?
							anchor_id * num_tga + tga_id :
							tga_id * num_tga + anchor_id);
				assoc_prob[cluster_id*num_tga + tga_id] +=
						 anchor_prob[cluster_id*num_tga + anchor_id] *
						(double)pair_duration[pair_index] /
						(double)(last_timestep + 1);
			}
			/* the value sorted on is the sum of both probabilities */
			tga_order[tga_id].id = tga_id;
			tga_order[tga_id].sig =
					anchor_prob[cluster_id*num_tga + tga_id] +
					assoc_prob[cluster_id*num_tga + tga_id];
		}
		/* sort member_probabilities */
		qsort(tga_order,num_tga,sizeof(SortEl),&compare_sig);
		/* store member_probabilities into sorted array */
		for (tga_id = 0; tga_id < num_tga; tga_id++) {
			sorted_tga_id[cluster_id*num_tga+tga_id] = tga_order[tga_id].id;
			sorted_tga_sig[cluster_id*num_tga+tga_id] = tga_order[tga_id].sig;
		}
	}
	delete[] tga_order;
	delete[] cluster_tga;
}

/* Write one "<filename_prefix>pseudocluster<rank>_membership" file for
   every pseudocluster with at least two members; <rank> is the cluster's
   final_information_rank.
   The TGAs of that cluster's sorted table are listed until the first entry
   whose membership probability is below cluster_member_sig_cutoff.  With
   `verbose` the anchor and association probabilities are listed as well.
   `state` supplies the TGA names.
   Exits with status 1 if the file name buffer pointer is null or a file
   cannot be opened. */
void RecordPairwise::output_pseudocluster_statistics(State *state, bool verbose)
{
	fprintf(log_file,"WRITING MEMBERSHIP PROBABILITIES FOR PSEUDOCLUSTERS (%.3f cutoff)\n\n",pseudocluster_cutoff);
	/* prefix + "pseudocluster" (13) + up to 10 rank digits + "_membership" (11) + NUL */
	char *filename = new char[strlen(filename_prefix)+35];
	if (!filename) {
		fprintf(stderr,"Error : out of memory in RecordPairwise::output_pseudocluster_statistics\n");
		exit(1);
	}
	for (int cluster_id = 0; cluster_id < num_tga; cluster_id++) {
		if (pseudocluster_size[cluster_id] < 2) continue; /* skip small clusters */
		/* output table */
		strcpy(filename,filename_prefix);
		strcat(filename,"pseudocluster");
		/* append the rank in place instead of going through a small temporary */
		sprintf(filename + strlen(filename),"%d",final_information_rank[cluster_id]);
		strcat(filename,"_membership");
		FILE *out = fopen(filename,"w");
		if (!out) {
			fprintf(stderr, "Error: could not open %s file for writing\n",filename);
			fflush(0);
			exit(1);
		}
		fprintf(out,"Pseudo Cluster #%d:\n",cluster_id);
		if (verbose) fprintf(out,"TGA#\tanchor_prob\tassoc_prob\tmember_prob\tTGA name\n");
		else fprintf(out,"TGA#\tmember_prob\tTGA name\n");
		const int row = cluster_id*num_tga;	/* start of this cluster's rows */
		for (int i = 0; i < num_tga; i++) {
			int tga_id = sorted_tga_id[row + i];
			double tga_sig = sorted_tga_sig[row + i];
			/* sorted in descending order: everything after this is below the cutoff too */
			if (tga_sig < cluster_member_sig_cutoff) break;
			if (verbose) fprintf(out,"%d\t%.4f\t\t%.4f\t\t%.4f\t\t%s\n", tga_id,
				anchor_prob[row + tga_id],
				assoc_prob[row + tga_id],
				tga_sig, state->get_tga_name(tga_id));
			else fprintf(out,"%d\t%.4f\t\t%s\n", tga_id,
				tga_sig, state->get_tga_name(tga_id));
		}
		fclose(out);
	}
	delete[] filename;
}

/* Ask `state` to compute the weight matrix of every pseudocluster that has
   at least two members.  Each call receives that cluster's slices of
   weight_matrix, tga_aw_left, tga_aw_sense, final_entropy, sorted_tga_id and
   sorted_tga_sig, together with q. */
void RecordPairwise::calculate_pseudocluster_weight_matricies(State *state)
{
	fprintf(log_file,"Generating Pseudocluster Weight Matricies (%.3f cutoff)\n\n",pseudocluster_cutoff);
	for (int c = 0; c < num_tga; ++c) {
		/* pseudoclusters with fewer than two members get no matrix */
		if (pseudocluster_size[c] < 2) continue;

		const int row = c * num_tga;	/* offset into the per (cluster, tga) tables */
		double *matrix = weight_matrix + c*TGA::get_aw_len()*TGA_Raw::NUM_BASES;
		state->calculate_cluster_weight_matrix(matrix,
				tga_aw_left + row, tga_aw_sense + row,
				final_entropy + c,
				sorted_tga_id + row, sorted_tga_sig + row, q);
	}
}

/* Write one "<filename_prefix>pseudocluster<rank>_weight_matrix" file for
   every pseudocluster with at least two members; <rank> is the cluster's
   final_information_rank.  `state` writes the matrix itself and `verbose`
   is passed through to it.
   Every per-cluster slice handed to `state` uses the same offsets as
   calculate_pseudocluster_weight_matricies(): aw_len * NUM_BASES doubles
   per cluster for weight_matrix, num_tga entries per cluster for the
   tga_aw_left, tga_aw_sense, sorted_tga_id and sorted_tga_sig tables.
   Exits with status 1 if the file name buffer pointer is null or a file
   cannot be opened. */
void RecordPairwise::output_pseudocluster_weight_matricies(State *state, bool verbose)
{
	fprintf(log_file,"Writing Pseudocluster Weight Matricies (%.3f cutoff)\n\n",pseudocluster_cutoff);
	/* prefix + "pseudocluster" (13) + up to 10 rank digits + "_weight_matrix" (14) + NUL */
	char *filename = new char[strlen(filename_prefix)+38];
	if (!filename) {
		fprintf(stderr,"Error : out of memory in RecordPairwise::output_pseudocluster_weight_matricies\n");
		exit(1);
	}
	for (int cluster_id = 0; cluster_id < num_tga; cluster_id++) {
		/* pseudoclusters with fewer than two members have no matrix */
		if (pseudocluster_size[cluster_id] < 2) continue;
		strcpy(filename,filename_prefix);
		strcat(filename,"pseudocluster");
		/* append the rank in place instead of going through a small temporary */
		sprintf(filename + strlen(filename),"%d",final_information_rank[cluster_id]);
		strcat(filename,"_weight_matrix");
		FILE *out = fopen(filename,"w");
		if (!out) {
			fprintf(stderr, "Error: could not open %s file for writing\n",filename);
			fflush(0);
			exit(1);
		}
		fprintf(out,"Pseudo Cluster #%d:\n",cluster_id);
		/* tga_aw_sense holds num_tga entries per cluster, like tga_aw_left;
		   only weight_matrix is strided by aw_len * NUM_BASES */
		const int row = cluster_id*num_tga;
		state->write_cluster_weight_matrix(weight_matrix+cluster_id*TGA::get_aw_len()*TGA_Raw::NUM_BASES,
				tga_aw_left+row, tga_aw_sense+row,
				final_entropy+cluster_id,out,verbose,
				sorted_tga_id+row,sorted_tga_sig+row,q);
		fclose(out);
	}
	delete[] filename;
}
