# procse parameter file

#####################################################################
# general parameters
# This file contains all the parameters that the PROCSE algorithm takes.
# Some parameters have no defaults and have to be explicitly set by
# the user. Others have defaults that the program uses if the user
# doesn't specify the parameter.
# All lines in this file that start with # are comments.
# All other lines specify a parameter.
# The general format of a line is 'parameter_name parameter_value'.
# For instance, the line 'q 1.0' means that the parameter q is set to 1.0.
# Below all parameters are described in turn. Note that for many
# applications one may use the defaults. 
# To set a parameter to its default one should simply
# comment out the line that contains the setting of the parameter.
# See the minimal parameter file to see what needs to be set at a minimum.

# random seed (integer) : will seed by time if not specified
random_seed 1
# If you comment out the above line (recommended) the program will
# seed the random number generator from the internal clock.


# q (double) : used as log gamma offset from factorial (default: 1.0)
q 1.0
# q is a parameter that controls one of the 'priors'
# that the program uses. Each weight matrix component has a Dirichlet
# prior with exponent (q-1). The parameter q is often referred to as 'pseudocount'.
# This prior controls how 'conserved' the
# algorithm expects the binding site motifs to be. That is
# for q close to zero (for instance q = 0.01) the program expects
# that all binding sites for a TF will have a very similar binding pattern
# in other words, that they almost all perfectly match the consensus sequence.
# For q = 1.0 the program does not make any a priori assumptions about the
# amount of conservation that is to be expected in each position of
# the binding site. The user is advised to use q = 1.0 in general.
# Only when entropy_method is set to 1 (see below) is it advisable to
# set q to something smaller (for instance q = 0.1).


# aw_len (integer) : alignment window length (required)
aw_len 27
# This parameter specifies the maximal width of the binding sites.
# The above length (27) is a reasonable guess for prokaryotic binding
# sites. For Eukaryotic ones something like 12 or 14 may be more
# appropriate. Note that in general it is much worse to underestimate
# the length of the binding sites than to overestimate them.
# Especially when running with the defaults entropy_method=0 and
# q=1.0, the performance of the program should be relatively
# insensitive to overestimations of the binding site length.  When
# running with entropy_method=1 and q < 1.0 it is more deleterious to
# specify a window length that is significantly larger than the actual
# size. Still it would be worse to specify a window length that is too
# short.


# single_strand (boolean) : if only forward strand should be used (default: false)
single_strand false
#This parameter tells the program if it should use both forward and
#reverse-complement of the sequences or if it should only use the
#sequences in the forward sense. By default both strands are used
#(which corresponds to the parameter being set to false).

# initial_cluster_size (integer) : for initial state (default: 1)
initial_cluster_size 1
# The program starts with a random clustering into clusters of this size.
# This parameter should be set to 1 except when running with sizeadj=1
# (see below) or sizeadj=2. 
# Set initial_cluster_size=2 or 3 for those cases.

#####################################################################
# type of run parameters

# record_type (integer) : code to indicate what statistics to keep
#			during the run:
#				0 = none, just output final state
#				1 = Find best partition through annealing
#					followed by sampling run to
#					test significance.
#				2 = Sample and record pair
#					statistics. 
#			(default: 1)
record_type 1
# There are 3 ways of running the program. With record_type=0 the
# program simply runs once and outputs the partition that it found at
# the end of the run.
# For record_type=1 the program first does an annealing run in which it attempts
# to find the best possible partition of the sequences into clusters.
# It then does an additional run (with beta set at transient_beta) in which it tests
# the significance of all the clusters found in the optimal
# partition obtained by the annealing. Significant clusters are reported.
# With record_type=2 it does a single run (with beta set at
# transient_beta) during which it measures the fractions of time f_ij
# that sequences i and j occur in the same cluster.
# Significant clusters are reconstructed from these pair statistics at
# the end and reported.
# The record_type=1 generally gives the best results for data sets
# that are not too large. For large data sets
# (4000 or more sequences) it may be better to use record_type=2. 
# More specifically, if the annealing produces 'optimal' partitions
# that vary wildly from run to run even when one runs the program
# for a long time, then it may be advisable to use record_type=2.

# entropy_method (integer) : code to indicate what entropy formula
#			to apply:
#				0 = Plain (columns scored as WM samples)
#				1 = Random (averaging of the columns
#					scored as WM samples and
#					samples from background).
#			(default: 0)
entropy_method 0
# This option controls if the algorithm considers all positions inside
# the window part of the site or only a subset of them. When
# entropy_method=0, all positions are considered part of the site.
# When entropy_method=1 every column is considered to either be part
# of the site, or part of the 'background'. This may give better results
# when the sites contain one or more 'gaps' of unconserved positions.
# It is highly advisable to set q<1.0 (for instance 0.1 or 0.2)
# when using entropy_method=1.

# sizeadj (integer) : What kind of prior to use over the space of
#				partitions. 
#				0 = Uniform prior over partitions
#				1 = Uniform prior over cluster number
#				2 = Uniform prior over cluster number
#					and their size.
#				When using 1 or 2, it is
#				recommended that you set
#				initial_cluster_size > 1
#			(default: 0)
sizeadj 0
# This parameter controls the prior that the program uses over the space
# of partitions of sequences into clusters.
# With sizeadj=0 all possible partitions get equal a priori
# probability. 
# With sizeadj=1 all possible cluster NUMBERS get equal a priori
# probability. Note that if one has, for example, 100 sequences, then
# the number of ways that these can be partitioned into 25 subsets is much
# larger than the number of ways of partitioning them into 2 subsets.
# With sizeadj=1 each partition into 25 subsets gets weighted by the
# inverse of the number of partitions of 100 sequences into 25 subsets,
# and each partition into 2 subsets gets weighted by the inverse of
# the number of partitions of 100 sequences into 2 subsets.
# With sizeadj=2 the prior is also uniform over the cluster sizes.
# To use the same example, there are many more ways of partitioning
# 100 sequences into 25 subsets of 4 members each, than of partitioning
# them into 25 subsets of which one has 76 members and 24 have 1
# member. With sizeadj=2 this effect is also corrected for. That is,
# each partition of 100 sequences into 25 clusters with 4 members
# each is weighted with the inverse of the number of partitions of 100
# sequences into 25 clusters with 4 members each.
# When the user is generally ignorant of the number of clusters
# represented in the data, it is advisable to use the default
# sizeadj=0. When the user knows that there are very few clusters
# represented sizeadj=1 is advisable. When the user knows that
# some very large clusters may be represented, sizeadj=2 is
# advisable.


#chemical_potential (double) : used during sizeadj processing to
#			determine the log prior post probability
#			(default: 0.0)
chemical_potential 0.0
# This parameter gives an additional way to tune the prior over the number
# of clusters.
# Each partition with n clusters is weighted with exp(-chemical_potential * n)
# in addition to the weighting set by the sizeadj parameter.
# Practically, setting chemical_potential to a positive value reduces the number of
# clusters.

# Note that the user is advised to set both sizeadj and
# chemical_potential to their defaults unless they understand what
# these priors implement. 

# coherent_shift_period (double) : number of timesteps between
#			proposed coherent shift moves (default: 20)
coherent_shift_period 20.0
# Every now and then the algorithm attempts to shift all windows
# in a cluster to the left or right in a coherent fashion (i.e. all
# windows are shifted the same amount).
# The default works fine for most applications.

#####################################################################
# temperature clock parameters

# transient_beta (double) : starting inverse temperature,
#			used throughout run if record_type == 2
#			(default: 1.0)
transient_beta 1.0
# 'Inverse temperature' at which the program runs before annealing
# starts, and also for the significance sampling and when running
# record_type=2.
# It is highly advisable to leave this at 1.0 because the posterior
# distribution is only sampled correctly at beta=1.0.
# When this parameter is raised one can expect the significance of the
# clusters to be overestimated.

# final_beta (double) : end of annealing inverse temperature,
#			not used if record_type == 2
#			(default: 4.0)
final_beta 4.0
# Final beta at the end of annealing. 4.0 should be a reasonable
# value for most data sets. If you suspect that this parameter is set
# too low you can try setting it to 6 and running the program 1.5 times as
# long. If there is no substantial improvement in the final score at
# the end of annealing, then 4.0 is sufficiently high.

# total_time_steps (double) : timesteps to get to reference state
#			(required)
total_time_steps 250000.0
# This parameter HAS to be set by the user. It sets the total running time of the 
# program. Generally running longer can only improve the results so
# this parameter should basically be set to the maximum value that one
# is willing to wait for. One can experiment with different running
# times and compare the score that the program got to at the end of 
# annealing. Notice that if one runs the program on N objects and the
# algorithm finds M clusters at the end of annealing, then the total
# number of time steps should generally be much larger than
# N*M (at least 100 times as large, for instance). Otherwise one can be
# almost certain that the program has had insufficient time to find a global
# optimum.


# transient_time_steps (double) : duration of transient period,
#			not used if record_type == 2
#			(default: 0.0)
transient_time_steps  25000.0
# This is the amount of time the program runs before annealing starts.
# 5-10% of the total running time is generally reasonable.

# deep_quench_time_steps (double) : duration of last period of
#			very high (25.0) beta, not used if
#			record_type == 2
#			(default: 0.0)
deep_quench_time_steps 25000.0
# Just before getting to the end of annealing the program tunes beta up 
# very high to find a locally optimal clustering. This is
# the amount of time the program spends at high beta.
# This number can generally be set fairly small. With N objects
# and M clusters, 3*N*M should be enough.

# significance_run_total_time_steps (double) : timesteps of the
#			best matching significance run; only used
#			when record_type is 1.
#			(default: the value of total_time_steps)
significance_run_total_time_steps 250000.0
# This is the total time that the program runs to test the 
# significance of the clusters found at the end of annealing.
# The default of setting it equal to the total number of 
# time steps of the annealing run is reasonable.

#####################################################################
# record keeping and output parameters

# filename_prefix (string) : a prefix to be added to all output
#			file names (default: )
filename_prefix output_
# All output files will get this prefix.
# To put the output files in a particular directory, prepend
# a path to this prefix.

# log_dump_period (integer) : seconds between state dumps to log file
#			(default: 3600)
log_dump_period  3600
# Every this many seconds the program writes a report of its
# current state into the log file.

# significance_sample_period (double) : timesteps between samples
#			during the best match significance run. Only
#			 used when record_type is 1. (default: 1000.0)
significance_sample_period 1000.0
# When assessing the significance of clusters, the algorithm takes
# a 'snapshot' of the state once every this many time steps. To maximize
# the number of statistically 'independent' snapshots
# this parameter should be set to approximately N when clustering N
# objects.

# sig_min_pair_presence (double) : a fraction, indicating what
#			proportion of the significance samples having
#			more than one of the members of a
#			reference cluster in the best matching
#			cluster, needs to be present in order to
#			display a summary of that reference cluster's
#			members. Only used when record_type is 1.
#			(default: 0.9)
sig_min_pair_presence 0.9
# When using record_type=1 only clusters in which at least one pair
# of sites occurs in the same cluster with probability larger than
# this are considered significant. One can consider lowering or
# raising the default when the user feels too few or too many
# significant clusters are reported.

# cluster_member_sig_cutoff (double) : a number between 0.0 and 1.0
#			which will be the threshold of the normalized
#			significance rating of each tga's membership
#			likeliness in a reference cluster, below
#			which the tga will not appear in the summary
#			of membership of a reference cluster.
#			(default: 0.05)
cluster_member_sig_cutoff 0.05
# When reporting significant clusters only members that belong to the cluster
# with at least this probability are reported.

# pseudocluster_cutoff (double) : the fraction of samples in which
#			a pair of tgas are co-clustered, above which
#			the two tgas are placed in the same
#			pseudocluster. Only used when record_type
#			is 2. (default: 0.5)
pseudocluster_cutoff 0.5
# When using record_type=2 this cutoff determines how pair
# statistics are integrated into clusters. The program reconstructs
# a graph where each sequence is a node and an edge exists between
# nodes i and j if and only if the sequences i and j occur in the same
# cluster more than the cutoff fraction of the time.
# The connected components of this graph then form the clusters (of
# which the significance is then also assessed). Thus, with the
# default of 0.5 a sequence will cluster with all those sequences with
# which it occurs in the same cluster more than 50% of the time.

# verbose (boolean) : will dumps of the state include the individual
#			tga's distributions? Also, will the
#			pseudoclusters for a wide range of cutoff
#			values be displayed instead of only the
#			one based on the specified
#			pseudocluster_cutoff? (default: false)
verbose false
# When true more information is output. This is probably only useful
# for 'expert' users.


#####################################################################
# end of parameter file

