2
#include <arb_assert.h>
4
#define st_assert(bed) arb_assert(bed)
1
// =============================================================== //
6
// Institute of Microbiology (Technical University Munich) //
7
// http://www.arb-home.de/ //
9
// =============================================================== //
15
#include <arbdb_base.h>
17
#ifndef _GLIBCXX_ALGORITHM
21
#include <attributes.h>
30
#define st_assert(cond) arb_assert(cond)
45
typedef unsigned char ST_ML_Color;
47
typedef float ST_FLOAT;
48
// typedef double ST_FLOAT; // careful, using double has quite an impact on the memory footprint
56
ST_MAX_BASE, // [can't be changed w/o recoding several parts]
16
extern class AWT_dna_table {
17
char char_to_enum_table[256];
19
AWT_dna_base char_to_enum(char i) {
20
return (AWT_dna_base)char_to_enum_table[(unsigned char)i];
25
typedef unsigned char ST_ML_Color;
27
const int ST_MAX_SEQ_PART = 256;
28
// should be greater than the editor width
29
// otherwise extrem performance penalties
30
const int ST_BUCKET_SIZE = 16;
31
// at minumum ST_BUCKET_SIZE characters are calculated per call
32
const int LD_BUCKET_SIZE = 4; // log dualis of ST_BUCKET_SIZE
34
class ST_base_vector {
36
float b[ST_MAX_BASE]; // acgt-
38
float lik; // likelihood = 2^ld_lik * lik * (b[0] + b[1] + b[2] ..)
39
void set(char base, ST_base_vector *frequencies);
40
inline void mult(ST_base_vector *other);
60
struct ST_base_vector {
61
ST_FLOAT b[ST_MAX_BASE]; // acgt-
63
ST_FLOAT lik; // likelihood = 2^ld_lik * lik * (b[0] + b[1] + b[2] ..)
65
void setTo(ST_FLOAT freq) {
66
for (int base = ST_A; base<ST_MAX_BASE; ++base) {
70
void multiplyWith(ST_FLOAT factor) {
71
for (int base = ST_A; base<ST_MAX_BASE; ++base) {
75
void increaseBy(ST_FLOAT inc) {
76
for (int base = ST_A; base<ST_MAX_BASE; ++base) {
80
void makeInverseOf(const ST_base_vector& other, ST_FLOAT other_min_freq) {
81
for (int base = ST_A; base<ST_MAX_BASE; ++base) {
82
b[base] = other_min_freq / other.b[base];
86
void setBase(const ST_base_vector& frequencies, char base);
41
88
void check_overflow();
91
ST_FLOAT summarize() const { return b[ST_A] + b[ST_C] + b[ST_G] + b[ST_T] + b[ST_GAP]; }
92
ST_FLOAT min_frequency() { return std::min(std::min(std::min(b[ST_A], b[ST_C]), std::min(b[ST_G], b[ST_T])), b[ST_GAP]); }
93
ST_FLOAT max_frequency() { return std::max(std::max(std::max(b[ST_A], b[ST_C]), std::max(b[ST_G], b[ST_T])), b[ST_GAP]); }
95
inline ST_base_vector& operator *= (const ST_base_vector& other);
45
98
class ST_rate_matrix {
46
float m[ST_MAX_BASE][ST_MAX_BASE];
99
// symmetric matrix containing only two different values
101
ST_FLOAT diag; // value for indices [0][0], [1][1]...
102
ST_FLOAT rest; // other indices
106
ST_rate_matrix() : diag(0.0), rest(0.0) {}
48
107
void set(double dist, double TT_ratio);
49
inline void mult(ST_base_vector *in, ST_base_vector *out);
56
/** Note: Because we have only limited memory we split the
57
sequence into ST_MAX_SEQ_PART long parts */
58
class ST_sequence_ml : private AP_sequence {
62
GBDATA *gb_data; // the sequence
63
static ST_base_vector *tmp_out; // len = alignment length
67
ST_ML *st_ml; // link to a global ST object
68
ST_base_vector *sequence; // A part of the sequence
70
ST_ML_Color *color_out;
71
int *color_out_valid_till; // color_out is valid up to
74
void delete_sequence(); // remove link to database
75
void sequence_change(); // sequence has changed in db
76
AP_FLOAT combine(const AP_sequence* lefts, const AP_sequence *rights);
77
void partial_match(const AP_sequence* part, long *overlap, long *penalty) const;
78
ST_sequence_ml(AP_tree_root *rooti, ST_ML *st_ml);
80
AP_sequence *dup(void);
82
void set(const char *sequence);
83
void set_gb(GBDATA *gbd);
85
void set_sequence(); // start at st_ml->base
87
void go(const ST_sequence_ml *lefts, double leftl,
88
const ST_sequence_ml *rights, double rightl);
89
void ungo(); // undo go
91
void calc_out(ST_sequence_ml *sequence_of_brother, double dist);
96
typedef void (*AW_CB0)(AW_window*);
100
friend AP_tree *st_ml_convert_species_name_to_node(ST_ML *st_ml,
101
const char *species_name);
102
GB_HASH *hash_2_ap_tree; // hash table to get from name to tree_node
103
GB_HASH *keep_species_hash; // temporary hash to find
105
int *not_valid; // which columns are valid
107
ST_sequence_ml *do_tree(AP_tree *node);
108
void undo_tree(AP_tree *node); //opposite of do_tree
108
inline void transform(const ST_base_vector& in, ST_base_vector& out) const;
112
class ST_ML : virtual Noncopyable {
114
/* Note: Because we have only limited memory we split the
115
* sequence into several ST_MAX_SEQ_PART-sized parts during stat-calculation
118
char *alignment_name;
119
GB_HASH *hash_2_ap_tree; // hash table to get from name to tree_node
120
GB_HASH *keep_species_hash; // temporary hash to find ???
122
int *not_valid; // which columns are valid
124
AP_tree_root *tree_root;
125
int latest_modification; // last mod
126
size_t first_pos; // first..
127
size_t last_pos; // .. and last alignment position of current slice
130
AW_window *cb_window;
134
ColumnStat *column_stat;
135
// if column_stat == 0 -> rates and ttratio are owned by ST_ML,
136
// otherwise they belong to column_stat
137
const float *rates; // column independent
138
const float *ttratio; // column independent
140
ST_base_vector *base_frequencies; // column independent
141
ST_base_vector *inv_base_frequencies; // column independent
143
double max_dist; // max_dist for rate_matrices
144
double step_size; // max_dist/step_size matrices
146
int max_rate_matrices; // number of rate_matrices
147
ST_rate_matrix *rate_matrices; // one matrix for each distance
151
size_t distance_2_matrices_index(int distance) {
152
if (distance<0) distance = -distance; // same matrix for neg/pos distance
154
return distance >= max_rate_matrices
155
? (max_rate_matrices-1)
159
MostLikelySeq *getOrCreate_seq(AP_tree *node);
161
const MostLikelySeq *get_mostlikely_sequence(AP_tree *node);
162
void undo_tree(AP_tree *node); // opposite of get_mostlikely_sequence()
109
164
void insert_tree_into_hash_rek(AP_tree *node);
110
void create_matrizes(double max_disti, int nmatrizes);
165
void create_matrices(double max_disti, int nmatrices);
111
166
void create_frequencies();
112
168
static long delete_species(const char *key, long val, void *cd_st_ml);
114
AP_tree_root *tree_root;
115
int latest_modification; // last mod;
119
AW_window *aw_window;
122
float *ttratio; // column independent
123
ST_base_vector *base_frequencies; // column independent
124
ST_base_vector *inv_base_frequencies; // column independent
125
float *rates; // column independent
126
double max_dist; // max_dist for rate_matrizes
127
double step_size; // max_dist/step_size matrizes
129
ST_rate_matrix *rate_matrizes; // for each distance a new matrix
172
ST_ML(GBDATA *gb_main);
175
void create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment);
176
ColumnStat *get_column_statistic() { return column_stat; }
178
GB_ERROR calc_st_ml(const char *tree_name,
179
const char *alignment_name,
180
const char *species_names, // 0 -> all [marked] species (else species_names is a (char)1 separated list of species)
183
const WeightedFilter *weighted_filter) __ATTR__USERESULT;
187
const AP_filter *get_filter() const;
188
size_t get_filtered_length() const;
189
size_t get_alignment_length() const;
191
ST_rate_matrix& get_matrix_for(int distance) {
192
return rate_matrices[distance_2_matrices_index(distance)];
195
GBDATA *get_gb_main() const { return gb_main; }
196
const GBT_TREE *get_gbt_tree() const;
197
size_t count_species_in_tree() const;
199
AP_tree *find_node_by_name(const char *species_name);
201
void set_postcalc_callback(AW_CB0 postcalc_cb_, AW_window *cb_window_) {
202
postcalc_cb = postcalc_cb_;
203
cb_window = cb_window_;
205
void do_postcalc_callback() { if (postcalc_cb) postcalc_cb(cb_window); }
207
size_t get_first_pos() const { return first_pos; }
208
size_t get_last_pos() const { return last_pos; }
210
double get_step_size() const { return step_size; }
212
const ST_base_vector *get_inv_base_frequencies() const { return inv_base_frequencies; }
213
const ST_base_vector& get_base_frequency_at(size_t pos) const { return base_frequencies[pos]; }
214
float get_rate_at(size_t pos) const { return rates[pos]; }
132
216
void set_modified(int *what = 0);
133
void set_refresh(); // set flag for refresh
136
ST_ML(GBDATA *gb_main);
140
GB_ERROR init(const char *tree_name, const char *alignment_name,
141
const char *species_names, int marked_only,
142
const char *filter_string, AWT_csp *awt_csp);
143
// species_names is 0 -> all [marked] species (else species_names is a (char)1 seperated list of species)
144
// filter_string==0 -> no filter
146
void clear_all(); // delete all caches
149
ST_sequence_ml *get_ml_vectors(char *species_name, AP_tree *node,
150
int start_ali_pos, int end_ali_pos);
151
ST_ML_Color *get_color_string(char *species_name, AP_tree *node,
152
int start_ali_pos, int end_ali_pos);
154
int update_ml_likelihood(char *result[4], int *latest_update,
155
char *species_name, AP_tree *node);
217
void set_refresh(); // set flag for refresh
219
void clear_all(); // delete all caches
221
MostLikelySeq *get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos);
222
ST_ML_Color *get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos);
224
bool update_ml_likelihood(char *result[4], int& latest_update, const char *species_name, AP_tree *node);
157
226
int refresh_needed();
230
#error st_ml.hxx included twice