From 73df129c27d5b6eb01f353e70acb71ae1cf46f7d Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 13 May 2019 14:43:58 +0200 Subject: [PATCH 01/16] added tbe extra matrix --- src/tree/pll_tree.h | 4 +- src/tree/tbe_functions.c | 108 +++++++++++++++++++++++++++++++++------ 2 files changed, 94 insertions(+), 18 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index 6e32ddb..9ca72d8 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -717,7 +717,9 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, - pllmod_tbe_split_info_t* split_info); + pllmod_tbe_split_info_t* split_info, + unsigned int d, + double** extra_taxa_table); /* This is an old, naive and rather inefficient TBE computation method by Alexey. * Keep it here just in case */ diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 6ac79e5..f593806 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -49,7 +49,7 @@ int cb_full_traversal(pll_unode_t * node) } void postorder_init_recursive(pll_unode_t * node, unsigned int * index, - unsigned int * subtree_size, index_information_t* idx_infos) + unsigned int * subtree_size, index_information_t* idx_infos, unsigned int* clv_idx_to_postorder_idx) { if (node->next == NULL) { @@ -59,7 +59,7 @@ void postorder_init_recursive(pll_unode_t * node, unsigned int * index, pll_unode_t * snode = node->next; do { - postorder_init_recursive(snode->back, index, subtree_size, idx_infos); + postorder_init_recursive(snode->back, index, subtree_size, idx_infos, clv_idx_to_postorder_idx); snode = snode->next; } while (snode && snode != node); index_information_t info; @@ -68,18 +68,21 @@ void postorder_init_recursive(pll_unode_t * node, unsigned int * index, info.idx_right = node->next->next->back->clv_index; subtree_size[node->clv_index] = subtree_size[info.idx_left] + subtree_size[info.idx_right]; idx_infos[*index] = info; + if (clv_idx_to_postorder_idx) { + clv_idx_to_postorder_idx[info.idx] = *index; + } *index = *index + 1; } void postorder_init(pll_unode_t * root, unsigned int * trav_size, - unsigned int * subtree_size, index_information_t* idx_infos) + unsigned int * subtree_size, index_information_t* idx_infos, unsigned int* clv_idx_to_postorder_idx) { *trav_size = 0; - postorder_init_recursive(root->back, trav_size, subtree_size, idx_infos); - postorder_init_recursive(root, trav_size, subtree_size, idx_infos); + postorder_init_recursive(root->back, trav_size, subtree_size, idx_infos, clv_idx_to_postorder_idx); + postorder_init_recursive(root, trav_size, subtree_size, idx_infos, clv_idx_to_postorder_idx); } -tbe_data_t* init_tbe_data(pll_unode_t * root, unsigned int tip_count) +tbe_data_t* init_tbe_data(pll_unode_t * root, unsigned int tip_count, unsigned int* clv_idx_to_postorder_idx) { tbe_data_t* data = (tbe_data_t*) malloc(sizeof(tbe_data_t)); data->tip_count = tip_count; @@ -89,7 +92,7 @@ tbe_data_t* init_tbe_data(pll_unode_t * root, unsigned int tip_count) data->subtree_size = (unsigned int*) malloc(sizeof(unsigned int) * data->nodes_count); data->idx_infos = (index_information_t*) malloc(sizeof(index_information_t) * data->nodes_count); data->count_ones = (unsigned int*) malloc(sizeof(unsigned int) * data->nodes_count); - postorder_init(root, &data->trav_size, data->subtree_size, data->idx_infos); + postorder_init(root, &data->trav_size, data->subtree_size, data->idx_infos, clv_idx_to_postorder_idx); return data; } @@ -101,8 +104,59 @@ void free_tbe_data(tbe_data_t* data) free(data); } -unsigned int search_mindist(const pllmod_tbe_split_info_t* query, - tbe_data_t* data) +void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, + unsigned int* extra_taxa_count_for_split, unsigned int* clv_idx_to_postorder_idx) { + // check if the current node is a leaf node + if (act_node_idx < data->tip_count) { // leaf node + // update the array + extra_taxa_count_for_split[act_node_idx]++; + return; + } + + if (act_node_idx == best_clv_idx) { + want_ones_now = !want_ones_now; + } + + unsigned int n_s = data->subtree_size[act_node_idx]; + unsigned int ones_s = data->count_ones[act_node_idx]; + if ((want_ones_now && ones_s == n_s) || (!want_ones_now && ones_s == 0)) { + return; // we don't need to go further down this subtree. + } else { + unsigned int postorder_idx = clv_idx_to_postorder_idx[act_node_idx]; + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); + } +} + +void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, + unsigned int * extra_taxa_count_for_split, unsigned int* clv_idx_to_postorder_idx) { + if (dist == 1) { + // easy case. If dist == 1, the reference split has a subtree with only two taxa. Both taxa would be potential move candidates. + unsigned int moved_taxon = query->left_leaf_idx; // we arbitrarily choose the left leaf + extra_taxa_count_for_split[moved_taxon]++; + return; + } + + // we re-use the count_ones information that we got in the postorder-traversal done for finding mindist. + + // First question: Do we want to transform into only ones in subtree & zeros outside or only zeros in subtree & ones outside? + unsigned int root_idx = data->idx_infos[data->trav_size - 1].idx; + unsigned int n = data->tip_count; + unsigned int n_s = data->subtree_size[best_clv_idx]; + unsigned int ones_total = data->count_ones[root_idx]; + unsigned int zeros_total = n - ones_total; + unsigned int ones_s = data->count_ones[best_clv_idx]; + unsigned int zeros_s = n_s - ones_s; + unsigned int ops_ones_subtree = (n_s - ones_s) + (n - n_s) - (zeros_total - zeros_s); + unsigned int ops_zeros_subtree = (n_s - zeros_s) + (n - n_s) - (ones_total - ones_s); + int want_ones_outside = (ops_zeros_subtree <= ops_ones_subtree) ? 1 : 0; + + // now we do the preorder traversal, starting from the root node. + fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); +} + +unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int d, + unsigned int * extra_taxa_count_for_split, unsigned int* clv_idx_to_postorder_idx) { unsigned int min_dist = query->p - 1; unsigned int* count_ones = data->count_ones; @@ -121,6 +175,7 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, count_ones[i] = !query->subtree_res; } + size_t best_clv_idx = 0; for (size_t i = 0; i < data->trav_size; ++i) { unsigned int idx = data->idx_infos[i].idx; @@ -137,12 +192,17 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, if (dist_cand < min_dist) { min_dist = dist_cand; + best_clv_idx = idx; if (min_dist == 1) { - return min_dist; + break; } } } + + if (min_dist < d) { + fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); + } return min_dist; } @@ -251,7 +311,9 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, - pllmod_tbe_split_info_t* split_info) + pllmod_tbe_split_info_t* split_info, + unsigned int d, + double** extra_taxa_table) { unsigned int i; unsigned int split_count = tip_count - 3; @@ -266,14 +328,21 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, tip_count, split_count, NULL, 0); + assert(d == 0 || extra_taxa_table); + if (!bs_splits_hash) return PLL_FAILURE; tbe_data_t* tbe_data = NULL; + unsigned int* clv_idx_to_postorder_idx = NULL; // only needed for exta_taxa_table /* iterate over all splits of the reference tree */ for (i = 0; i < split_count; i++) { + unsigned int* extra_taxa_count_for_split = NULL; + if (extra_taxa_table) { + extra_taxa_count_for_split = extra_taxa_table[i]; + } pll_split_t ref_split = ref_splits[i]; if (pllmod_utree_split_hashtable_lookup(bs_splits_hash, ref_split, tip_count)) @@ -286,15 +355,22 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, if (split_info[i].p == 2) { // no need for further searching support[i] = 0.0; + if (d > 1) { + unsigned int moved_taxon = split_info[i].left_leaf_idx; // we arbitrarily choose the left leaf + extra_taxa_count_for_split[moved_taxon]++; + } continue; } - if (!tbe_data) - tbe_data = init_tbe_data(bs_root, tip_count); + if (!tbe_data) { + if (extra_taxa_table) { + clv_idx_to_postorder_idx = (unsigned int*) malloc((2*tip_count-1) * sizeof(unsigned int)); + } + tbe_data = init_tbe_data(bs_root, tip_count, clv_idx_to_postorder_idx); + } // else, we are in the search for minimum distance... - unsigned int min_hdist = search_mindist(&split_info[i], tbe_data); - //assert(min_hdist > 0); + unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, d, extra_taxa_count_for_split, clv_idx_to_postorder_idx); support[i] = 1.0 - (((double) min_hdist) / (split_info[i].p - 1)); } @@ -306,7 +382,6 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, return PLL_SUCCESS; } - /* This is an old, naive and rather inefficient TBE computation method by Alexey, * keep it here just in case */ PLL_EXPORT int pllmod_utree_tbe_naive(pll_split_t * ref_splits, @@ -410,4 +485,3 @@ PLL_EXPORT int pllmod_utree_tbe_naive(pll_split_t * ref_splits, return PLL_SUCCESS; } - From 398281c8b7770640f1f3c97f260c29c64656aa30 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Wed, 12 Jun 2019 13:50:26 +0200 Subject: [PATCH 02/16] added more extra output stuff --- src/tree/pll_tree.h | 34 ++++++++- src/tree/tbe_functions.c | 147 ++++++++++++++++++++++++++++++++------- 2 files changed, 155 insertions(+), 26 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index 9ca72d8..280eb4a 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -705,6 +705,23 @@ typedef struct refsplit_info unsigned int right_leaf_idx; } pllmod_tbe_split_info_t; +typedef struct tbe_extra_info +{ + unsigned int d; // maximum distance to be considered a "close enough" branch + unsigned short** extra_taxa_table; + unsigned long * extra_taxa_array; + double * extra_avg_dist_array; + unsigned long num_close_enough_branches; + unsigned long num_bs_trees; +} pllmod_tbe_extra_info_t; + +typedef struct tbe_extra_all_result +{ + double* support; + pllmod_tbe_extra_info_t extra_info; +} pllmod_tbe_extra_all_result_t; + + PLL_EXPORT pllmod_tbe_split_info_t * pllmod_utree_tbe_nature_init(pll_unode_t * ref_root, unsigned int tip_count, @@ -717,9 +734,22 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, + pllmod_tbe_split_info_t* split_info); + +PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, + pll_split_t * bs_splits, + pll_unode_t * bs_root, + unsigned int tip_count, + double * support, pllmod_tbe_split_info_t* split_info, - unsigned int d, - double** extra_taxa_table); + pllmod_tbe_extra_info_t* extra_info); + +PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, bool doTable, bool doArray, bool doTree); + +PLL_EXPORT void pllmod_tbe_extra_info_destroy(pllmod_tbe_extra_info_t * extra_info, unsigned int refsplit_count); + +PLL_EXPORT pllmod_tbe_extra_all_result_t* pllmod_tbe_nature_extra_all(pll_unode_t * ref_root, unsigned int tip_count, + pll_unode_t ** bs_roots, unsigned int bs_count); /* This is an old, naive and rather inefficient TBE computation method by Alexey. * Keep it here just in case */ diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index f593806..e132a61 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -104,12 +104,25 @@ void free_tbe_data(tbe_data_t* data) free(data); } +void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, unsigned int taxon_id) +{ + if (info) { + if (info->extra_taxa_array) { + info->extra_taxa_array[taxon_id]++; + info->num_close_enough_branches++; + } + if (info->extra_taxa_table) { + info->extra_taxa_table[refsplit_id][taxon_id]++; + } + } +} + void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - unsigned int* extra_taxa_count_for_split, unsigned int* clv_idx_to_postorder_idx) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id) { // check if the current node is a leaf node if (act_node_idx < data->tip_count) { // leaf node // update the array - extra_taxa_count_for_split[act_node_idx]++; + update_moved_taxa(extra_info, refsplit_id, act_node_idx); return; } @@ -123,17 +136,18 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ return; // we don't need to go further down this subtree. } else { unsigned int postorder_idx = clv_idx_to_postorder_idx[act_node_idx]; - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); } } void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - unsigned int * extra_taxa_count_for_split, unsigned int* clv_idx_to_postorder_idx) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id) { + assert(extra_info != NULL); if (dist == 1) { // easy case. If dist == 1, the reference split has a subtree with only two taxa. Both taxa would be potential move candidates. unsigned int moved_taxon = query->left_leaf_idx; // we arbitrarily choose the left leaf - extra_taxa_count_for_split[moved_taxon]++; + update_moved_taxa(extra_info, refsplit_id, moved_taxon); return; } @@ -152,11 +166,10 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d int want_ones_outside = (ops_zeros_subtree <= ops_ones_subtree) ? 1 : 0; // now we do the preorder traversal, starting from the root node. - fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); + fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); } -unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int d, - unsigned int * extra_taxa_count_for_split, unsigned int* clv_idx_to_postorder_idx) +unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id) { unsigned int min_dist = query->p - 1; unsigned int* count_ones = data->count_ones; @@ -200,8 +213,8 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* da } } - if (min_dist < d) { - fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_taxa_count_for_split, clv_idx_to_postorder_idx); + if (extra_info && min_dist < extra_info->d) { + fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); } return min_dist; } @@ -307,13 +320,22 @@ pllmod_tbe_split_info_t * pllmod_utree_tbe_nature_init(pll_unode_t * ref_root, } PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, + pll_split_t * bs_splits, + pll_unode_t* bs_root, + unsigned int tip_count, + double * support, + pllmod_tbe_split_info_t* split_info) +{ + return pllmod_utree_tbe_nature_extra(ref_splits, bs_splits, bs_root, tip_count, support, split_info, NULL); +} + +PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, pll_split_t * bs_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, pllmod_tbe_split_info_t* split_info, - unsigned int d, - double** extra_taxa_table) + pllmod_tbe_extra_info_t* extra_info) { unsigned int i; unsigned int split_count = tip_count - 3; @@ -324,25 +346,23 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, return PLL_FAILURE; } + if (extra_info) { + extra_info->num_bs_trees++; + } + bitv_hashtable_t * bs_splits_hash = pllmod_utree_split_hashtable_insert(NULL, bs_splits, tip_count, split_count, NULL, 0); - assert(d == 0 || extra_taxa_table); - if (!bs_splits_hash) return PLL_FAILURE; tbe_data_t* tbe_data = NULL; - unsigned int* clv_idx_to_postorder_idx = NULL; // only needed for exta_taxa_table + unsigned int* clv_idx_to_postorder_idx = NULL; // only needed for exta_taxa_table or extra_taxa_array /* iterate over all splits of the reference tree */ for (i = 0; i < split_count; i++) { - unsigned int* extra_taxa_count_for_split = NULL; - if (extra_taxa_table) { - extra_taxa_count_for_split = extra_taxa_table[i]; - } pll_split_t ref_split = ref_splits[i]; if (pllmod_utree_split_hashtable_lookup(bs_splits_hash, ref_split, tip_count)) @@ -355,22 +375,22 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, if (split_info[i].p == 2) { // no need for further searching support[i] = 0.0; - if (d > 1) { + if (extra_info && extra_info->d > 1) { unsigned int moved_taxon = split_info[i].left_leaf_idx; // we arbitrarily choose the left leaf - extra_taxa_count_for_split[moved_taxon]++; + update_moved_taxa(extra_info, i, moved_taxon); } continue; } if (!tbe_data) { - if (extra_taxa_table) { + if (extra_info) { clv_idx_to_postorder_idx = (unsigned int*) malloc((2*tip_count-1) * sizeof(unsigned int)); } tbe_data = init_tbe_data(bs_root, tip_count, clv_idx_to_postorder_idx); } // else, we are in the search for minimum distance... - unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, d, extra_taxa_count_for_split, clv_idx_to_postorder_idx); + unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, extra_info, clv_idx_to_postorder_idx, i); support[i] = 1.0 - (((double) min_hdist) / (split_info[i].p - 1)); } @@ -382,6 +402,85 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, return PLL_SUCCESS; } +PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, bool doTable, bool doArray, bool doTree) { + pllmod_tbe_extra_info_t * extra_info = malloc(sizeof(pllmod_tbe_extra_info_t)); + if (doArray) { + extra_info->extra_taxa_array = calloc(tip_count, sizeof(unsigned long)); + } + if (doTable) { + extra_info->extra_taxa_table = calloc(refsplit_count, sizeof(unsigned short*)); + int i; + for (i = 0; i < refsplit_count; ++i) { + extra_info->extra_taxa_table[i] = calloc(tip_count, sizeof(unsigned short)); + } + } + if (doTree) { + extra_info->extra_avg_dist_array = calloc(refsplit_count, sizeof(unsigned long)); + } + extra_info->num_bs_trees = 0; + return extra_info; +} + +PLL_EXPORT void pllmod_tbe_extra_info_destroy(pllmod_tbe_extra_info_t * extra_info, unsigned int refsplit_count) { + if (extra_info->extra_taxa_array) { + free(extra_info->extra_taxa_array); + } + if (extra_info->extra_taxa_table) { + unsigned int i; + for (i = 0; i < refsplit_count; ++i) { + free(extra_info->extra_taxa_table[i]); + } + free(extra_info->extra_taxa_table); + } + if (extra_info->extra_avg_dist_array) { + free(extra_info->extra_avg_dist_array); + } + free(extra_info); +} + +pllmod_tbe_extra_all_result_t * create_tbe_extra_all_result(unsigned int refsplit_count, unsigned int tip_count) { + pllmod_tbe_extra_all_result_t* result = malloc(sizeof(pllmod_tbe_extra_all_result_t)); + result->support = calloc(refsplit_count, sizeof(double)); + result->extra_info = pllmod_tbe_extra_info_create(refsplit_count, tip_count, true, true, true); + return result; +} + +PLL_EXPORT void pllmod_tbe_nature_extra_all_print(pllmod_tbe_extra_all_result_t * result, unsigned int refsplit_count, unsigned int tip_count, unsigned int bs_count, char* file_path) { + unsigned int i, j; + for (i = 0; i < refsplit_count; ++i) { + double val = (double) (result->support[i]) / bs_count; + // TODO: print val + } + for (i = 0; i < refsplit_count; ++i) { + for (j = 0; j < tip_count; ++j) { + double val = (double) (result->extra_info->extra_taxa_table[i][j]) / bs_count; + // TODO: print val + } + } + for (i = 0; i < tip_count; ++i) { + double val = (double) (result->extra_info->extra_taxa_array[i]) / result->extra_info->num_close_enough_branches; + // TODO: print val + } +} + +/*PLL_EXPORT pllmod_tbe_extra_all_result_t* pllmod_tbe_nature_extra_all(pll_unode_t * ref_root, unsigned int tip_count, + pll_unode_t ** bs_roots, unsigned int bs_count) +{ + pllmod_tbe_extra_all_result_t* result = create_tbe_extra_all_result(refsplit_count, tip_count); + + pllmod_tbe_split_info_t * refsplit_info = pllmod_utree_tbe_nature_init(ref_root, tip_count, split_to_node_map); + unsigned int i, j; + for (i = 0; i < bs_count; ++i) { + pllmod_utree_tbe_nature_extra(ref_splits, bs_splits, bs_roots[i], tip_count, result->support, refsplit_info, result->extra_info); + } + return result; +} + +PLL_EXPORT void pllmod_tbe_extra_all_fake_main() { + pllmod_tbe_extra_all_result_t* result = pllmod_tbe_nature_extra_all(ref_root, tip_count, bs_roots, bs_count); + pllmod_tbe_nature_extra_all_print(result, refsplit_count, tip_count, bs_count, file_path); +}*/ + /* This is an old, naive and rather inefficient TBE computation method by Alexey, * keep it here just in case */ PLL_EXPORT int pllmod_utree_tbe_naive(pll_split_t * ref_splits, From 8c3f328393587c24440e5a25eefbd3cc90a9a5fa Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Wed, 12 Jun 2019 14:27:44 +0200 Subject: [PATCH 03/16] fixed compile errors --- src/tree/pll_tree.h | 2 +- src/tree/tbe_functions.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index 280eb4a..1dcf601 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -718,7 +718,7 @@ typedef struct tbe_extra_info typedef struct tbe_extra_all_result { double* support; - pllmod_tbe_extra_info_t extra_info; + pllmod_tbe_extra_info_t* extra_info; } pllmod_tbe_extra_all_result_t; diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index e132a61..f026f7a 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -409,7 +409,7 @@ PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int r } if (doTable) { extra_info->extra_taxa_table = calloc(refsplit_count, sizeof(unsigned short*)); - int i; + unsigned int i; for (i = 0; i < refsplit_count; ++i) { extra_info->extra_taxa_table[i] = calloc(tip_count, sizeof(unsigned short)); } From ec3f0552f47511e316f3b3ba1da64275c8e6d603 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Wed, 12 Jun 2019 15:32:05 +0200 Subject: [PATCH 04/16] adapted to this weird cutoff value from booster --- src/tree/pll_tree.h | 5 +++-- src/tree/tbe_functions.c | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index 1dcf601..a897fbe 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -707,7 +707,8 @@ typedef struct refsplit_info typedef struct tbe_extra_info { - unsigned int d; // maximum distance to be considered a "close enough" branch + double tbe_cutoff; + unsigned int min_p; // minimum size of "light side" to be considered a "close enough" branch unsigned short** extra_taxa_table; unsigned long * extra_taxa_array; double * extra_avg_dist_array; @@ -744,7 +745,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, pllmod_tbe_split_info_t* split_info, pllmod_tbe_extra_info_t* extra_info); -PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, bool doTable, bool doArray, bool doTree); +PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, double tbe_cutoff, bool doTable, bool doArray, bool doTree); PLL_EXPORT void pllmod_tbe_extra_info_destroy(pllmod_tbe_extra_info_t * extra_info, unsigned int refsplit_count); diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index f026f7a..aaadc9d 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -213,8 +213,11 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* da } } - if (extra_info && min_dist < extra_info->d) { - fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); + if (extra_info && query->p >= extra_info->min_p) { + double norm_dist = ((double)min_dist) * 1.0 / (((double)query->p) - 1.0); + if (norm_dist <= extra_info->tbe_cutoff) { + fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); + } } return min_dist; } @@ -375,7 +378,8 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, if (split_info[i].p == 2) { // no need for further searching support[i] = 0.0; - if (extra_info && extra_info->d > 1) { + // double norm = ((double)min_dist) * 1.0 / (((double)split_info[i].p) - 1.0); + if (extra_info && 2 >= extra_info->min_p && 1.0 <= extra_info->tbe_cutoff) { unsigned int moved_taxon = split_info[i].left_leaf_idx; // we arbitrarily choose the left leaf update_moved_taxa(extra_info, i, moved_taxon); } @@ -402,7 +406,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, return PLL_SUCCESS; } -PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, bool doTable, bool doArray, bool doTree) { +PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, double tbe_cutoff, bool doTable, bool doArray, bool doTree) { pllmod_tbe_extra_info_t * extra_info = malloc(sizeof(pllmod_tbe_extra_info_t)); if (doArray) { extra_info->extra_taxa_array = calloc(tip_count, sizeof(unsigned long)); @@ -418,6 +422,8 @@ PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int r extra_info->extra_avg_dist_array = calloc(refsplit_count, sizeof(unsigned long)); } extra_info->num_bs_trees = 0; + extra_info->tbe_cutoff = tbe_cutoff; + extra_info->min_p = (unsigned int)(ceil(1.0/tbe_cutoff + 1.0)); return extra_info; } @@ -438,7 +444,7 @@ PLL_EXPORT void pllmod_tbe_extra_info_destroy(pllmod_tbe_extra_info_t * extra_in free(extra_info); } -pllmod_tbe_extra_all_result_t * create_tbe_extra_all_result(unsigned int refsplit_count, unsigned int tip_count) { +/*pllmod_tbe_extra_all_result_t * create_tbe_extra_all_result(unsigned int refsplit_count, unsigned int tip_count) { pllmod_tbe_extra_all_result_t* result = malloc(sizeof(pllmod_tbe_extra_all_result_t)); result->support = calloc(refsplit_count, sizeof(double)); result->extra_info = pllmod_tbe_extra_info_create(refsplit_count, tip_count, true, true, true); @@ -463,7 +469,7 @@ PLL_EXPORT void pllmod_tbe_nature_extra_all_print(pllmod_tbe_extra_all_result_t } } -/*PLL_EXPORT pllmod_tbe_extra_all_result_t* pllmod_tbe_nature_extra_all(pll_unode_t * ref_root, unsigned int tip_count, +PLL_EXPORT pllmod_tbe_extra_all_result_t* pllmod_tbe_nature_extra_all(pll_unode_t * ref_root, unsigned int tip_count, pll_unode_t ** bs_roots, unsigned int bs_count) { pllmod_tbe_extra_all_result_t* result = create_tbe_extra_all_result(refsplit_count, tip_count); From 96629b49b5834a9707af69c7e56469a5a2598965 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Wed, 12 Jun 2019 18:17:35 +0200 Subject: [PATCH 05/16] changed extra taxa table --- src/tree/pll_tree.h | 3 +-- src/tree/tbe_functions.c | 46 +++++++++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index a897fbe..5e8ade8 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -710,9 +710,8 @@ typedef struct tbe_extra_info double tbe_cutoff; unsigned int min_p; // minimum size of "light side" to be considered a "close enough" branch unsigned short** extra_taxa_table; - unsigned long * extra_taxa_array; + double * extra_taxa_array; double * extra_avg_dist_array; - unsigned long num_close_enough_branches; unsigned long num_bs_trees; } pllmod_tbe_extra_info_t; diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index aaadc9d..91fb63a 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -104,12 +104,12 @@ void free_tbe_data(tbe_data_t* data) free(data); } -void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, unsigned int taxon_id) +void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, unsigned int taxon_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { if (info) { if (info->extra_taxa_array) { - info->extra_taxa_array[taxon_id]++; - info->num_close_enough_branches++; + extra_taxa_array_single[taxon_id]++; + (*num_close_enough_branches)++; } if (info->extra_taxa_table) { info->extra_taxa_table[refsplit_id][taxon_id]++; @@ -118,11 +118,11 @@ void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, } void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { // check if the current node is a leaf node if (act_node_idx < data->tip_count) { // leaf node // update the array - update_moved_taxa(extra_info, refsplit_id, act_node_idx); + update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches); return; } @@ -136,18 +136,18 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ return; // we don't need to go further down this subtree. } else { unsigned int postorder_idx = clv_idx_to_postorder_idx[act_node_idx]; - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } } void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { assert(extra_info != NULL); if (dist == 1) { // easy case. If dist == 1, the reference split has a subtree with only two taxa. Both taxa would be potential move candidates. unsigned int moved_taxon = query->left_leaf_idx; // we arbitrarily choose the left leaf - update_moved_taxa(extra_info, refsplit_id, moved_taxon); + update_moved_taxa(extra_info, refsplit_id, moved_taxon, extra_taxa_array_single, num_close_enough_branches); return; } @@ -166,10 +166,10 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d int want_ones_outside = (ops_zeros_subtree <= ops_ones_subtree) ? 1 : 0; // now we do the preorder traversal, starting from the root node. - fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); + fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } -unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id) +unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { unsigned int min_dist = query->p - 1; unsigned int* count_ones = data->count_ones; @@ -216,7 +216,7 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* da if (extra_info && query->p >= extra_info->min_p) { double norm_dist = ((double)min_dist) * 1.0 / (((double)query->p) - 1.0); if (norm_dist <= extra_info->tbe_cutoff) { - fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id); + fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } } return min_dist; @@ -362,6 +362,12 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, tbe_data_t* tbe_data = NULL; unsigned int* clv_idx_to_postorder_idx = NULL; // only needed for exta_taxa_table or extra_taxa_array + unsigned int* extra_taxa_array_single = NULL; // only needed for extra_taxa_array + unsigned int num_close_enough_branches = 0; // only needed for extra_taxa_array + + if (extra_info && extra_info->extra_taxa_array) { + extra_taxa_array_single = calloc(tip_count, sizeof(unsigned int)); + } /* iterate over all splits of the reference tree */ for (i = 0; i < split_count; i++) @@ -381,7 +387,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, // double norm = ((double)min_dist) * 1.0 / (((double)split_info[i].p) - 1.0); if (extra_info && 2 >= extra_info->min_p && 1.0 <= extra_info->tbe_cutoff) { unsigned int moved_taxon = split_info[i].left_leaf_idx; // we arbitrarily choose the left leaf - update_moved_taxa(extra_info, i, moved_taxon); + update_moved_taxa(extra_info, i, moved_taxon, extra_taxa_array_single, &num_close_enough_branches); } continue; } @@ -394,10 +400,20 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, } // else, we are in the search for minimum distance... - unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, extra_info, clv_idx_to_postorder_idx, i); + unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, extra_info, clv_idx_to_postorder_idx, i, extra_taxa_array_single, &num_close_enough_branches); support[i] = 1.0 - (((double) min_hdist) / (split_info[i].p - 1)); } + // update extra taxa array + if (extra_info && extra_info->extra_taxa_array) { + for (i = 0; i < tip_count; ++i) { + if (extra_taxa_array_single[i] > 0) { + extra_info->extra_taxa_array[i] += (double) extra_taxa_array_single[i] / (double) num_close_enough_branches; + } + } + free(extra_taxa_array_single); + } + pllmod_utree_split_hashtable_destroy(bs_splits_hash); if (tbe_data) @@ -409,7 +425,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, double tbe_cutoff, bool doTable, bool doArray, bool doTree) { pllmod_tbe_extra_info_t * extra_info = malloc(sizeof(pllmod_tbe_extra_info_t)); if (doArray) { - extra_info->extra_taxa_array = calloc(tip_count, sizeof(unsigned long)); + extra_info->extra_taxa_array = calloc(tip_count, sizeof(double)); } if (doTable) { extra_info->extra_taxa_table = calloc(refsplit_count, sizeof(unsigned short*)); From 3a150aaf9ff93e3f089a5131e7a412697ddf9a6b Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 17 Jun 2019 16:31:46 +0200 Subject: [PATCH 06/16] fixed computation of taxa to move --- src/tree/tbe_functions.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 91fb63a..9267db9 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -24,6 +24,8 @@ #include "../pllmod_common.h" +unsigned int dbg_counter = 0; + typedef struct index_information { unsigned int idx; @@ -89,10 +91,23 @@ tbe_data_t* init_tbe_data(pll_unode_t * root, unsigned int tip_count, unsigned i data->tip_count_div_2 = tip_count / 2; data->trav_size = 0; data->nodes_count = 2 * tip_count - 2; - data->subtree_size = (unsigned int*) malloc(sizeof(unsigned int) * data->nodes_count); - data->idx_infos = (index_information_t*) malloc(sizeof(index_information_t) * data->nodes_count); - data->count_ones = (unsigned int*) malloc(sizeof(unsigned int) * data->nodes_count); + data->subtree_size = (unsigned int*) malloc(sizeof(unsigned int) * (data->nodes_count + 1)); + data->idx_infos = (index_information_t*) malloc(sizeof(index_information_t) * (data->nodes_count + 1)); + data->count_ones = (unsigned int*) malloc(sizeof(unsigned int) * (data->nodes_count + 1)); postorder_init(root, &data->trav_size, data->subtree_size, data->idx_infos, clv_idx_to_postorder_idx); + + // add new fake_root node for the preorder-extra-info-stuff + index_information_t root_info; + root_info.idx = data->nodes_count; + root_info.idx_left = root->clv_index; + root_info.idx_right = root->back->clv_index; + data->trav_size++; + data->subtree_size[data->nodes_count] = data->subtree_size[root->clv_index] + data->subtree_size[root->back->clv_index]; + data->idx_infos[data->trav_size - 1] = root_info; + if (clv_idx_to_postorder_idx) { + clv_idx_to_postorder_idx[data->nodes_count] = data->trav_size - 1; + } + return data; } @@ -115,6 +130,7 @@ void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, info->extra_taxa_table[refsplit_id][taxon_id]++; } } + dbg_counter++; } void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, @@ -122,7 +138,9 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ // check if the current node is a leaf node if (act_node_idx < data->tip_count) { // leaf node // update the array - update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches); + if ((want_ones_now && data->count_ones[act_node_idx] == 0) || (!want_ones_now && data->count_ones[act_node_idx] == 1)) { + update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches); + } return; } @@ -144,6 +162,7 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { assert(extra_info != NULL); + dbg_counter = 0; if (dist == 1) { // easy case. If dist == 1, the reference split has a subtree with only two taxa. Both taxa would be potential move candidates. unsigned int moved_taxon = query->left_leaf_idx; // we arbitrarily choose the left leaf @@ -155,10 +174,12 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d // First question: Do we want to transform into only ones in subtree & zeros outside or only zeros in subtree & ones outside? unsigned int root_idx = data->idx_infos[data->trav_size - 1].idx; + unsigned int n = data->tip_count; unsigned int n_s = data->subtree_size[best_clv_idx]; unsigned int ones_total = data->count_ones[root_idx]; - unsigned int zeros_total = n - ones_total; + unsigned int zeros_total = query->p; + assert(ones_total + zeros_total == n); unsigned int ones_s = data->count_ones[best_clv_idx]; unsigned int zeros_s = n_s - ones_s; unsigned int ops_ones_subtree = (n_s - ones_s) + (n - n_s) - (zeros_total - zeros_s); @@ -167,6 +188,8 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d // now we do the preorder traversal, starting from the root node. fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + + assert(dbg_counter == dist); } unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) From 5b40e2d58f08fff8e781927af100a919817c1cb5 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 17 Jun 2019 18:07:38 +0200 Subject: [PATCH 07/16] made computation of tbe extra array more like in booster --- src/tree/tbe_functions.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 9267db9..416e523 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -399,6 +399,10 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, if (pllmod_utree_split_hashtable_lookup(bs_splits_hash, ref_split, tip_count)) { + if (split_info[i].p >= extra_info->min_p) + { + num_close_enough_branches++; + } /* found identical split in a bootstrap tree -> assign full support */ support[i] = 1.0; continue; From 2dbe97bd7d000717fbcfdc36488e01ce1dc49d0a Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Tue, 18 Jun 2019 15:31:51 +0200 Subject: [PATCH 08/16] remove debug counter --- src/tree/tbe_functions.c | 49 ---------------------------------------- 1 file changed, 49 deletions(-) diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 416e523..04321ea 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -24,8 +24,6 @@ #include "../pllmod_common.h" -unsigned int dbg_counter = 0; - typedef struct index_information { unsigned int idx; @@ -130,7 +128,6 @@ void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, info->extra_taxa_table[refsplit_id][taxon_id]++; } } - dbg_counter++; } void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, @@ -162,7 +159,6 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { assert(extra_info != NULL); - dbg_counter = 0; if (dist == 1) { // easy case. If dist == 1, the reference split has a subtree with only two taxa. Both taxa would be potential move candidates. unsigned int moved_taxon = query->left_leaf_idx; // we arbitrarily choose the left leaf @@ -188,8 +184,6 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d // now we do the preorder traversal, starting from the root node. fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); - - assert(dbg_counter == dist); } unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) @@ -487,49 +481,6 @@ PLL_EXPORT void pllmod_tbe_extra_info_destroy(pllmod_tbe_extra_info_t * extra_in free(extra_info); } -/*pllmod_tbe_extra_all_result_t * create_tbe_extra_all_result(unsigned int refsplit_count, unsigned int tip_count) { - pllmod_tbe_extra_all_result_t* result = malloc(sizeof(pllmod_tbe_extra_all_result_t)); - result->support = calloc(refsplit_count, sizeof(double)); - result->extra_info = pllmod_tbe_extra_info_create(refsplit_count, tip_count, true, true, true); - return result; -} - -PLL_EXPORT void pllmod_tbe_nature_extra_all_print(pllmod_tbe_extra_all_result_t * result, unsigned int refsplit_count, unsigned int tip_count, unsigned int bs_count, char* file_path) { - unsigned int i, j; - for (i = 0; i < refsplit_count; ++i) { - double val = (double) (result->support[i]) / bs_count; - // TODO: print val - } - for (i = 0; i < refsplit_count; ++i) { - for (j = 0; j < tip_count; ++j) { - double val = (double) (result->extra_info->extra_taxa_table[i][j]) / bs_count; - // TODO: print val - } - } - for (i = 0; i < tip_count; ++i) { - double val = (double) (result->extra_info->extra_taxa_array[i]) / result->extra_info->num_close_enough_branches; - // TODO: print val - } -} - -PLL_EXPORT pllmod_tbe_extra_all_result_t* pllmod_tbe_nature_extra_all(pll_unode_t * ref_root, unsigned int tip_count, - pll_unode_t ** bs_roots, unsigned int bs_count) -{ - pllmod_tbe_extra_all_result_t* result = create_tbe_extra_all_result(refsplit_count, tip_count); - - pllmod_tbe_split_info_t * refsplit_info = pllmod_utree_tbe_nature_init(ref_root, tip_count, split_to_node_map); - unsigned int i, j; - for (i = 0; i < bs_count; ++i) { - pllmod_utree_tbe_nature_extra(ref_splits, bs_splits, bs_roots[i], tip_count, result->support, refsplit_info, result->extra_info); - } - return result; -} - -PLL_EXPORT void pllmod_tbe_extra_all_fake_main() { - pllmod_tbe_extra_all_result_t* result = pllmod_tbe_nature_extra_all(ref_root, tip_count, bs_roots, bs_count); - pllmod_tbe_nature_extra_all_print(result, refsplit_count, tip_count, bs_count, file_path); -}*/ - /* This is an old, naive and rather inefficient TBE computation method by Alexey, * keep it here just in case */ PLL_EXPORT int pllmod_utree_tbe_naive(pll_split_t * ref_splits, From 703c14394b9801a380826a9122e97df003ecdc2a Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Tue, 18 Jun 2019 15:33:23 +0200 Subject: [PATCH 09/16] removed unused tbe function --- src/tree/pll_tree.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index 5e8ade8..1d04d4d 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -715,13 +715,6 @@ typedef struct tbe_extra_info unsigned long num_bs_trees; } pllmod_tbe_extra_info_t; -typedef struct tbe_extra_all_result -{ - double* support; - pllmod_tbe_extra_info_t* extra_info; -} pllmod_tbe_extra_all_result_t; - - PLL_EXPORT pllmod_tbe_split_info_t * pllmod_utree_tbe_nature_init(pll_unode_t * ref_root, unsigned int tip_count, @@ -748,9 +741,6 @@ PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int r PLL_EXPORT void pllmod_tbe_extra_info_destroy(pllmod_tbe_extra_info_t * extra_info, unsigned int refsplit_count); -PLL_EXPORT pllmod_tbe_extra_all_result_t* pllmod_tbe_nature_extra_all(pll_unode_t * ref_root, unsigned int tip_count, - pll_unode_t ** bs_roots, unsigned int bs_count); - /* This is an old, naive and rather inefficient TBE computation method by Alexey. * Keep it here just in case */ PLL_EXPORT int pllmod_utree_tbe_naive(pll_split_t * ref_splits, From c4011bc8af260be9d009062a4577d1ccd8b18fc6 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Fri, 21 Jun 2019 11:54:08 +0200 Subject: [PATCH 10/16] fixed bug in species_to_move computation --- src/tree/tbe_functions.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 04321ea..26c9fd6 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -159,13 +159,6 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { assert(extra_info != NULL); - if (dist == 1) { - // easy case. If dist == 1, the reference split has a subtree with only two taxa. Both taxa would be potential move candidates. - unsigned int moved_taxon = query->left_leaf_idx; // we arbitrarily choose the left leaf - update_moved_taxa(extra_info, refsplit_id, moved_taxon, extra_taxa_array_single, num_close_enough_branches); - return; - } - // we re-use the count_ones information that we got in the postorder-traversal done for finding mindist. // First question: Do we want to transform into only ones in subtree & zeros outside or only zeros in subtree & ones outside? From 3b66783a7cbe88014fb437e29086598f34ba74de Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 1 Jul 2019 15:33:54 +0200 Subject: [PATCH 11/16] tbe functions with lock callbacks --- src/tree/pll_tree.h | 6 ++++-- src/tree/tbe_functions.c | 42 ++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index 1d04d4d..19ff433 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -727,7 +727,8 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, - pllmod_tbe_split_info_t* split_info); + pllmod_tbe_split_info_t* split_info, + void (*lock_callback)(bool )); PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, pll_split_t * bs_splits, @@ -735,7 +736,8 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, unsigned int tip_count, double * support, pllmod_tbe_split_info_t* split_info, - pllmod_tbe_extra_info_t* extra_info); + pllmod_tbe_extra_info_t* extra_info, + void (*lock_callback)(bool )); PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, double tbe_cutoff, bool doTable, bool doArray, bool doTree); diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 26c9fd6..f9d9485 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -117,9 +117,12 @@ void free_tbe_data(tbe_data_t* data) free(data); } -void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, unsigned int taxon_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) +void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, unsigned int taxon_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) { if (info) { + if (lock_callback) { + lock_callback(true); + } if (info->extra_taxa_array) { extra_taxa_array_single[taxon_id]++; (*num_close_enough_branches)++; @@ -127,16 +130,19 @@ void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, if (info->extra_taxa_table) { info->extra_taxa_table[refsplit_id][taxon_id]++; } + if (lock_callback) { + lock_callback(false); + } } } void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) { // check if the current node is a leaf node if (act_node_idx < data->tip_count) { // leaf node // update the array if ((want_ones_now && data->count_ones[act_node_idx] == 0) || (!want_ones_now && data->count_ones[act_node_idx] == 1)) { - update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches); + update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches, lock_callback); } return; } @@ -151,13 +157,13 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ return; // we don't need to go further down this subtree. } else { unsigned int postorder_idx = clv_idx_to_postorder_idx[act_node_idx]; - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); } } void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) { assert(extra_info != NULL); // we re-use the count_ones information that we got in the postorder-traversal done for finding mindist. @@ -176,10 +182,10 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d int want_ones_outside = (ops_zeros_subtree <= ops_ones_subtree) ? 1 : 0; // now we do the preorder traversal, starting from the root node. - fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); } -unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) +unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) { unsigned int min_dist = query->p - 1; unsigned int* count_ones = data->count_ones; @@ -226,7 +232,7 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* da if (extra_info && query->p >= extra_info->min_p) { double norm_dist = ((double)min_dist) * 1.0 / (((double)query->p) - 1.0); if (norm_dist <= extra_info->tbe_cutoff) { - fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); } } return min_dist; @@ -337,9 +343,10 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, - pllmod_tbe_split_info_t* split_info) + pllmod_tbe_split_info_t* split_info, + void (*lock_callback)(bool )) { - return pllmod_utree_tbe_nature_extra(ref_splits, bs_splits, bs_root, tip_count, support, split_info, NULL); + return pllmod_utree_tbe_nature_extra(ref_splits, bs_splits, bs_root, tip_count, support, split_info, NULL, lock_callback); } PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, @@ -348,7 +355,8 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, unsigned int tip_count, double * support, pllmod_tbe_split_info_t* split_info, - pllmod_tbe_extra_info_t* extra_info) + pllmod_tbe_extra_info_t* extra_info, + void (*lock_callback)(bool )) { unsigned int i; unsigned int split_count = tip_count - 3; @@ -401,7 +409,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, // double norm = ((double)min_dist) * 1.0 / (((double)split_info[i].p) - 1.0); if (extra_info && 2 >= extra_info->min_p && 1.0 <= extra_info->tbe_cutoff) { unsigned int moved_taxon = split_info[i].left_leaf_idx; // we arbitrarily choose the left leaf - update_moved_taxa(extra_info, i, moved_taxon, extra_taxa_array_single, &num_close_enough_branches); + update_moved_taxa(extra_info, i, moved_taxon, extra_taxa_array_single, &num_close_enough_branches, lock_callback); } continue; } @@ -414,7 +422,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, } // else, we are in the search for minimum distance... - unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, extra_info, clv_idx_to_postorder_idx, i, extra_taxa_array_single, &num_close_enough_branches); + unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, extra_info, clv_idx_to_postorder_idx, i, extra_taxa_array_single, &num_close_enough_branches, lock_callback); support[i] = 1.0 - (((double) min_hdist) / (split_info[i].p - 1)); } @@ -422,7 +430,13 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, if (extra_info && extra_info->extra_taxa_array) { for (i = 0; i < tip_count; ++i) { if (extra_taxa_array_single[i] > 0) { + if (lock_callback) { + lock_callback(true); + } extra_info->extra_taxa_array[i] += (double) extra_taxa_array_single[i] / (double) num_close_enough_branches; + if (lock_callback) { + lock_callback(false); + } } } free(extra_taxa_array_single); From 1c1793afc7f22690a8858af83c21081b74fd09de Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 1 Jul 2019 15:35:23 +0200 Subject: [PATCH 12/16] Revert "tbe functions with lock callbacks" This reverts commit 3b66783a7cbe88014fb437e29086598f34ba74de. --- src/tree/pll_tree.h | 6 ++---- src/tree/tbe_functions.c | 42 ++++++++++++++-------------------------- 2 files changed, 16 insertions(+), 32 deletions(-) diff --git a/src/tree/pll_tree.h b/src/tree/pll_tree.h index 19ff433..1d04d4d 100644 --- a/src/tree/pll_tree.h +++ b/src/tree/pll_tree.h @@ -727,8 +727,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, - pllmod_tbe_split_info_t* split_info, - void (*lock_callback)(bool )); + pllmod_tbe_split_info_t* split_info); PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, pll_split_t * bs_splits, @@ -736,8 +735,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, unsigned int tip_count, double * support, pllmod_tbe_split_info_t* split_info, - pllmod_tbe_extra_info_t* extra_info, - void (*lock_callback)(bool )); + pllmod_tbe_extra_info_t* extra_info); PLL_EXPORT pllmod_tbe_extra_info_t * pllmod_tbe_extra_info_create(unsigned int refsplit_count, unsigned int tip_count, double tbe_cutoff, bool doTable, bool doArray, bool doTree); diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index f9d9485..26c9fd6 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -117,12 +117,9 @@ void free_tbe_data(tbe_data_t* data) free(data); } -void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, unsigned int taxon_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) +void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, unsigned int taxon_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { if (info) { - if (lock_callback) { - lock_callback(true); - } if (info->extra_taxa_array) { extra_taxa_array_single[taxon_id]++; (*num_close_enough_branches)++; @@ -130,19 +127,16 @@ void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, if (info->extra_taxa_table) { info->extra_taxa_table[refsplit_id][taxon_id]++; } - if (lock_callback) { - lock_callback(false); - } } } void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { // check if the current node is a leaf node if (act_node_idx < data->tip_count) { // leaf node // update the array if ((want_ones_now && data->count_ones[act_node_idx] == 0) || (!want_ones_now && data->count_ones[act_node_idx] == 1)) { - update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches, lock_callback); + update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches); } return; } @@ -157,13 +151,13 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ return; // we don't need to go further down this subtree. } else { unsigned int postorder_idx = clv_idx_to_postorder_idx[act_node_idx]; - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } } void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, - pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) { + pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { assert(extra_info != NULL); // we re-use the count_ones information that we got in the postorder-traversal done for finding mindist. @@ -182,10 +176,10 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d int want_ones_outside = (ops_zeros_subtree <= ops_ones_subtree) ? 1 : 0; // now we do the preorder traversal, starting from the root node. - fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); + fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } -unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches, void (*lock_callback)(bool )) +unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { unsigned int min_dist = query->p - 1; unsigned int* count_ones = data->count_ones; @@ -232,7 +226,7 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* da if (extra_info && query->p >= extra_info->min_p) { double norm_dist = ((double)min_dist) * 1.0 / (((double)query->p) - 1.0); if (norm_dist <= extra_info->tbe_cutoff) { - fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches, lock_callback); + fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } } return min_dist; @@ -343,10 +337,9 @@ PLL_EXPORT int pllmod_utree_tbe_nature(pll_split_t * ref_splits, pll_unode_t* bs_root, unsigned int tip_count, double * support, - pllmod_tbe_split_info_t* split_info, - void (*lock_callback)(bool )) + pllmod_tbe_split_info_t* split_info) { - return pllmod_utree_tbe_nature_extra(ref_splits, bs_splits, bs_root, tip_count, support, split_info, NULL, lock_callback); + return pllmod_utree_tbe_nature_extra(ref_splits, bs_splits, bs_root, tip_count, support, split_info, NULL); } PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, @@ -355,8 +348,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, unsigned int tip_count, double * support, pllmod_tbe_split_info_t* split_info, - pllmod_tbe_extra_info_t* extra_info, - void (*lock_callback)(bool )) + pllmod_tbe_extra_info_t* extra_info) { unsigned int i; unsigned int split_count = tip_count - 3; @@ -409,7 +401,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, // double norm = ((double)min_dist) * 1.0 / (((double)split_info[i].p) - 1.0); if (extra_info && 2 >= extra_info->min_p && 1.0 <= extra_info->tbe_cutoff) { unsigned int moved_taxon = split_info[i].left_leaf_idx; // we arbitrarily choose the left leaf - update_moved_taxa(extra_info, i, moved_taxon, extra_taxa_array_single, &num_close_enough_branches, lock_callback); + update_moved_taxa(extra_info, i, moved_taxon, extra_taxa_array_single, &num_close_enough_branches); } continue; } @@ -422,7 +414,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, } // else, we are in the search for minimum distance... - unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, extra_info, clv_idx_to_postorder_idx, i, extra_taxa_array_single, &num_close_enough_branches, lock_callback); + unsigned int min_hdist = search_mindist(&split_info[i], tbe_data, extra_info, clv_idx_to_postorder_idx, i, extra_taxa_array_single, &num_close_enough_branches); support[i] = 1.0 - (((double) min_hdist) / (split_info[i].p - 1)); } @@ -430,13 +422,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, if (extra_info && extra_info->extra_taxa_array) { for (i = 0; i < tip_count; ++i) { if (extra_taxa_array_single[i] > 0) { - if (lock_callback) { - lock_callback(true); - } extra_info->extra_taxa_array[i] += (double) extra_taxa_array_single[i] / (double) num_close_enough_branches; - if (lock_callback) { - lock_callback(false); - } } } free(extra_taxa_array_single); From 224499378c3e12e2efde0de2599fedcf83009905 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 1 Jul 2019 15:52:48 +0200 Subject: [PATCH 13/16] added parallelization of TBE support via OpenMP --- src/CMakeLists.txt | 4 ++-- src/tree/tbe_functions.c | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2ebf181..c961d36 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required (VERSION 2.8.10 FATAL_ERROR) -set(PLLMOD_CFLAGS "-Wall -Wsign-compare -D_GNU_SOURCE -std=c99 -O3") +set(PLLMOD_CFLAGS "-Wall -Wsign-compare -D_GNU_SOURCE -std=c99 -O3 -fopenmp") if(PLLMOD_DEBUG) - set(PLLMOD_CFLAGS "${PLLMOD_CFLAGS} -DDEBUG") + set(PLLMOD_CFLAGS "${PLLMOD_CFLAGS} -DDEBUG -fopenmp") endif() set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-undefined -Wl,dynamic_lookup") diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 26c9fd6..b88a229 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -121,10 +121,14 @@ void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, { if (info) { if (info->extra_taxa_array) { +#pragma omp critical + { extra_taxa_array_single[taxon_id]++; (*num_close_enough_branches)++; + } } if (info->extra_taxa_table) { +#pragma omp atomic info->extra_taxa_table[refsplit_id][taxon_id]++; } } @@ -380,6 +384,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, } /* iterate over all splits of the reference tree */ +#pragma omp parallel for schedule(dynamic) for (i = 0; i < split_count; i++) { pll_split_t ref_split = ref_splits[i]; @@ -388,6 +393,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, { if (split_info[i].p >= extra_info->min_p) { +#pragma omp atomic num_close_enough_branches++; } /* found identical split in a bootstrap tree -> assign full support */ From 7aeb7138470502c5b4181c1d9aedf75df2811fa5 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 1 Jul 2019 16:33:54 +0200 Subject: [PATCH 14/16] fixed bug in OpenMP parallelization (reuse of array only works single-threaded), fixed bug in early-stop (only works without extra info) --- src/tree/tbe_functions.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index b88a229..47482a8 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -35,7 +35,6 @@ typedef struct tbe_data { unsigned int* subtree_size; index_information_t* idx_infos; - unsigned int* count_ones; unsigned int nodes_count; unsigned int trav_size; unsigned int tip_count; @@ -91,7 +90,6 @@ tbe_data_t* init_tbe_data(pll_unode_t * root, unsigned int tip_count, unsigned i data->nodes_count = 2 * tip_count - 2; data->subtree_size = (unsigned int*) malloc(sizeof(unsigned int) * (data->nodes_count + 1)); data->idx_infos = (index_information_t*) malloc(sizeof(index_information_t) * (data->nodes_count + 1)); - data->count_ones = (unsigned int*) malloc(sizeof(unsigned int) * (data->nodes_count + 1)); postorder_init(root, &data->trav_size, data->subtree_size, data->idx_infos, clv_idx_to_postorder_idx); // add new fake_root node for the preorder-extra-info-stuff @@ -101,6 +99,7 @@ tbe_data_t* init_tbe_data(pll_unode_t * root, unsigned int tip_count, unsigned i root_info.idx_right = root->back->clv_index; data->trav_size++; data->subtree_size[data->nodes_count] = data->subtree_size[root->clv_index] + data->subtree_size[root->back->clv_index]; + assert(data->subtree_size[data->nodes_count] == tip_count); data->idx_infos[data->trav_size - 1] = root_info; if (clv_idx_to_postorder_idx) { clv_idx_to_postorder_idx[data->nodes_count] = data->trav_size - 1; @@ -113,7 +112,6 @@ void free_tbe_data(tbe_data_t* data) { free(data->subtree_size); free(data->idx_infos); - free(data->count_ones); free(data); } @@ -134,12 +132,12 @@ void update_moved_taxa(pllmod_tbe_extra_info_t* info, unsigned int refsplit_id, } } -void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, +void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_now, tbe_data_t* data, unsigned int* count_ones, unsigned int dist, unsigned int best_clv_idx, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { // check if the current node is a leaf node if (act_node_idx < data->tip_count) { // leaf node // update the array - if ((want_ones_now && data->count_ones[act_node_idx] == 0) || (!want_ones_now && data->count_ones[act_node_idx] == 1)) { + if ((want_ones_now && count_ones[act_node_idx] == 0) || (!want_ones_now && count_ones[act_node_idx] == 1)) { update_moved_taxa(extra_info, refsplit_id, act_node_idx, extra_taxa_array_single, num_close_enough_branches); } return; @@ -150,17 +148,17 @@ void fill_extra_taxa_entries_recursive(unsigned int act_node_idx, int want_ones_ } unsigned int n_s = data->subtree_size[act_node_idx]; - unsigned int ones_s = data->count_ones[act_node_idx]; + unsigned int ones_s = count_ones[act_node_idx]; if ((want_ones_now && ones_s == n_s) || (!want_ones_now && ones_s == 0)) { return; // we don't need to go further down this subtree. } else { unsigned int postorder_idx = clv_idx_to_postorder_idx[act_node_idx]; - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); - fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_left, want_ones_now, data, count_ones, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries_recursive(data->idx_infos[postorder_idx].idx_right, want_ones_now, data, count_ones, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } } -void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int dist, unsigned int best_clv_idx, +void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* data, unsigned int* count_ones, unsigned int dist, unsigned int best_clv_idx, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { assert(extra_info != NULL); // we re-use the count_ones information that we got in the postorder-traversal done for finding mindist. @@ -170,23 +168,23 @@ void fill_extra_taxa_entries(const pllmod_tbe_split_info_t* query, tbe_data_t* d unsigned int n = data->tip_count; unsigned int n_s = data->subtree_size[best_clv_idx]; - unsigned int ones_total = data->count_ones[root_idx]; + unsigned int ones_total = count_ones[root_idx]; unsigned int zeros_total = query->p; assert(ones_total + zeros_total == n); - unsigned int ones_s = data->count_ones[best_clv_idx]; + unsigned int ones_s = count_ones[best_clv_idx]; unsigned int zeros_s = n_s - ones_s; unsigned int ops_ones_subtree = (n_s - ones_s) + (n - n_s) - (zeros_total - zeros_s); unsigned int ops_zeros_subtree = (n_s - zeros_s) + (n - n_s) - (ones_total - ones_s); int want_ones_outside = (ops_zeros_subtree <= ops_ones_subtree) ? 1 : 0; // now we do the preorder traversal, starting from the root node. - fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries_recursive(root_idx, want_ones_outside, data, count_ones, dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* data, pllmod_tbe_extra_info_t * extra_info, unsigned int* clv_idx_to_postorder_idx, unsigned int refsplit_id, unsigned int* extra_taxa_array_single, unsigned int* num_close_enough_branches) { unsigned int min_dist = query->p - 1; - unsigned int* count_ones = data->count_ones; + unsigned int* count_ones = malloc(sizeof(unsigned int) * (data->nodes_count + 1)); // initialize the leaf node informations... for (size_t i = 0; i < query->left_leaf_idx; ++i) @@ -220,7 +218,7 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* da { min_dist = dist_cand; best_clv_idx = idx; - if (min_dist == 1) + if (min_dist == 1 && !extra_info) { break; } @@ -230,9 +228,11 @@ unsigned int search_mindist(const pllmod_tbe_split_info_t* query, tbe_data_t* da if (extra_info && query->p >= extra_info->min_p) { double norm_dist = ((double)min_dist) * 1.0 / (((double)query->p) - 1.0); if (norm_dist <= extra_info->tbe_cutoff) { - fill_extra_taxa_entries(query, data, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); + fill_extra_taxa_entries(query, data, count_ones, min_dist, best_clv_idx, extra_info, clv_idx_to_postorder_idx, refsplit_id, extra_taxa_array_single, num_close_enough_branches); } } + free(count_ones); + return min_dist; } From e518b63a23c9702310778cf5cd23a514dd44cc8e Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 8 Jul 2019 13:24:49 +0200 Subject: [PATCH 15/16] added missing check for non-existing extra info --- src/tree/tbe_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index 47482a8..dee2742 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -391,7 +391,7 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, if (pllmod_utree_split_hashtable_lookup(bs_splits_hash, ref_split, tip_count)) { - if (split_info[i].p >= extra_info->min_p) + if (extra_info && (split_info[i].p >= extra_info->min_p)) { #pragma omp atomic num_close_enough_branches++; From 6e812ef87e9254c02142a939516890166471e7d6 Mon Sep 17 00:00:00 2001 From: Sarah Lutteropp Date: Mon, 8 Jul 2019 14:46:18 +0200 Subject: [PATCH 16/16] fixed race condition in TBE data initialization --- src/tree/tbe_functions.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/tree/tbe_functions.c b/src/tree/tbe_functions.c index dee2742..67bad73 100644 --- a/src/tree/tbe_functions.c +++ b/src/tree/tbe_functions.c @@ -413,10 +413,13 @@ PLL_EXPORT int pllmod_utree_tbe_nature_extra(pll_split_t * ref_splits, } if (!tbe_data) { - if (extra_info) { - clv_idx_to_postorder_idx = (unsigned int*) malloc((2*tip_count-1) * sizeof(unsigned int)); + #pragma omp critical + if (!tbe_data) { + if (extra_info) { + clv_idx_to_postorder_idx = (unsigned int*) malloc((2*tip_count-1) * sizeof(unsigned int)); + } + tbe_data = init_tbe_data(bs_root, tip_count, clv_idx_to_postorder_idx); } - tbe_data = init_tbe_data(bs_root, tip_count, clv_idx_to_postorder_idx); } // else, we are in the search for minimum distance...