From 4f8323c460824721892d5e805bc8bbf382121162 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Mon, 10 Feb 2025 15:01:26 +0000 Subject: [PATCH 01/25] Error checking --- R/Concordance.R | 10 +++++++++- tests/testthat/test-Concordance.R | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/R/Concordance.R b/R/Concordance.R index f47ed056c..6e896df89 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -84,7 +84,15 @@ QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - dataset <- dataset[TipLabels(tree)] + if (!inherits(dataset, "phyDat")) { + stop("`dataset` must be a phyDat object.") + } + tipLabels <- intersect(TipLabels(tree), names(dataset)) + if (!length(tipLabels)) { + warning("No overlap between tree labels and dataset.") + return(NULL) + } + dataset <- dataset[tipLabels, drop = FALSE] splits <- as.Splits(tree, dataset) logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), logical(NTip(dataset))) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index c3aecd83a..b4ffdc786 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -8,6 +8,13 @@ test_that("_Concordance() handles null input", { expect_warning(expect_null(SharedPhylogeneticConcordance(BalancedTree(8), NULL))) }) +test_that("_Concordance() handles tip mismatch", { + char <- MatrixToPhyDat(cbind(c(a = 0, b = 0, c = 0, d = 1, e = 1))) + tree <- BalancedTree(5) + expect_warning(expect_null(QuartetConcordance(tree, char)), + "No overlap between tree labels and dataset.") +}) + test_that("QuartetConcordance() works", { tree <- BalancedTree(8) splits <- as.Splits(tree) @@ -18,6 +25,8 @@ test_that("QuartetConcordance() works", { 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 2, 3, 0, 1, 2, 3, 0), 9, dimnames = list(paste0("t", 1:9), NULL)) + expect_error(QuartetConcordance(tree, mataset), + "`dataset` must be a phyDat object") dat <- MatrixToPhyDat(mataset) expect_equal(unname(QuartetConcordance(tree, dat[, 1])), rep(1, 5)) # plot(tree); nodelabels(); From 76f9d2a49896f07bb8c3f810f4b5b53366b73bb0 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Mon, 10 Feb 2025 15:01:35 +0000 Subject: [PATCH 02/25] Temp Test script --- inst/concordance.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 inst/concordance.R diff --git a/inst/concordance.R b/inst/concordance.R new file mode 100644 index 000000000..7521facbc --- /dev/null +++ b/inst/concordance.R @@ -0,0 +1,22 @@ +devtools::load_all() +use("ape") +use("TreeTools") + +labels <- letters[1:5] +tr <- PectinateTree(labels) +plot(tr) +nodelabels(adj = c(4, 0.5)) + char <- MatrixToPhyDat(cbind( + c(a = 0, b = 0, c = 0, d = 1, e = 1), # 100% concordant with both + c(0, 0, 1, 0, 1), # discordant + c(0, 1, 0, 0, 1) + )) +QuartetConcordance(tr, char, weight = FALSE) + +Conc <- lapply(labels, function(lab) { + quartet <- DropTip(tr, lab) + for (i in 1:3) { + message(setdiff(labels, lab), " char ", i, ": ", QuartetConcordance(quartet, char[, i])) + } +}) + From 586b9223abf5a2d2c65e058c12926368a7b7009f Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Tue, 11 Feb 2025 14:53:01 +0000 Subject: [PATCH 03/25] Draft of method = "minh" --- DESCRIPTION | 2 +- NEWS.md | 4 + R/Concordance.R | 247 ++++++++++++++++++++++-------- man/SiteConcordance.Rd | 40 +++-- tests/testthat/test-Concordance.R | 41 +++++ 5 files changed, 253 insertions(+), 81 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf99b9165..929fe0c13 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeSearch Title: Phylogenetic Analysis with Discrete Character Data -Version: 1.5.1.9005 +Version: 1.5.1.9006 Authors@R: c( person( "Martin R.", 'Smith', diff --git a/NEWS.md b/NEWS.md index 4b8fa9af3..41ba848ba 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# TreeSearch 1.5.1.9006 (2025-02) + +- `QuartetConcordance(method = "minh")` uses the approach of Minh et al. (2020) + # TreeSearch 1.5.1.9005 (2025-02) - Support for ordered (additive) characters diff --git a/R/Concordance.R b/R/Concordance.R index 6e896df89..1e93593a8 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -1,6 +1,6 @@ #' Calculate site concordance factor #' -#' The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure +#' The site concordance factor is a measure #' of the strength of support that the dataset presents for a given split in a #' tree. #' @@ -12,21 +12,33 @@ #' with any tree that groups the first two leaves together to the exclusion #' of the second. #' -#' By default, the reported value weights each site by the number of quartets -#' it is decisive for. This value can be interpreted as the proportion of -#' all decisive quartets that are concordant with a split. -#' If `weight = FALSE`, the reported value is the mean of the concordance -#' value for each site. +#' The `methods` argument selects from different methods for quantifying the +#' concordance associated with a given edge of a tree. +#' +#' If `method = "split"` (the default), the concordance is calculated by +#' counting all quartets that are decisive for a split, i.e. for the split +#' $A|B$, any quartet that contains two leaves from set $A$ and two from set $B$. +#' This value can be interpreted as the proportion of all decisive quartets +#' that are concordant with a split. +#' If `method = "sitemean"`, the reported value is the mean of the concordance +#' value for each site. #' Consider a split associated with two sites: #' one that is concordant with 25% of 96 decisive quartets, and #' a second that is concordant with 75% of 4 decisive quartets. -#' If `weight = TRUE`, the split concordance will be 24 + 3 / 96 + 4 = 27%. -#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. +#' `method = "split"` returns a value of 24 + 3 / 96 + 4 = 27%. +#' `method = "sitemean"` returns mean(75%, 25%) = 50%. +#' +#' `method = "minh"` uses the approach of +#' \insertCite{Minh2020;textual}{TreeSearch}. +#' Briefly, this interprets each edge as defining *four* clades, and counts +#' the status of quartets that contain exactly one leaf from each of these +#' clades. This is a subset of the quartets considered by other methods. #' -#' `QuartetConcordance()` is computed exactly, using all quartets, where as -#' other implementations (e.g. IQ-TREE) follow +#' +#' `QuartetConcordance()` is computed exactly, using all quartets. +#' Other implementations (e.g. IQ-TREE) follow #' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample -#' of quartets for a faster, if potentially less accurate, computation. +#' of quartets for a faster computation, potentially at the expense of accuracy. #' # `ClusteringConcordance()` and `PhylogeneticConcordance()` respectively report # the proportion of clustering information and phylogenetic information @@ -44,8 +56,9 @@ #' #' @template treeParam #' @template datasetParam -#' @param weight Logical specifying whether to weight sites according to the -#' number of quartets they are decisive for. +#' @param method Character vector specifying which concordance measures to +#' calculate. See details section for available options. +#' #' #' #' @@ -79,7 +92,8 @@ #' @name SiteConcordance #' @family split support functions #' @export -QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { +# TODO support `method = c("split", "sitemean", "minh")` +QuartetConcordance <- function (tree, dataset = NULL, method = "split") { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -93,62 +107,163 @@ QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { return(NULL) } dataset <- dataset[tipLabels, drop = FALSE] - splits <- as.Splits(tree, dataset) - logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), - logical(NTip(dataset))) - characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) - cli_progress_bar(name = "Quartet concordance", total = dim(logiSplits)[[2]]) - setNames(apply(logiSplits, 2, function (split) { - cli_progress_update(1, .envir = parent.frame(2)) - quarts <- apply(characters, 2, function (char) { - tab <- table(split, char) - nCol <- dim(tab)[[2]] - if (nCol > 1L) { - # Consider the case - # split 0 1 2 - # FALSE 2 3 0 - # TRUE 0 4 2 - # - # Concordant quartets with bin i = 1 (character 0) and bin j = 2 - # (character 1) include any combination of the two taxa from - # [FALSE, 0], and the two taxa chosen from the four in [TRUE, 1], - # i.e. choose(2, 2) * choose(4, 2) - concordant <- sum(vapply(seq_len(nCol), function (i) { - inBinI <- tab[1, i] - iChoices <- choose(inBinI, 2) - sum(vapply(seq_len(nCol)[-i], function (j) { - inBinJ <- tab[2, j] - iChoices * choose(inBinJ, 2) - }, 1)) - }, 1)) - - # To be discordant, we must select a pair of taxa from TT and from FF; - # and the character states must group each T with an F - # e.g. T0 T1 F0 F1 - # T0 T1 F0 F2 would not be discordant - just uninformative - discordant <- sum(apply(combn(nCol, 2), 2, function (ij) prod(tab[, ij]))) - - # Only quartets that include two T and two F can be decisive - # Quartets must also include two pairs of characters - decisive <- concordant + discordant - - # Return the numerator and denominatory of equation 2 in - # Minh et al. 2020 - c(concordant, decisive) + if (method == "minh") { + if (attr(tree, "order") != "preorder") { + stop("Tree must be in preorder; try `tree <- Preorder(tree)`") + } + edge <- tree[["edge"]] + parent <- edge[, 1] + child <- edge[, 2] + nTip <- NTip(tree) + nodes <- nTip + seq_len(nTip - 1) + parentEdge <- match(nodes, child) + descended <- DescendantTips(parent, child) + childEdges <- vapply(nodes, function(x) + which(parent == x), integer(2)) + daughters <- matrix(child[childEdges], 2) + rootNode <- nTip + 1 + rootChildren <- child[parent == rootNode] + nonRootChildren <- setdiff(nodes, c(rootNode, rootChildren)) + + # TODO pull out into a standalone function, perhaps in TreeTools + # nodes[-1] omits the root node, which defines no quartet + quarters <- cbind( + if (all(rootChildren > nTip)) { + ret <- integer(nTip) + groups <- as.numeric(childEdges[, rootChildren - nTip]) + for (i in 0:3) { + ret[descended[groups[[i + 1]], ]] <- i + } + `dimnames<-`(cbind(ret), list(TipLabels(tree), rootChildren[[1]])) + }, + `dimnames<-`(vapply(nonRootChildren, function(n) { + ret <- integer(nTip) + if (parent[[parentEdge[[n - nTip]]]] == rootNode) { + + } + ret[apply(descended[childEdges[, parent[parentEdge[n - nTip]] - nTip], ], + 2, any)] <- 1L + ce <- childEdges[, n - nTip] + # Overprint with own children + ret[descended[ce[[1]], ]] <- 2L + ret[descended[ce[[2]], ]] <- 3L + ret + }, integer(nTip)), list(TipLabels(tree), nonRootChildren)) + ) + + cli_progress_bar(name = "Quartet concordance", total = dim(quarters)[[2]]) + on.exit(cli_progress_done(.envir = parent.frame(2))) + concDeci <- apply(quarters, 2, function (q) { + #cli_progress_update(1, .envir = parent.frame(2)) + quarts <- apply(characters, 2, function (char) { + tab <- table(q, char) + nCol <- dim(tab)[[2]] + if (nCol > 1L) { + + # RULES: + # We must pick one leaf per row + # To be parsimony informative, we must pick two leaves from each of + # two columns + # To be concordant, leaves 1 and 2 must come from the same column + + # Example table: + # q 0 1 2 + # A 2 3 0 + # B 2 3 2 + # C 0 1 2 + # D 2 1 2 + # + + concordant <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { + i <- ij[[1]] + j <- ij[[2]] + # Taxa from A and B from column i, C and D from j + prod(tab[1:2, i], tab[3:4, j]) + + # Taxa from A and B from column j, C and D from i + prod(tab[1:2, j], tab[3:4, i]) + })) + + decisive <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { + # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 + i <- ij[[1]] + j <- ij[[2]] + sum(apply(combn(4, 2), 2, function(kl) { + # We can pick taxa AB, AC, AD, BC, BD, CD from column i, + # and the other pair from col j + pair1 <- kl + pair2 <- (1:4)[-kl] + prod(tab[pair1, i], tab[pair2, j]) + })) + })) + + c(concordant, decisive) + } else { + c(0L, 0L) + } + }) + rowSums(quarts) + }) + concDeci[1, ] / concDeci[2, ] + } else { + splits <- as.Splits(tree, dataset) + logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), + logical(NTip(dataset))) + cli_progress_bar(name = "Quartet concordance", total = dim(logiSplits)[[2]]) + on.exit(cli_progress_done(.envir = parent.frame(2))) + setNames(apply(logiSplits, 2, function (split) { + cli_progress_update(1, .envir = parent.frame(2)) + quarts <- apply(characters, 2, function (char) { + tab <- table(split, char) + nCol <- dim(tab)[[2]] + if (nCol > 1L) { + # Consider the case + # split 0 1 2 + # FALSE 2 3 0 + # TRUE 0 4 2 + # + # Concordant quartets with bin i = 1 (character 0) and bin j = 2 + # (character 1) include any combination of the two taxa from + # [FALSE, 0], and the two taxa chosen from the four in [TRUE, 1], + # i.e. choose(2, 2) * choose(4, 2) + concordant <- sum(vapply(seq_len(nCol), function (i) { + inBinI <- tab[1, i] + iChoices <- choose(inBinI, 2) + sum(vapply(seq_len(nCol)[-i], function (j) { + inBinJ <- tab[2, j] + iChoices * choose(inBinJ, 2) + }, 1)) + }, 1)) + + # To be discordant, we must select a pair of taxa from TT and from FF; + # and the character states must group each T with an F + # e.g. T0 T1 F0 F1 + # T0 T1 F0 F2 would not be discordant - just uninformative + discordant <- sum(apply(combn(nCol, 2), 2, function (ij) prod(tab[, ij]))) + + # Only quartets that include two T and two F can be decisive + # Quartets must also include two pairs of characters + decisive <- concordant + discordant + + # Return the numerator and denominatory of equation 2 in + # Minh et al. 2020 + c(concordant, decisive) + } else { + c(0L, 0L) + } + }) + if (method == "split") { + quartSums <- rowSums(quarts) + ifelse(is.nan(quartSums[[2]]), NA_real_, quartSums[[1]] / quartSums[[2]]) + } else if (method == "sitemean") { + mean(ifelse(is.nan(quarts[2, ]), NA_real_, quarts[1, ] / quarts[2, ]), + na.rm = TRUE) } else { - c(0L, 0L) + stop("Unrecognized method: ", method) } - }) - if (isTRUE(weight)) { - quartSums <- rowSums(quarts) - ifelse(is.nan(quartSums[[2]]), NA_real_, quartSums[[1]] / quartSums[[2]]) - } else { - mean(ifelse(is.nan(quarts[2, ]), NA_real_, quarts[1, ] / quarts[2, ]), - na.rm = TRUE) - } - }), names(splits)) + }), names(splits)) + } } #' @importFrom TreeDist Entropy diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 85e38d090..8f3ad1df3 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -9,7 +9,7 @@ \alias{SharedPhylogeneticConcordance} \title{Calculate site concordance factor} \usage{ -QuartetConcordance(tree, dataset = NULL, weight = TRUE) +QuartetConcordance(tree, dataset = NULL, method = "split") ClusteringConcordance(tree, dataset) @@ -23,13 +23,14 @@ SharedPhylogeneticConcordance(tree, dataset) \item{tree}{A tree of class \code{\link{phylo}}.} \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class -\code{phyDat}, whose names correspond to the labels of any accompanying tree.} +\code{phyDat}, whose names correspond to the labels of any accompanying tree. +Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} -\item{weight}{Logical specifying whether to weight sites according to the -number of quartets they are decisive for.} +\item{method}{Character vector specifying which concordance measures to +calculate. See details section for available options.} } \description{ -The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure +The site concordance factor is a measure of the strength of support that the dataset presents for a given split in a tree. } @@ -42,21 +43,32 @@ But a quartet with characters \verb{0 0 1 1} is decisive, and is concordant with any tree that groups the first two leaves together to the exclusion of the second. -By default, the reported value weights each site by the number of quartets -it is decisive for. This value can be interpreted as the proportion of -all decisive quartets that are concordant with a split. -If \code{weight = FALSE}, the reported value is the mean of the concordance +The \code{methods} argument selects from different methods for quantifying the +concordance associated with a given edge of a tree. + +If \code{method = "split"} (the default), the concordance is calculated by +counting all quartets that are decisive for a split, i.e. for the split +$A|B$, any quartet that contains two leaves from set $A$ and two from set $B$. +This value can be interpreted as the proportion of all decisive quartets +that are concordant with a split. +If \code{method = "sitemean"}, the reported value is the mean of the concordance value for each site. Consider a split associated with two sites: one that is concordant with 25\% of 96 decisive quartets, and a second that is concordant with 75\% of 4 decisive quartets. -If \code{weight = TRUE}, the split concordance will be 24 + 3 / 96 + 4 = 27\%. -If \code{weight = FALSE}, the split concordance will be mean(75\%, 25\%) = 50\%. +\code{method = "split"} returns a value of 24 + 3 / 96 + 4 = 27\%. +\code{method = "sitemean"} returns mean(75\%, 25\%) = 50\%. + +\code{method = "minh"} uses the approach of +\insertCite{Minh2020;textual}{TreeSearch}. +Briefly, this interprets each edge as defining \emph{four} clades, and counts +the status of quartets that contain exactly one leaf from each of these +clades. This is a subset of the quartets considered by other methods. -\code{QuartetConcordance()} is computed exactly, using all quartets, where as -other implementations (e.g. IQ-TREE) follow +\code{QuartetConcordance()} is computed exactly, using all quartets. +Other implementations (e.g. IQ-TREE) follow \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample -of quartets for a faster, if potentially less accurate, computation. +of quartets for a faster computation, potentially at the expense of accuracy. \strong{NOTE:} These functions are under development. They are incompletely tested, and may change without notice. diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index b4ffdc786..d2b718e41 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -15,6 +15,46 @@ test_that("_Concordance() handles tip mismatch", { "No overlap between tree labels and dataset.") }) +test_that("QuartetConcordance(method = minh)", { + tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") + mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 1, 1, 0, 0, 0, 2, 2, 0, + 0, 0, 0, 1, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 1, 1, 2, 2, 0, + 0, 0, 1, 1, 2, 2, 3, 3, 0, + 0, 1, 2, 3, 0, 1, 2, 3, 0), 9, + dimnames = list(paste0("t", 1:9), NULL)) + dat <- MatrixToPhyDat(mataset) + + # plot(tree); nodelabels(); + minh10 <- rep(1:4, each = 2) + minh11 <- c(1, 2, 3, 3, rep(4, 4)) + minh15 <- c(rep(1, 4), rep(2, 2), 3, 4) + + # We must pick one leaf per row + # To be parsimony informative, we must pick two leaves from each of two columns + # To be concordant, leaves 1 and 2 must come from the same column + table(minh10, PhyDatToMatrix(dat[, 1])[1:8, 1]) + concordant10.1 <- 2*2*2*2 + informative10.1 <- 2*2*2*2 + + table(minh10, PhyDatToMatrix(dat[, 2])[1:8, 1]) + concordant10.2 <- 0 + informative10.2 <- 0 + + + table(minh15, PhyDatToMatrix(dat[, 2])[1:8, 1]) + concordant15.2 <- 2*2*1*1 + informative15.2 <- 2*2*1*1 + + table(minh11, PhyDatToMatrix(dat[, 2])[1:8, 1]) + concordant11.2 <- 0 + informative11.2 <- 2 + QuartetConcordance(tree, dat[, 1], method = "minh") + expect_equal(unname(QuartetConcordance(tree, dat[, 1])), rep(1, 5)) +}) + + test_that("QuartetConcordance() works", { tree <- BalancedTree(8) splits <- as.Splits(tree) @@ -29,6 +69,7 @@ test_that("QuartetConcordance() works", { "`dataset` must be a phyDat object") dat <- MatrixToPhyDat(mataset) expect_equal(unname(QuartetConcordance(tree, dat[, 1])), rep(1, 5)) + # plot(tree); nodelabels(); expect_equal(QuartetConcordance(tree, dat[, 2]), c("10" = 1/9, "11" = 0, "12" = 0, From 70aece81b8c56fe2f8eb338ff06fe306300f1bf3 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Tue, 11 Feb 2025 14:56:43 +0000 Subject: [PATCH 04/25] Emphatically failing --- tests/testthat/test-Concordance.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index d2b718e41..0c08c0c4f 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -18,15 +18,22 @@ test_that("_Concordance() handles tip mismatch", { test_that("QuartetConcordance(method = minh)", { tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, - 0, 1, 1, 0, 0, 0, 2, 2, 0, - 0, 0, 0, 1, 0, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 2, 2, 0, + 1, 0, 0, 0, 1, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 2, 3, 0, 1, 2, 3, 0), 9, - dimnames = list(paste0("t", 1:9), NULL)) + dimnames = list(letters[1:9], NULL)) dat <- MatrixToPhyDat(mataset) + expect_error(QuartetConcordance(tree, dat, method = "minh"), + "must be in preorder") + tree <- Preorder(tree) + # plot(tree); nodelabels(); + QuartetConcordance(tree, dat, method = "minh") minh10 <- rep(1:4, each = 2) minh11 <- c(1, 2, 3, 3, rep(4, 4)) minh15 <- c(rep(1, 4), rep(2, 2), 3, 4) From 3b2e8927af84211339fa65cb3af4021f010c2b39 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:00:18 +0000 Subject: [PATCH 05/25] Integrate into unit test --- inst/concordance.R | 22 ------------------- tests/testthat/test-Concordance.R | 35 ++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 23 deletions(-) delete mode 100644 inst/concordance.R diff --git a/inst/concordance.R b/inst/concordance.R deleted file mode 100644 index 7521facbc..000000000 --- a/inst/concordance.R +++ /dev/null @@ -1,22 +0,0 @@ -devtools::load_all() -use("ape") -use("TreeTools") - -labels <- letters[1:5] -tr <- PectinateTree(labels) -plot(tr) -nodelabels(adj = c(4, 0.5)) - char <- MatrixToPhyDat(cbind( - c(a = 0, b = 0, c = 0, d = 1, e = 1), # 100% concordant with both - c(0, 0, 1, 0, 1), # discordant - c(0, 1, 0, 0, 1) - )) -QuartetConcordance(tr, char, weight = FALSE) - -Conc <- lapply(labels, function(lab) { - quartet <- DropTip(tr, lab) - for (i in 1:3) { - message(setdiff(labels, lab), " char ", i, ": ", QuartetConcordance(quartet, char[, i])) - } -}) - diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 0c08c0c4f..60e0b7f0e 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -62,7 +62,40 @@ test_that("QuartetConcordance(method = minh)", { }) -test_that("QuartetConcordance() works", { +test_that("QuartetConcordance() calculates correct values - weighting", { + labels <- letters[1:5] + tr <- PectinateTree(labels) + # plot(tr) + # nodelabels(adj = c(4, 0.5)) + + char <- MatrixToPhyDat(cbind( + c(a = 0, b = 0, c = 0, d = 1, e = 1), + c(0, 0, 1, 0, 1), + c(0, 1, 0, 0, 1) + )) + + # Is quartet concordant with character? + # Quartets labelled by omitted leaf + C <- 1 # Concordant + D <- 0 # Discordant + I <- NaN # Irrelevant + expected_concordance <- cbind( + ch1 = c(a = C, b = C, c = C, d = I, e = I), + ch2 = c(D, D, I, C, I), + ch3 = c(D, I, D, D, I)) + + conc <- vapply(labels, function(lab) { + quartet <- DropTip(tr, lab) + vapply(1:3, function(i) QuartetConcordance(quartet, char[, i]), double(1)) + }, c(ch1 = NaN, ch2 = NaN, ch3 = NaN)) + expect_equal(conc, t(expected_concordance)) + + + QuartetConcordance(tr, char, weight = FALSE) + +}) + +test_that("QuartetConcordance() calculates correct values", { tree <- BalancedTree(8) splits <- as.Splits(tree) mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, From 0fb6938b158683f8952e9c5b79aaa032a44a3ccb Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:59:25 +0000 Subject: [PATCH 06/25] Multiple copies of characters --- tests/testthat/test-Concordance.R | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 60e0b7f0e..717e756b8 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -65,14 +65,13 @@ test_that("QuartetConcordance(method = minh)", { test_that("QuartetConcordance() calculates correct values - weighting", { labels <- letters[1:5] tr <- PectinateTree(labels) - # plot(tr) - # nodelabels(adj = c(4, 0.5)) + # plot(tr); nodelabels(adj = c(4, 0.5)) char <- MatrixToPhyDat(cbind( c(a = 0, b = 0, c = 0, d = 1, e = 1), c(0, 0, 1, 0, 1), c(0, 1, 0, 0, 1) - )) + )[, c(1:3, 1)]) # Is quartet concordant with character? # Quartets labelled by omitted leaf @@ -80,19 +79,34 @@ test_that("QuartetConcordance() calculates correct values - weighting", { D <- 0 # Discordant I <- NaN # Irrelevant expected_concordance <- cbind( - ch1 = c(a = C, b = C, c = C, d = I, e = I), + ch1 = c(a = C, b = C, c = C, d = I, e = I), ch2 = c(D, D, I, C, I), - ch3 = c(D, I, D, D, I)) + ch3 = c(D, I, D, D, I), + ch4 = c(a = C, b = C, c = C, d = I, e = I)) conc <- vapply(labels, function(lab) { quartet <- DropTip(tr, lab) - vapply(1:3, function(i) QuartetConcordance(quartet, char[, i]), double(1)) - }, c(ch1 = NaN, ch2 = NaN, ch3 = NaN)) + vapply(1:4, function(i) QuartetConcordance(quartet, char[, i]), double(1)) + }, c(ch1 = NaN, ch2 = NaN, ch3 = NaN, ch4 = NaN)) expect_equal(conc, t(expected_concordance)) + split_relevant <- list( + "8" = letters[3:5], + "9" = letters[1:3]) - QuartetConcordance(tr, char, weight = FALSE) + expect_equal( + QuartetConcordance(tr, char, weight = FALSE), # 8 = 0.75; 9 = 0.5 + vapply(split_relevant, function(splitQ) { + sum(expected_concordance[splitQ, ], na.rm = TRUE) / + sum(!is.nan(expected_concordance[splitQ, ])) + }, double(1)) + ) + # Same proportions, different number of chars -> weights + char10 <- char[, rep(1:4, each = 10)] + expect_equal( + QuartetConcordance(tr, char, weight = FALSE), + QuartetConcordance(tr, char10, weight = FALSE)) }) test_that("QuartetConcordance() calculates correct values", { From a69db66e12d6cfe02219546ed1b6803faa3f4267 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 11 Feb 2025 16:42:08 +0000 Subject: [PATCH 07/25] import --- NAMESPACE | 1 + R/Concordance.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 98ce38262..5cb839a65 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -142,6 +142,7 @@ importFrom(TreeTools,CladisticInfo) importFrom(TreeTools,CompatibleSplits) importFrom(TreeTools,ConstrainedNJ) importFrom(TreeTools,DescendantEdges) +importFrom(TreeTools,DescendantTips) importFrom(TreeTools,DoubleFactorial) importFrom(TreeTools,DropTip) importFrom(TreeTools,EdgeAncestry) diff --git a/R/Concordance.R b/R/Concordance.R index 1e93593a8..a32f68a6c 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -88,7 +88,7 @@ #' @importFrom ape keep.tip #' @importFrom cli cli_progress_bar cli_progress_update #' @importFrom utils combn -#' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels +#' @importFrom TreeTools as.Splits DescendantTips PhyDatToMatrix TipLabels #' @name SiteConcordance #' @family split support functions #' @export From 4fc794f694e19521e1d2738302cdb523d3f28a12 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 08:21:05 +0000 Subject: [PATCH 08/25] =?UTF-8?q?weight=E2=86=92method?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/testthat/test-Concordance.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 717e756b8..cce7d6a80 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -95,7 +95,7 @@ test_that("QuartetConcordance() calculates correct values - weighting", { "9" = letters[1:3]) expect_equal( - QuartetConcordance(tr, char, weight = FALSE), # 8 = 0.75; 9 = 0.5 + QuartetConcordance(tr, char, method = "split"), # 8 = 0.75; 9 = 0.5 vapply(split_relevant, function(splitQ) { sum(expected_concordance[splitQ, ], na.rm = TRUE) / sum(!is.nan(expected_concordance[splitQ, ])) @@ -105,8 +105,8 @@ test_that("QuartetConcordance() calculates correct values - weighting", { # Same proportions, different number of chars -> weights char10 <- char[, rep(1:4, each = 10)] expect_equal( - QuartetConcordance(tr, char, weight = FALSE), - QuartetConcordance(tr, char10, weight = FALSE)) + QuartetConcordance(tr, char, method = "split"), + QuartetConcordance(tr, char10, method = "split")) }) test_that("QuartetConcordance() calculates correct values", { From 4c23582b9d6e9bfee9b9a76f56bf80431c33469d Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 08:24:23 +0000 Subject: [PATCH 09/25] Test Minh concordance --- tests/testthat/test-Concordance.R | 58 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index cce7d6a80..6f6dc7fbe 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -24,7 +24,8 @@ test_that("QuartetConcordance(method = minh)", { 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 1, 1, 2, 2, 3, 3, 0, - 0, 1, 2, 3, 0, 1, 2, 3, 0), 9, + 0, 1, 2, 3, 0, 1, 2, 3, 0, + 0, 1, 2, 0, 0, 2, 2, 3, 0), 9, dimnames = list(letters[1:9], NULL)) dat <- MatrixToPhyDat(mataset) @@ -33,34 +34,35 @@ test_that("QuartetConcordance(method = minh)", { tree <- Preorder(tree) # plot(tree); nodelabels(); - QuartetConcordance(tree, dat, method = "minh") - minh10 <- rep(1:4, each = 2) - minh11 <- c(1, 2, 3, 3, rep(4, 4)) - minh15 <- c(rep(1, 4), rep(2, 2), 3, 4) - - # We must pick one leaf per row - # To be parsimony informative, we must pick two leaves from each of two columns - # To be concordant, leaves 1 and 2 must come from the same column - table(minh10, PhyDatToMatrix(dat[, 1])[1:8, 1]) - concordant10.1 <- 2*2*2*2 - informative10.1 <- 2*2*2*2 - - table(minh10, PhyDatToMatrix(dat[, 2])[1:8, 1]) - concordant10.2 <- 0 - informative10.2 <- 0 - - - table(minh15, PhyDatToMatrix(dat[, 2])[1:8, 1]) - concordant15.2 <- 2*2*1*1 - informative15.2 <- 2*2*1*1 - - table(minh11, PhyDatToMatrix(dat[, 2])[1:8, 1]) - concordant11.2 <- 0 - informative11.2 <- 2 - QuartetConcordance(tree, dat[, 1], method = "minh") - expect_equal(unname(QuartetConcordance(tree, dat[, 1])), rep(1, 5)) + expect_concordance <- function(i, expectation) { + expect_equal(QuartetConcordance(tree, dat[, i], method = "minh"), + setNames(expectation, 11:15)) + } + # Expectations computed by working through tables manually + expect_concordance(1, c(NaN, NaN, 1, NaN, NaN)) + expect_concordance(2, c(1, NaN, NaN, NaN, NaN)) + expect_concordance(3, c(1, NaN, NaN, NaN, 1)) + expect_concordance(4, c(0, 0, 1, NaN, NaN)) + expect_concordance(5, c(0, 0, 4 / 6, 0, 1)) + expect_concordance(6, rep(NaN, 5)) + expect_concordance(7, c(1, NaN, NaN, NaN, NaN)) + expect_concordance(8, c(NaN, NaN, 0, NaN, NaN)) + expect_concordance(9, c(NaN, 0, 1 / 2, 0, NaN)) + + # Values calculated from summing results above + expect_equal(unname(QuartetConcordance(tree, dat, method = "minh")), + c(5 + 3 + 1, + 0, + 12 + 8 + 4 + 1, + 0, + 4 + 3) / + c(5 + 3 + 4 + 3 + 1, + 4 + 3 + 2, + 12 + 8 + 6 + 2 + 2, + 6 + 2, + 4 + 3), + tolerance = 0.1) }) - test_that("QuartetConcordance() calculates correct values - weighting", { labels <- letters[1:5] From 43dc12eb62bbcf130cf978607b1064f9fa3e4dff Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 08:59:30 +0000 Subject: [PATCH 10/25] test vs IQTree --- data-raw/quartet-concordance-tests.R | 52 ++++++++++++++++++++++++++++ tests/testthat/test-Concordance.R | 7 ++-- 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 data-raw/quartet-concordance-tests.R diff --git a/data-raw/quartet-concordance-tests.R b/data-raw/quartet-concordance-tests.R new file mode 100644 index 000000000..c4386fb07 --- /dev/null +++ b/data-raw/quartet-concordance-tests.R @@ -0,0 +1,52 @@ +library("TreeTools") +tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") +mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 2, 2, 0, + 1, 0, 0, 0, 1, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 1, 1, 2, 2, 0, + 0, 0, 1, 1, 2, 2, 3, 3, 0, + 0, 1, 2, 3, 0, 1, 2, 3, 0, + 0, 1, 2, 0, 0, 2, 2, 3, 0), 9, + dimnames = list(letters[1:9], NULL)) +dat <- MatrixToPhyDat(mataset[1:8, ]) +iqtreeDir <- stop("Path to folder that contains iqtree2.exe") +treeFile <- paste0(iqtreeDir, "tree.nwk") +dataFile <- paste0(iqtreeDir, "data.nex") +charFile <- paste0(iqtreeDir, "data%i.nex") + +write.tree(tree, file = treeFile) + +# Expected output per character +vapply(1:9, function(i) { + charIFile <- sprintf(charFile, i) + write.nexus.data(dat[, i], file = charIFile, format = "STANDARD") + dataLines <- readLines(charIFile) + dataLines[[5]] <- sub("MISSING=? GAP=- INTERLEAVE=NO ", "", dataLines[[5]], + fixed = TRUE) + writeLines(dataLines, charIFile) + system2(paste0(iqtreeDir, "iqtree2.exe"), + paste("-t", treeFile, "-s", charIFile, + "--scf 100000", + "-nt 4" + ), stdout = NULL) + cf <- read.tree(paste0(treeFile, ".cf.tree")) + + as.numeric(cf[["node.label"]][-(1:2)]) +}, double(5)) + +# Expected output across entire dataset +write.nexus.data(dat, file = dataFile, format = "STANDARD") +dataLines <- readLines(dataFile) +dataLines[[5]] <- sub("MISSING=? GAP=- INTERLEAVE=NO ", "", dataLines[[5]], fixed = TRUE) +writeLines(dataLines, dataFile) +system2(paste0(iqtreeDir, "iqtree2.exe"), + paste("-t", treeFile, "-s", dataFile, + "--scf 1000000", + "-nt 4" + ), stdout = NULL) +cf <- read.tree(paste0(treeFile, ".cf.tree")) +plot(cf, show.node.label = TRUE, use.edge.length = FALSE) + +as.numeric(cf[["node.label"]][-(1:2)]) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 6f6dc7fbe..ae5c6401b 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -49,6 +49,7 @@ test_that("QuartetConcordance(method = minh)", { expect_concordance(8, c(NaN, NaN, 0, NaN, NaN)) expect_concordance(9, c(NaN, 0, 1 / 2, 0, NaN)) + expect_concordance(6, rep(NaN, 5)) # Values calculated from summing results above expect_equal(unname(QuartetConcordance(tree, dat, method = "minh")), c(5 + 3 + 1, @@ -60,8 +61,10 @@ test_that("QuartetConcordance(method = minh)", { 4 + 3 + 2, 12 + 8 + 6 + 2 + 2, 6 + 2, - 4 + 3), - tolerance = 0.1) + 4 + 3)) + + expect_equal(unname(QuartetConcordance(tree, dat, method = "minh")), + c(56.7, 0, 85.4, 0, 62.5), tolerance = 0.01) }) test_that("QuartetConcordance() calculates correct values - weighting", { From d5c5c9b11ae42d18b3a8aa93bc2d6015418f1aea Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 10:08:28 +0000 Subject: [PATCH 11/25] Lanfear2024 --- R/Concordance.R | 6 +++++- inst/REFERENCES.bib | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/R/Concordance.R b/R/Concordance.R index a32f68a6c..796e1bb29 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -29,7 +29,8 @@ #' `method = "sitemean"` returns mean(75%, 25%) = 50%. #' #' `method = "minh"` uses the approach of -#' \insertCite{Minh2020;textual}{TreeSearch}. +#' \insertCite{Minh2020;textual}{TreeSearch} to compute the site concordance +#' factor. #' Briefly, this interprets each edge as defining *four* clades, and counts #' the status of quartets that contain exactly one leaf from each of these #' clades. This is a subset of the quartets considered by other methods. @@ -49,6 +50,9 @@ #TODO More thought / explanation needed. #' #TODO Finally, `ProfileConcordance()` (to follow) +#' +#' For an overview of the use and interpretation of concordance factors, +#' see \insertCite{Lanfear2024}{TreeSearch}. #' #' **NOTE:** These functions are under development. They are incompletely #' tested, and may change without notice. diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 130091ffc..a14f1876f 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -311,6 +311,17 @@ @article{Kumar2012 number = {2} } +@article{Lanfear2024, + title = {The Meaning and Measure of Concordance Factors in Phylogenomics}, + author = {Lanfear, Robert and Hahn, Matthew W}, + year = {2024}, + journal = {Molecular Biology and Evolution}, + volume = {41}, + number = {11}, + pages = {msae214}, + doi = {10.1093/molbev/msae214} +} + @article{Maddison1991, title = {Null models for the number of evolutionary steps in a character on a phylogenetic tree}, author = {Maddison, Wayne P. and Slatkin, M}, From 44a2763e638f21766d8d7dccf23964fc7bcc95f5 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 10:08:46 +0000 Subject: [PATCH 12/25] Update SiteConcordance.Rd --- man/SiteConcordance.Rd | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 8f3ad1df3..53422a9dc 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -60,7 +60,8 @@ a second that is concordant with 75\% of 4 decisive quartets. \code{method = "sitemean"} returns mean(75\%, 25\%) = 50\%. \code{method = "minh"} uses the approach of -\insertCite{Minh2020;textual}{TreeSearch}. +\insertCite{Minh2020;textual}{TreeSearch} to compute the site concordance +factor. Briefly, this interprets each edge as defining \emph{four} clades, and counts the status of quartets that contain exactly one leaf from each of these clades. This is a subset of the quartets considered by other methods. @@ -70,6 +71,9 @@ Other implementations (e.g. IQ-TREE) follow \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample of quartets for a faster computation, potentially at the expense of accuracy. +For an overview of the use and interpretation of concordance factors, +see \insertCite{Lanfear2024}{TreeSearch}. + \strong{NOTE:} These functions are under development. They are incompletely tested, and may change without notice. Complete documentation and discussion will follow in due course. From 9ba63ffd0f73ede51e33f0fb54c39481b8cd1755 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 10:38:44 +0000 Subject: [PATCH 13/25] First bash at IQ-TREE impl Maths not right yet --- R/Concordance.R | 68 +++++++++++++++++++++++-------- tests/testthat/test-Concordance.R | 22 +++++++--- 2 files changed, 69 insertions(+), 21 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 796e1bb29..9dacc1112 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -34,6 +34,9 @@ #' Briefly, this interprets each edge as defining *four* clades, and counts #' the status of quartets that contain exactly one leaf from each of these #' clades. This is a subset of the quartets considered by other methods. +#' `method = "iqtree"` uses the \insertCite{Minh2020;textual}{TreeSearch} +#' approach as [implemented in IQ-TREE]( +#' https://github.com/iqtree/iqtree2/issues/415). #' #' #' `QuartetConcordance()` is computed exactly, using all quartets. @@ -113,7 +116,7 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split") { dataset <- dataset[tipLabels, drop = FALSE] characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) - if (method == "minh") { + if (method %in% c("minh", "iqtree")) { if (attr(tree, "order") != "preorder") { stop("Tree must be in preorder; try `tree <- Preorder(tree)`") } @@ -165,11 +168,9 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split") { tab <- table(q, char) nCol <- dim(tab)[[2]] if (nCol > 1L) { - # RULES: # We must pick one leaf per row - # To be parsimony informative, we must pick two leaves from each of - # two columns + # Only parsimony informative are decisive # To be concordant, leaves 1 and 2 must come from the same column # Example table: @@ -189,19 +190,54 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split") { prod(tab[1:2, j], tab[3:4, i]) })) - decisive <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { - # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 - i <- ij[[1]] - j <- ij[[2]] - sum(apply(combn(4, 2), 2, function(kl) { - # We can pick taxa AB, AC, AD, BC, BD, CD from column i, - # and the other pair from col j - pair1 <- kl - pair2 <- (1:4)[-kl] - prod(tab[pair1, i], tab[pair2, j]) + if (method == "minh") { + # To be parsimony informative, we must pick two leaves from each of + # two columns + + + decisive <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { + # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 + i <- ij[[1]] + j <- ij[[2]] + sum(apply(combn(4, 2), 2, function(kl) { + # We can pick taxa AB, AC, AD, BC, BD, CD from column i, + # and the other pair from col j + pair1 <- kl + pair2 <- (1:4)[-kl] + prod(tab[pair1, i], tab[pair2, j]) + })) })) - })) - + } else { + # Under the IQ-definition, to be parsimony informative, + # we must pick two leaves from one column. + # Hence 0012 is parsimony informative (!) + + parsInf <- sum(vapply(seq_len(nCol), function(i) { + sum(apply(combn(4, 2), 2, function(kl) { + # We can pick taxa AB, AC, AD, BC, BD, CD from column i, + # and the other pair from any other column + pair1 <- kl + prod(tab[pair1, i], rowSums(tab[-pair1, -i, drop = FALSE])) + })) + }, double(1))) + + # inclusion-exclusion principle: + countedTwice <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { + # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 + i <- ij[[1]] + j <- ij[[2]] + sum(apply(combn(4, 2), 2, function(kl) { + # We can pick taxa AB, AC, AD, BC, BD, CD from column i, + # and the other pair from col j + pair1 <- kl + pair2 <- (1:4)[-kl] + prod(tab[pair1, i], tab[pair2, j]) + })) + })) + + decisive <- parsInf - countedTwice + + } c(concordant, decisive) } else { c(0L, 0L) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index ae5c6401b..823f458a7 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -34,9 +34,11 @@ test_that("QuartetConcordance(method = minh)", { tree <- Preorder(tree) # plot(tree); nodelabels(); - expect_concordance <- function(i, expectation) { - expect_equal(QuartetConcordance(tree, dat[, i], method = "minh"), - setNames(expectation, 11:15)) + expect_concordance <- function(i, expectation, iq = "minh") { + expect_equal( + QuartetConcordance(tree, dat[, i], + method = ifelse(iq == "iq", "iqtree", "minh")), + setNames(expectation, 11:15)) } # Expectations computed by working through tables manually expect_concordance(1, c(NaN, NaN, 1, NaN, NaN)) @@ -49,7 +51,6 @@ test_that("QuartetConcordance(method = minh)", { expect_concordance(8, c(NaN, NaN, 0, NaN, NaN)) expect_concordance(9, c(NaN, 0, 1 / 2, 0, NaN)) - expect_concordance(6, rep(NaN, 5)) # Values calculated from summing results above expect_equal(unname(QuartetConcordance(tree, dat, method = "minh")), c(5 + 3 + 1, @@ -63,8 +64,19 @@ test_that("QuartetConcordance(method = minh)", { 6 + 2, 4 + 3)) - expect_equal(unname(QuartetConcordance(tree, dat, method = "minh")), + # Expectations computed by iq-tree + expect_concordance(iq = "iq", 1, c(NaN, NaN, 1, NaN, NaN)) + expect_concordance(iq = "iq", 2, c(1, NaN, NaN, NaN, NaN)) + expect_concordance(iq = "iq", 3, c(0.6, NaN, NaN, NaN, 0.5)) + expect_concordance(iq = "iq", 4, c(0, 0, 2 / 3, NaN, NaN)) + expect_concordance(iq = "iq", 5, c(0, 0, 1 / 3, 0, 3 / 8)) + expect_concordance(iq = "iq", 6, rep(NaN, 5)) + expect_concordance(iq = "iq", 7, c(1 / 5, NaN, NaN, NaN, NaN)) + expect_concordance(iq = "iq", 8, c(NaN, NaN, 0, NaN, NaN)) + expect_concordance(iq = "iq", 9, c(NaN, 0, 8.29, 0, NaN)) + expect_equal(unname(QuartetConcordance(tree, dat, method = "iqtree")), c(56.7, 0, 85.4, 0, 62.5), tolerance = 0.01) + }) test_that("QuartetConcordance() calculates correct values - weighting", { From cab7f64cb83feb0ea8eeabf20d7e92013230f758 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 11:28:58 +0000 Subject: [PATCH 14/25] Update SiteConcordance.Rd --- man/SiteConcordance.Rd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 53422a9dc..55113b651 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -65,6 +65,8 @@ factor. Briefly, this interprets each edge as defining \emph{four} clades, and counts the status of quartets that contain exactly one leaf from each of these clades. This is a subset of the quartets considered by other methods. +\code{method = "iqtree"} uses the \insertCite{Minh2020;textual}{TreeSearch} +approach as \href{https://github.com/iqtree/iqtree2/issues/415}{implemented in IQ-TREE}. \code{QuartetConcordance()} is computed exactly, using all quartets. Other implementations (e.g. IQ-TREE) follow From d32d3349612b39dd3c7ca87304a0a4aee2f87631 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 11:29:08 +0000 Subject: [PATCH 15/25] Correct tests --- tests/testthat/test-Concordance.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 823f458a7..69ed1f3ae 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -65,17 +65,17 @@ test_that("QuartetConcordance(method = minh)", { 4 + 3)) # Expectations computed by iq-tree - expect_concordance(iq = "iq", 1, c(NaN, NaN, 1, NaN, NaN)) - expect_concordance(iq = "iq", 2, c(1, NaN, NaN, NaN, NaN)) - expect_concordance(iq = "iq", 3, c(0.6, NaN, NaN, NaN, 0.5)) - expect_concordance(iq = "iq", 4, c(0, 0, 2 / 3, NaN, NaN)) + expect_concordance(iq = "iq", 1, c(0, 0, 1, 0, 0)) + expect_concordance(iq = "iq", 2, c(1, 0, 0, 0, 0)) + expect_concordance(iq = "iq", 3, c(0.6, 0, 0, 0, 0.5)) + expect_concordance(iq = "iq", 4, c(0, 0, 2 / 3, 0, 0)) expect_concordance(iq = "iq", 5, c(0, 0, 1 / 3, 0, 3 / 8)) - expect_concordance(iq = "iq", 6, rep(NaN, 5)) - expect_concordance(iq = "iq", 7, c(1 / 5, NaN, NaN, NaN, NaN)) - expect_concordance(iq = "iq", 8, c(NaN, NaN, 0, NaN, NaN)) - expect_concordance(iq = "iq", 9, c(NaN, 0, 8.29, 0, NaN)) + expect_concordance(iq = "iq", 6, c(0, 0, 0, 0, 0)) + expect_concordance(iq = "iq", 7, c(1 / 5, 0, 0, 0, 0)) + expect_concordance(iq = "iq", 8, rep(0, 5)) + expect_concordance(iq = "iq", 9, c(0, 0, 1 / 12, 0, 0)) expect_equal(unname(QuartetConcordance(tree, dat, method = "iqtree")), - c(56.7, 0, 85.4, 0, 62.5), tolerance = 0.01) + c(56.7, 0, 85.4, 0, 62.5) / 100, tolerance = 0.01) }) From 85693aa817a83374d505a6b9b9ebc5469ca3c497 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 11:29:21 +0000 Subject: [PATCH 16/25] Even simpler (!?) --- R/Concordance.R | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 9dacc1112..692ee80f0 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -208,34 +208,12 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split") { })) })) } else { - # Under the IQ-definition, to be parsimony informative, - # we must pick two leaves from one column. - # Hence 0012 is parsimony informative (!) + # IQ-TREE seems to use "variant" in place of "parsimony informative" - parsInf <- sum(vapply(seq_len(nCol), function(i) { - sum(apply(combn(4, 2), 2, function(kl) { - # We can pick taxa AB, AC, AD, BC, BD, CD from column i, - # and the other pair from any other column - pair1 <- kl - prod(tab[pair1, i], rowSums(tab[-pair1, -i, drop = FALSE])) - })) - }, double(1))) - - # inclusion-exclusion principle: - countedTwice <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { - # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 - i <- ij[[1]] - j <- ij[[2]] - sum(apply(combn(4, 2), 2, function(kl) { - # We can pick taxa AB, AC, AD, BC, BD, CD from column i, - # and the other pair from col j - pair1 <- kl - pair2 <- (1:4)[-kl] - prod(tab[pair1, i], tab[pair2, j]) - })) - })) + nPatterns <- prod(rowSums(tab)) + allSame <- sum(apply(tab, 2, prod)) - decisive <- parsInf - countedTwice + decisive <- nPatterns# - allSame } c(concordant, decisive) From a6ad8f8fa999a32ce87d1aa871a055d09a30bb27 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 13 Feb 2025 12:31:39 +0000 Subject: [PATCH 17/25] Random quartet sampling required MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Still doesn't quite align – why not? --- R/Concordance.R | 133 +++++++++++++++++------------- R/data_manipulation.R | 2 +- tests/testthat/test-Concordance.R | 47 +++++++++-- 3 files changed, 120 insertions(+), 62 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 692ee80f0..0298930ae 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -100,7 +100,8 @@ #' @family split support functions #' @export # TODO support `method = c("split", "sitemean", "minh")` -QuartetConcordance <- function (tree, dataset = NULL, method = "split") { +QuartetConcordance <- function (tree, dataset = NULL, method = "split", + n = 250) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -116,7 +117,7 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split") { dataset <- dataset[tipLabels, drop = FALSE] characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) - if (method %in% c("minh", "iqtree")) { + if (method %in% c("minh", "minhq", "iqtree")) { if (attr(tree, "order") != "preorder") { stop("Tree must be in preorder; try `tree <- Preorder(tree)`") } @@ -162,68 +163,88 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split") { cli_progress_bar(name = "Quartet concordance", total = dim(quarters)[[2]]) on.exit(cli_progress_done(.envir = parent.frame(2))) - concDeci <- apply(quarters, 2, function (q) { - #cli_progress_update(1, .envir = parent.frame(2)) - quarts <- apply(characters, 2, function (char) { - tab <- table(q, char) - nCol <- dim(tab)[[2]] - if (nCol > 1L) { - # RULES: - # We must pick one leaf per row - # Only parsimony informative are decisive - # To be concordant, leaves 1 and 2 must come from the same column - - # Example table: - # q 0 1 2 - # A 2 3 0 - # B 2 3 2 - # C 0 1 2 - # D 2 1 2 - # - - concordant <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { - i <- ij[[1]] - j <- ij[[2]] - # Taxa from A and B from column i, C and D from j - prod(tab[1:2, i], tab[3:4, j]) + - # Taxa from A and B from column j, C and D from i - prod(tab[1:2, j], tab[3:4, i]) - })) - - if (method == "minh") { - # To be parsimony informative, we must pick two leaves from each of - # two columns + if (method == "minh") { + sCF <- apply(quarters, 2, function(abcd) { + resample <- function(x) x[sample.int(length(x), 1)] + leaves <- replicate(n, { + vapply(0:3, function(q) resample(which(abcd == q)), integer(1)) + }) + cfQ <- apply(leaves, 2, function(q) { + qCh <- `mode<-`(characters[q, , drop = FALSE], "numeric") + concordant <- sum(qCh[1, ] == qCh[2, ] & + qCh[2, ] != qCh[3, ] & + qCh[3, ] == qCh[4, ]) + decisive <- dim(qCh)[[2]] + decisive <- sum(colSums(apply(qCh + 1, 2, tabulate, 4) >= 2) > 1) + decisive <- sum(colSums(apply(qCh + 1, 2, tabulate, 4) >= 2) > 0) + concordant / decisive + }) + mean(cfQ, na.rm = TRUE) + }) + } else { + concDeci <- apply(quarters, 2, function (q) { + #cli_progress_update(1, .envir = parent.frame(2)) + quarts <- apply(characters, 2, function (char) { + tab <- table(q, char) + nCol <- dim(tab)[[2]] + if (nCol > 1L) { + # RULES: + # We must pick one leaf per row + # Only parsimony informative are decisive + # To be concordant, leaves 1 and 2 must come from the same column + # Example table: + # q 0 1 2 + # A 2 3 0 + # B 2 3 2 + # C 0 1 2 + # D 2 1 2 + # - decisive <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { - # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 + concordant <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { i <- ij[[1]] j <- ij[[2]] - sum(apply(combn(4, 2), 2, function(kl) { - # We can pick taxa AB, AC, AD, BC, BD, CD from column i, - # and the other pair from col j - pair1 <- kl - pair2 <- (1:4)[-kl] - prod(tab[pair1, i], tab[pair2, j]) - })) + # Taxa from A and B from column i, C and D from j + prod(tab[1:2, i], tab[3:4, j]) + + # Taxa from A and B from column j, C and D from i + prod(tab[1:2, j], tab[3:4, i]) })) - } else { - # IQ-TREE seems to use "variant" in place of "parsimony informative" - - nPatterns <- prod(rowSums(tab)) - allSame <- sum(apply(tab, 2, prod)) - - decisive <- nPatterns# - allSame + if (method == "minhq") { + # To be parsimony informative, we must pick two leaves from each of + # two columns + + + decisive <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { + # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 + i <- ij[[1]] + j <- ij[[2]] + sum(apply(combn(4, 2), 2, function(kl) { + # We can pick taxa AB, AC, AD, BC, BD, CD from column i, + # and the other pair from col j + pair1 <- kl + pair2 <- (1:4)[-kl] + prod(tab[pair1, i], tab[pair2, j]) + })) + })) + } else { + # IQ-TREE seems to use "variant" in place of "parsimony informative" + + nPatterns <- prod(rowSums(tab)) + allSame <- sum(apply(tab, 2, prod)) + + decisive <- nPatterns# - allSame + + } + c(concordant, decisive) + } else { + c(0L, 0L) } - c(concordant, decisive) - } else { - c(0L, 0L) - } + }) + rowSums(quarts) }) - rowSums(quarts) - }) - concDeci[1, ] / concDeci[2, ] + concDeci[1, ] / concDeci[2, ] + } } else { splits <- as.Splits(tree, dataset) logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), diff --git a/R/data_manipulation.R b/R/data_manipulation.R index d3d4b8b4c..109659ed3 100644 --- a/R/data_manipulation.R +++ b/R/data_manipulation.R @@ -22,7 +22,7 @@ #' #' - `informative`: logical specifying which characters contain any #' phylogenetic information. -#' +#' #' - `bootstrap`: The character vector #' \code{c("info.amounts", "split.sizes")}, indicating attributes to sample #' when bootstrapping the dataset (e.g. in Ratchet searches). diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 69ed1f3ae..0030176aa 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -15,7 +15,7 @@ test_that("_Concordance() handles tip mismatch", { "No overlap between tree labels and dataset.") }) -test_that("QuartetConcordance(method = minh)", { +test_that("QuartetConcordance(method = minhq)", { tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, @@ -29,15 +29,15 @@ test_that("QuartetConcordance(method = minh)", { dimnames = list(letters[1:9], NULL)) dat <- MatrixToPhyDat(mataset) - expect_error(QuartetConcordance(tree, dat, method = "minh"), + expect_error(QuartetConcordance(tree, dat, method = "minhq"), "must be in preorder") tree <- Preorder(tree) # plot(tree); nodelabels(); - expect_concordance <- function(i, expectation, iq = "minh") { + expect_concordance <- function(i, expectation, iq = "minhq") { expect_equal( QuartetConcordance(tree, dat[, i], - method = ifelse(iq == "iq", "iqtree", "minh")), + method = ifelse(iq == "iq", "iqtree", "minhq")), setNames(expectation, 11:15)) } # Expectations computed by working through tables manually @@ -52,7 +52,7 @@ test_that("QuartetConcordance(method = minh)", { expect_concordance(9, c(NaN, 0, 1 / 2, 0, NaN)) # Values calculated from summing results above - expect_equal(unname(QuartetConcordance(tree, dat, method = "minh")), + expect_equal(unname(QuartetConcordance(tree, dat, method = "minhq")), c(5 + 3 + 1, 0, 12 + 8 + 4 + 1, @@ -76,6 +76,43 @@ test_that("QuartetConcordance(method = minh)", { expect_concordance(iq = "iq", 9, c(0, 0, 1 / 12, 0, 0)) expect_equal(unname(QuartetConcordance(tree, dat, method = "iqtree")), c(56.7, 0, 85.4, 0, 62.5) / 100, tolerance = 0.01) +}) + +test_that("QuartetConcordance(method = minh)", { + tree <- Preorder( + ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));")) + mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 2, 2, 0, + 1, 0, 0, 0, 1, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 1, 1, 2, 2, 0, + 0, 0, 1, 1, 2, 2, 3, 3, 0, + 0, 1, 2, 3, 0, 1, 2, 3, 0, + 0, 1, 2, 0, 0, 2, 2, 3, 0), 9, + dimnames = list(letters[1:9], NULL)) + dat <- MatrixToPhyDat(mataset) + + # plot(tree); nodelabels(); + expect_concordance <- function(i, expectation) { + expect_equal( + QuartetConcordance(tree, dat[, i], method = "minh", n = 1250), + setNames(expectation, 11:15), tolerance = 0.05) + } + + # Expectations computed by iq-tree + expect_concordance(1, c(0, 0, 1, 0, 0)) + expect_concordance(2, c(1, 0, 0, 0, 0)) + expect_concordance(3, c(0.6, 0, 0, 0, 0.5)) + expect_concordance(4, c(0, 0, 2 / 3, 0, 0)) + expect_concordance(5, c(0, 0, 1 / 3, 0, 3 / 8)) + expect_concordance(6, c(0, 0, 0, 0, 0)) + expect_concordance(7, c(1 / 5, 0, 0, 0, 0)) + expect_concordance(8, rep(0, 5)) + expect_concordance(9, c(0, 0, 1 / 12, 0, 0)) + expect_equal(tolerance = 0.05, + unname(QuartetConcordance(tree, dat, method = "minh", n = 1234)), + c(56.7, 0, 85.4, 0, 62.5) / 100) }) From ad2079d0e164f2c4be4884e40e3be81ac16505bf Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Mon, 17 Feb 2025 10:55:09 +0000 Subject: [PATCH 18/25] Check multifurcating trees --- tests/testthat/test-Concordance.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 0030176aa..9e0b37e92 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -76,6 +76,12 @@ test_that("QuartetConcordance(method = minhq)", { expect_concordance(iq = "iq", 9, c(0, 0, 1 / 12, 0, 0)) expect_equal(unname(QuartetConcordance(tree, dat, method = "iqtree")), c(56.7, 0, 85.4, 0, 62.5) / 100, tolerance = 0.01) + + collapsed <- CollapseNode(tree, c(12, 13)) + expect_equal( + QuartetConcordance(collapsed, dat, method = "iqtree"), + QuartetConcordance(tree, dat, method = "iqtree")[-c(2, 3)] + ) }) test_that("QuartetConcordance(method = minh)", { @@ -114,6 +120,11 @@ test_that("QuartetConcordance(method = minh)", { unname(QuartetConcordance(tree, dat, method = "minh", n = 1234)), c(56.7, 0, 85.4, 0, 62.5) / 100) + collapsed <- CollapseNode(tree, c(12, 13)) + expect_equal(tolerance = 0.05, + QuartetConcordance(collapsed, dat, method = "minh", n = 1234), + QuartetConcordance(tree, dat, method = "minh", n = 1234)[-(2:3)] + ) }) test_that("QuartetConcordance() calculates correct values - weighting", { From 520e51dec681ddb52752d885b67c09b54209224a Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:09:09 +0000 Subject: [PATCH 19/25] Support multifurcating trees --- R/Concordance.R | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 0298930ae..fcbd5c5f9 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -125,13 +125,13 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split", parent <- edge[, 1] child <- edge[, 2] nTip <- NTip(tree) - nodes <- nTip + seq_len(nTip - 1) + nodes <- min(parent):max(parent) parentEdge <- match(nodes, child) descended <- DescendantTips(parent, child) - childEdges <- vapply(nodes, function(x) - which(parent == x), integer(2)) - daughters <- matrix(child[childEdges], 2) - rootNode <- nTip + 1 + childEdges <- lapply(nodes, function(x) which(parent == x)) + daughters <- lapply(childEdges, function(e) child[e]) + + rootNode <- nTip + which(is.na(parentEdge)) rootChildren <- child[parent == rootNode] nonRootChildren <- setdiff(nodes, c(rootNode, rootChildren)) @@ -140,7 +140,7 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split", quarters <- cbind( if (all(rootChildren > nTip)) { ret <- integer(nTip) - groups <- as.numeric(childEdges[, rootChildren - nTip]) + groups <- childEdges[[rootChildren - nTip]] for (i in 0:3) { ret[descended[groups[[i + 1]], ]] <- i } @@ -149,11 +149,13 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split", `dimnames<-`(vapply(nonRootChildren, function(n) { ret <- integer(nTip) if (parent[[parentEdge[[n - nTip]]]] == rootNode) { - + stop("Ought something to be here?") } - ret[apply(descended[childEdges[, parent[parentEdge[n - nTip]] - nTip], ], - 2, any)] <- 1L - ce <- childEdges[, n - nTip] + ret[apply( + descended[childEdges[[parent[parentEdge[n - nTip]] - nTip]], ], + 2, + any)] <- 1L + ce <- childEdges[[n - nTip]] # Overprint with own children ret[descended[ce[[1]], ]] <- 2L ret[descended[ce[[2]], ]] <- 3L From bdacf87fa8dc8831ea78401abcf072fe729e56dc Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:23:55 +0000 Subject: [PATCH 20/25] names will differ --- tests/testthat/test-Concordance.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 9e0b37e92..eb0367b32 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -79,8 +79,8 @@ test_that("QuartetConcordance(method = minhq)", { collapsed <- CollapseNode(tree, c(12, 13)) expect_equal( - QuartetConcordance(collapsed, dat, method = "iqtree"), - QuartetConcordance(tree, dat, method = "iqtree")[-c(2, 3)] + unname(QuartetConcordance(collapsed, dat, method = "iqtree")), + unname(QuartetConcordance(tree, dat, method = "iqtree"))[-c(2, 3)] ) }) From d5e487e43be727bc45e5840e7a36d99a791c026e Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:24:03 +0000 Subject: [PATCH 21/25] Standalone quartering function --- R/Concordance.R | 91 +++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index fcbd5c5f9..5c93e62b5 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -118,50 +118,7 @@ QuartetConcordance <- function (tree, dataset = NULL, method = "split", characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) if (method %in% c("minh", "minhq", "iqtree")) { - if (attr(tree, "order") != "preorder") { - stop("Tree must be in preorder; try `tree <- Preorder(tree)`") - } - edge <- tree[["edge"]] - parent <- edge[, 1] - child <- edge[, 2] - nTip <- NTip(tree) - nodes <- min(parent):max(parent) - parentEdge <- match(nodes, child) - descended <- DescendantTips(parent, child) - childEdges <- lapply(nodes, function(x) which(parent == x)) - daughters <- lapply(childEdges, function(e) child[e]) - - rootNode <- nTip + which(is.na(parentEdge)) - rootChildren <- child[parent == rootNode] - nonRootChildren <- setdiff(nodes, c(rootNode, rootChildren)) - - # TODO pull out into a standalone function, perhaps in TreeTools - # nodes[-1] omits the root node, which defines no quartet - quarters <- cbind( - if (all(rootChildren > nTip)) { - ret <- integer(nTip) - groups <- childEdges[[rootChildren - nTip]] - for (i in 0:3) { - ret[descended[groups[[i + 1]], ]] <- i - } - `dimnames<-`(cbind(ret), list(TipLabels(tree), rootChildren[[1]])) - }, - `dimnames<-`(vapply(nonRootChildren, function(n) { - ret <- integer(nTip) - if (parent[[parentEdge[[n - nTip]]]] == rootNode) { - stop("Ought something to be here?") - } - ret[apply( - descended[childEdges[[parent[parentEdge[n - nTip]] - nTip]], ], - 2, - any)] <- 1L - ce <- childEdges[[n - nTip]] - # Overprint with own children - ret[descended[ce[[1]], ]] <- 2L - ret[descended[ce[[2]], ]] <- 3L - ret - }, integer(nTip)), list(TipLabels(tree), nonRootChildren)) - ) + quarters <- .TreeQuarters(tree) cli_progress_bar(name = "Quartet concordance", total = dim(quarters)[[2]]) on.exit(cli_progress_done(.envir = parent.frame(2))) @@ -397,6 +354,52 @@ PhylogeneticConcordance <- function (tree, dataset) { support[, 1] / support[, 2] } +.TreeQuarters <- function(tree) { + if (attr(tree, "order") != "preorder") { + stop("Tree must be in preorder; try `tree <- Preorder(tree)`") + } + edge <- tree[["edge"]] + parent <- edge[, 1] + child <- edge[, 2] + nTip <- NTip(tree) + nodes <- min(parent):max(parent) + parentEdge <- match(nodes, child) + descended <- DescendantTips(parent, child) + childEdges <- lapply(nodes, function(x) which(parent == x)) + daughters <- lapply(childEdges, function(e) child[e]) + + rootNode <- nTip + which(is.na(parentEdge)) + rootChildren <- child[parent == rootNode] + nonRootChildren <- setdiff(nodes, c(rootNode, rootChildren)) + + # Return: + cbind( + if (all(rootChildren > nTip)) { + ret <- integer(nTip) + groups <- childEdges[[rootChildren - nTip]] + for (i in 0:3) { + ret[descended[groups[[i + 1]], ]] <- i + } + `dimnames<-`(cbind(ret), list(TipLabels(tree), rootChildren[[1]])) + }, + `dimnames<-`(vapply(nonRootChildren, function(n) { + ret <- integer(nTip) + if (parent[[parentEdge[[n - nTip]]]] == rootNode) { + stop("Ought something to be here?") + } + ret[apply( + descended[childEdges[[parent[parentEdge[n - nTip]] - nTip]], ], + 2, + any)] <- 1L + ce <- childEdges[[n - nTip]] + # Overprint with own children + ret[descended[ce[[1]], ]] <- 2L + ret[descended[ce[[2]], ]] <- 3L + ret + }, integer(nTip)), list(TipLabels(tree), nonRootChildren)) + ) +} + #' @rdname SiteConcordance # Mutual clustering information of each split with the split implied by each character #' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo From 020546ace928e99891d3e09e641b4bb59451dea0 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Mon, 17 Feb 2025 12:14:54 +0000 Subject: [PATCH 22/25] Test quarters Not clear what should be expected in polytomy though! --- tests/testthat/test-Concordance.R | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index eb0367b32..012991acf 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -15,8 +15,27 @@ test_that("_Concordance() handles tip mismatch", { "No overlap between tree labels and dataset.") }) +test_that(".TreeQuarters() gives complete list", { + tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") + coll <- CollapseNode(Preorder(tree), 12:13) + expect_error(.TreeQuarters(tree), "must be in preorder") + tree <- Preorder(tree) + + expect_equal( + .TreeQuarters(tree), + cbind("11" = c(a = 0, b = 1, c = 2, d = 3, e = 3, f = 3, g = 3, h = 3), + "12" = c(0, 0, 1, 2, 3, 3, 3, 3), + "13" = c(0, 0, 0, 1, 2, 2, 3, 3), + "14" = c(rep(0, 4), 2, 3, 1, 1), + "15" = c(rep(0, 4), 1, 1, 2, 3)) + ) + expect_equal(unname(.TreeQuarters(coll)), + unname(.TreeQuarters(Preorder(tree))[, c(1, 4, 5)])) +}) + test_that("QuartetConcordance(method = minhq)", { tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") + tree <- Preorder(tree) mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, @@ -29,10 +48,6 @@ test_that("QuartetConcordance(method = minhq)", { dimnames = list(letters[1:9], NULL)) dat <- MatrixToPhyDat(mataset) - expect_error(QuartetConcordance(tree, dat, method = "minhq"), - "must be in preorder") - tree <- Preorder(tree) - # plot(tree); nodelabels(); expect_concordance <- function(i, expectation, iq = "minhq") { expect_equal( From fa3f5ad2b95a395e66efa8f5e4cf9da3a853df94 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Fri, 4 Apr 2025 15:46:23 +0100 Subject: [PATCH 23/25] Read as PhyDat --- man-roxygen/datasetParam.r | 2 +- man/AdditionTree.Rd | 2 +- man/CharacterLength.Rd | 2 +- man/ConcordantInformation.Rd | 2 +- man/Consistency.Rd | 2 +- man/MaximizeParsimony.Rd | 2 +- man/PlotCharacter.Rd | 2 +- man/SiteConcordance.Rd | 4 ++-- man/SuccessiveApproximations.Rd | 2 +- man/TaxonInfluence.Rd | 2 +- man/TreeLength.Rd | 2 +- man/TreeSearch.Rd | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) diff --git a/man-roxygen/datasetParam.r b/man-roxygen/datasetParam.r index 5038c6a27..81bc53e03 100644 --- a/man-roxygen/datasetParam.r +++ b/man-roxygen/datasetParam.r @@ -1,4 +1,4 @@ #' @param dataset A phylogenetic data matrix of \pkg{phangorn} class #' \code{phyDat}, whose names correspond to the labels of any accompanying tree. -#' Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}. +#' Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}. # Defined in TreeTools. Please propagate any changes there. diff --git a/man/AdditionTree.Rd b/man/AdditionTree.Rd index 88fe9b76c..0fbddc4b9 100644 --- a/man/AdditionTree.Rd +++ b/man/AdditionTree.Rd @@ -9,7 +9,7 @@ AdditionTree(dataset, concavity = Inf, constraint, sequence) \arguments{ \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{concavity}{Numeric specifying concavity constant for implied step weighting. diff --git a/man/CharacterLength.Rd b/man/CharacterLength.Rd index 5f217f6c5..1d73c6f5a 100644 --- a/man/CharacterLength.Rd +++ b/man/CharacterLength.Rd @@ -17,7 +17,7 @@ FastCharacterLength(tree, dataset) \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{compress}{Logical specifying whether to retain the compression of a \code{phyDat} object or to return a vector specifying to each individual diff --git a/man/ConcordantInformation.Rd b/man/ConcordantInformation.Rd index b1d10935d..8cb5ce3de 100644 --- a/man/ConcordantInformation.Rd +++ b/man/ConcordantInformation.Rd @@ -17,7 +17,7 @@ ConcordantInfo(tree, dataset) \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} } \value{ \code{ConcordantInformation()} returns a named vector with elements: diff --git a/man/Consistency.Rd b/man/Consistency.Rd index 2af757710..7720a5928 100644 --- a/man/Consistency.Rd +++ b/man/Consistency.Rd @@ -9,7 +9,7 @@ Consistency(dataset, tree, compress = FALSE) \arguments{ \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{tree}{A tree of class \code{\link{phylo}}.} diff --git a/man/MaximizeParsimony.Rd b/man/MaximizeParsimony.Rd index 8de3f5d24..5256a440d 100644 --- a/man/MaximizeParsimony.Rd +++ b/man/MaximizeParsimony.Rd @@ -48,7 +48,7 @@ EasyTreesy() \arguments{ \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{tree}{(optional) A bifurcating tree of class \code{\link{phylo}}, containing only the tips listed in \code{dataset}, from which the search diff --git a/man/PlotCharacter.Rd b/man/PlotCharacter.Rd index 094859f81..c7b917c88 100644 --- a/man/PlotCharacter.Rd +++ b/man/PlotCharacter.Rd @@ -26,7 +26,7 @@ PlotCharacter( \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{char}{Index of character to plot.} diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 55113b651..40f0269bd 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -9,7 +9,7 @@ \alias{SharedPhylogeneticConcordance} \title{Calculate site concordance factor} \usage{ -QuartetConcordance(tree, dataset = NULL, method = "split") +QuartetConcordance(tree, dataset = NULL, method = "split", n = 250) ClusteringConcordance(tree, dataset) @@ -24,7 +24,7 @@ SharedPhylogeneticConcordance(tree, dataset) \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{method}{Character vector specifying which concordance measures to calculate. See details section for available options.} diff --git a/man/SuccessiveApproximations.Rd b/man/SuccessiveApproximations.Rd index 774dc45b7..d87888e28 100644 --- a/man/SuccessiveApproximations.Rd +++ b/man/SuccessiveApproximations.Rd @@ -26,7 +26,7 @@ SuccessiveWeights(tree, dataset) \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{outgroup}{if not NULL, taxa on which the tree should be rooted} diff --git a/man/TaxonInfluence.Rd b/man/TaxonInfluence.Rd index fdfe41d85..ef7510e63 100644 --- a/man/TaxonInfluence.Rd +++ b/man/TaxonInfluence.Rd @@ -19,7 +19,7 @@ TaxonInfluence( \arguments{ \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{tree}{Optimal tree or summary tree (of class "phylo") or list of trees (of class "list" or "multiPhylo") against which results should be evaluated. diff --git a/man/TreeLength.Rd b/man/TreeLength.Rd index 075a2b63c..1481ca255 100644 --- a/man/TreeLength.Rd +++ b/man/TreeLength.Rd @@ -31,7 +31,7 @@ uniformly sampled.} \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{concavity}{Determines the degree to which extra steps beyond the first are penalized. Specify a numeric value to use implied weighting diff --git a/man/TreeSearch.Rd b/man/TreeSearch.Rd index 33b0658ff..84680758a 100644 --- a/man/TreeSearch.Rd +++ b/man/TreeSearch.Rd @@ -56,7 +56,7 @@ DoNothing(...) \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class \code{phyDat}, whose names correspond to the labels of any accompanying tree. -Perhaps load into R using \code{\link[TreeTools]{ReadCharacters}}.} +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}.} \item{TreeScorer}{function to score a given tree. The function will be passed three parameters, corresponding to the From 0827e86296f22f384ef3e1be6ec70bd68802c382 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:14:43 +0100 Subject: [PATCH 24/25] / Link https://github.com/iqtree/iqtree2/issues/415 --- data-raw/quartet-concordance-tests.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/data-raw/quartet-concordance-tests.R b/data-raw/quartet-concordance-tests.R index c4386fb07..24bae23dd 100644 --- a/data-raw/quartet-concordance-tests.R +++ b/data-raw/quartet-concordance-tests.R @@ -11,14 +11,15 @@ mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 3, 0), 9, dimnames = list(letters[1:9], NULL)) dat <- MatrixToPhyDat(mataset[1:8, ]) -iqtreeDir <- stop("Path to folder that contains iqtree2.exe") -treeFile <- paste0(iqtreeDir, "tree.nwk") -dataFile <- paste0(iqtreeDir, "data.nex") -charFile <- paste0(iqtreeDir, "data%i.nex") +iqtreeDir <- stop("Path to folder that contains iqtree3.exe") +treeFile <- paste0(iqtreeDir, "/tree.nwk") +dataFile <- paste0(iqtreeDir, "/data.nex") +charFile <- paste0(iqtreeDir, "/data%i.nex") write.tree(tree, file = treeFile) # Expected output per character +# See: https://github.com/iqtree/iqtree2/issues/415 vapply(1:9, function(i) { charIFile <- sprintf(charFile, i) write.nexus.data(dat[, i], file = charIFile, format = "STANDARD") @@ -26,7 +27,7 @@ vapply(1:9, function(i) { dataLines[[5]] <- sub("MISSING=? GAP=- INTERLEAVE=NO ", "", dataLines[[5]], fixed = TRUE) writeLines(dataLines, charIFile) - system2(paste0(iqtreeDir, "iqtree2.exe"), + system2(paste0(iqtreeDir, "/iqtree3.exe"), paste("-t", treeFile, "-s", charIFile, "--scf 100000", "-nt 4" From d25abefbe9a2aa45d244ca36d5737952e9bbc7b8 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:21:53 +0100 Subject: [PATCH 25/25] Version: 1.5.1.9007 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 929fe0c13..f113bc582 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeSearch Title: Phylogenetic Analysis with Discrete Character Data -Version: 1.5.1.9006 +Version: 1.5.1.9007 Authors@R: c( person( "Martin R.", 'Smith',