diff --git a/DESCRIPTION b/DESCRIPTION index 144fa7548..464913cb8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ License: GPL (>= 3) Copyright: Incorporates C/C++ code from Morphy Phylogenetic Library by Martin Brazeau (GPL3) -Description: Reconstruct phylogenetic trees from discrete data. +Description: Reconstruct phylogenetic trees from discrete/morphological data. Inapplicable character states are handled using the algorithm of Brazeau, Guillerme and Smith (2019) with the "Morphy" library, under equal or implied step weights. diff --git a/NEWS.md b/NEWS.md index 06e8453af..325b3be1c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# Branch 175-concordance + +- `QuartetConcordance(method = "minh")` uses the approach of Minh et al. (2020) + + # TreeSearch 1.7.0.9000 (development) No changes yet. @@ -23,6 +28,7 @@ No changes yet. - Fix character state colours in app legend - Tweak documentation + # TreeSearch 1.6.0 (2025-04-09) ## Improvements diff --git a/R/Concordance.R b/R/Concordance.R index c5c76641b..ff4f38fba 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -1,6 +1,6 @@ #' Calculate site concordance factor #' -#' The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure +#' The site concordance factor is a measure #' of the strength of support that the dataset presents for a given split in a #' tree. #' @@ -12,21 +12,37 @@ #' with any tree that groups the first two leaves together to the exclusion #' of the second. #' -#' By default, the reported value weights each site by the number of quartets -#' it is decisive for. This value can be interpreted as the proportion of -#' all decisive quartets that are concordant with a split. -#' If `weight = FALSE`, the reported value is the mean of the concordance -#' value for each site. +#' The `methods` argument selects from different methods for quantifying the +#' concordance associated with a given edge of a tree. +#' +#' If `method = "split"` (the default), the concordance is calculated by +#' counting all quartets that are decisive for a split, i.e. for the split +#' $A|B$, any quartet that contains two leaves from set $A$ and two from set $B$. +#' This value can be interpreted as the proportion of all decisive quartets +#' that are concordant with a split. +#' If `method = "sitemean"`, the reported value is the mean of the concordance +#' value for each site. #' Consider a split associated with two sites: #' one that is concordant with 25% of 96 decisive quartets, and #' a second that is concordant with 75% of 4 decisive quartets. -#' If `weight = TRUE`, the split concordance will be 24 + 3 / 96 + 4 = 27%. -#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. +#' `method = "split"` returns a value of 24 + 3 / 96 + 4 = 27%. +#' `method = "sitemean"` returns mean(75%, 25%) = 50%. +#' +#' `method = "minh"` uses the approach of +#' \insertCite{Minh2020;textual}{TreeSearch} to compute the site concordance +#' factor. +#' Briefly, this interprets each edge as defining *four* clades, and counts +#' the status of quartets that contain exactly one leaf from each of these +#' clades. This is a subset of the quartets considered by other methods. +#' `method = "iqtree"` uses the \insertCite{Minh2020;textual}{TreeSearch} +#' approach as [implemented in IQ-TREE]( +#' https://github.com/iqtree/iqtree2/issues/415). +#' #' -#' `QuartetConcordance()` is computed exactly, using all quartets, where as -#' other implementations (e.g. IQ-TREE) follow +#' `QuartetConcordance()` is computed exactly, using all quartets. +#' Other implementations (e.g. IQ-TREE) follow #' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample -#' of quartets for a faster, if potentially less accurate, computation. +#' of quartets for a faster computation, potentially at the expense of accuracy. #' # `ClusteringConcordance()` and `PhylogeneticConcordance()` respectively report # the proportion of clustering information and phylogenetic information @@ -37,6 +53,9 @@ #TODO More thought / explanation needed. #' #TODO Finally, `ProfileConcordance()` (to follow) +#' +#' For an overview of the use and interpretation of concordance factors, +#' see \insertCite{Lanfear2024}{TreeSearch}. #' #' **NOTE:** These functions are under development. They are incompletely #' tested, and may change without notice. @@ -45,8 +64,8 @@ # # Renumber before MaximizeParsimony, for `tree` #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony -#' @param weight Logical specifying whether to weight sites according to the -#' number of quartets they are decisive for. +#' @param method Character vector specifying which concordance measures to +#' calculate. See details section for available options. #' #' #' @@ -86,11 +105,13 @@ #' @importFrom ape keep.tip #' @importFrom cli cli_progress_bar cli_progress_update #' @importFrom utils combn -#' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels +#' @importFrom TreeTools as.Splits DescendantTips PhyDatToMatrix TipLabels #' @name SiteConcordance #' @family split support functions #' @export -QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { +# TODO support `method = c("split", "sitemean", "minh")` +QuartetConcordance <- function (tree, dataset = NULL, method = "split", + n = 250) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -104,62 +125,153 @@ QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { return(NULL) } dataset <- dataset[tipLabels, drop = FALSE] - splits <- as.Splits(tree, dataset) - logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), - logical(NTip(dataset))) - characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) - cli_progress_bar(name = "Quartet concordance", total = dim(logiSplits)[[2]]) - setNames(apply(logiSplits, 2, function (split) { - cli_progress_update(1, .envir = parent.frame(2)) - quarts <- apply(characters, 2, function (char) { - tab <- table(split, char) - nCol <- dim(tab)[[2]] - if (nCol > 1L) { - # Consider the case - # split 0 1 2 - # FALSE 2 3 0 - # TRUE 0 4 2 - # - # Concordant quartets with bin i = 1 (character 0) and bin j = 2 - # (character 1) include any combination of the two taxa from - # [FALSE, 0], and the two taxa chosen from the four in [TRUE, 1], - # i.e. choose(2, 2) * choose(4, 2) - concordant <- sum(vapply(seq_len(nCol), function (i) { - inBinI <- tab[1, i] - iChoices <- choose(inBinI, 2) - sum(vapply(seq_len(nCol)[-i], function (j) { - inBinJ <- tab[2, j] - iChoices * choose(inBinJ, 2) - }, 1)) - }, 1)) - - # To be discordant, we must select a pair of taxa from TT and from FF; - # and the character states must group each T with an F - # e.g. T0 T1 F0 F1 - # T0 T1 F0 F2 would not be discordant - just uninformative - discordant <- sum(apply(combn(nCol, 2), 2, function (ij) prod(tab[, ij]))) - - # Only quartets that include two T and two F can be decisive - # Quartets must also include two pairs of characters - decisive <- concordant + discordant - - # Return the numerator and denominatory of equation 2 in - # Minh et al. 2020 - c(concordant, decisive) - } else { - c(0L, 0L) - } - }) - if (isTRUE(weight)) { - quartSums <- rowSums(quarts) - ifelse(is.nan(quartSums[[2]]), NA_real_, quartSums[[1]] / quartSums[[2]]) + if (method %in% c("minh", "minhq", "iqtree")) { + quarters <- .TreeQuarters(tree) + + cli_progress_bar(name = "Quartet concordance", total = dim(quarters)[[2]]) + on.exit(cli_progress_done(.envir = parent.frame(2))) + if (method == "minh") { + sCF <- apply(quarters, 2, function(abcd) { + resample <- function(x) x[sample.int(length(x), 1)] + leaves <- replicate(n, { + vapply(0:3, function(q) resample(which(abcd == q)), integer(1)) + }) + cfQ <- apply(leaves, 2, function(q) { + qCh <- `mode<-`(characters[q, , drop = FALSE], "numeric") + concordant <- sum(qCh[1, ] == qCh[2, ] & + qCh[2, ] != qCh[3, ] & + qCh[3, ] == qCh[4, ]) + decisive <- dim(qCh)[[2]] + decisive <- sum(colSums(apply(qCh + 1, 2, tabulate, 4) >= 2) > 1) + decisive <- sum(colSums(apply(qCh + 1, 2, tabulate, 4) >= 2) > 0) + concordant / decisive + }) + mean(cfQ, na.rm = TRUE) + }) } else { - mean(ifelse(is.nan(quarts[2, ]), NA_real_, quarts[1, ] / quarts[2, ]), - na.rm = TRUE) + concDeci <- apply(quarters, 2, function (q) { + #cli_progress_update(1, .envir = parent.frame(2)) + quarts <- apply(characters, 2, function (char) { + tab <- table(q, char) + nCol <- dim(tab)[[2]] + if (nCol > 1L) { + # RULES: + # We must pick one leaf per row + # Only parsimony informative are decisive + # To be concordant, leaves 1 and 2 must come from the same column + + # Example table: + # q 0 1 2 + # A 2 3 0 + # B 2 3 2 + # C 0 1 2 + # D 2 1 2 + # + + concordant <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { + i <- ij[[1]] + j <- ij[[2]] + # Taxa from A and B from column i, C and D from j + prod(tab[1:2, i], tab[3:4, j]) + + # Taxa from A and B from column j, C and D from i + prod(tab[1:2, j], tab[3:4, i]) + })) + + if (method == "minhq") { + # To be parsimony informative, we must pick two leaves from each of + # two columns + + + decisive <- sum(apply(combn(seq_len(nCol), 2), 2, function(ij) { + # With three columns, we'll encounter i = 1, 1, 2; j = 2, 3, 3 + i <- ij[[1]] + j <- ij[[2]] + sum(apply(combn(4, 2), 2, function(kl) { + # We can pick taxa AB, AC, AD, BC, BD, CD from column i, + # and the other pair from col j + pair1 <- kl + pair2 <- (1:4)[-kl] + prod(tab[pair1, i], tab[pair2, j]) + })) + })) + } else { + # IQ-TREE seems to use "variant" in place of "parsimony informative" + + nPatterns <- prod(rowSums(tab)) + allSame <- sum(apply(tab, 2, prod)) + + decisive <- nPatterns# - allSame + + } + c(concordant, decisive) + } else { + c(0L, 0L) + } + }) + rowSums(quarts) + }) + concDeci[1, ] / concDeci[2, ] } - }), names(splits)) + } else { + splits <- as.Splits(tree, dataset) + logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), + logical(NTip(dataset))) + cli_progress_bar(name = "Quartet concordance", total = dim(logiSplits)[[2]]) + on.exit(cli_progress_done(.envir = parent.frame(2))) + setNames(apply(logiSplits, 2, function (split) { + cli_progress_update(1, .envir = parent.frame(2)) + quarts <- apply(characters, 2, function (char) { + tab <- table(split, char) + nCol <- dim(tab)[[2]] + if (nCol > 1L) { + # Consider the case + # split 0 1 2 + # FALSE 2 3 0 + # TRUE 0 4 2 + # + # Concordant quartets with bin i = 1 (character 0) and bin j = 2 + # (character 1) include any combination of the two taxa from + # [FALSE, 0], and the two taxa chosen from the four in [TRUE, 1], + # i.e. choose(2, 2) * choose(4, 2) + concordant <- sum(vapply(seq_len(nCol), function (i) { + inBinI <- tab[1, i] + iChoices <- choose(inBinI, 2) + sum(vapply(seq_len(nCol)[-i], function (j) { + inBinJ <- tab[2, j] + iChoices * choose(inBinJ, 2) + }, 1)) + }, 1)) + + # To be discordant, we must select a pair of taxa from TT and from FF; + # and the character states must group each T with an F + # e.g. T0 T1 F0 F1 + # T0 T1 F0 F2 would not be discordant - just uninformative + discordant <- sum(apply(combn(nCol, 2), 2, function (ij) prod(tab[, ij]))) + + # Only quartets that include two T and two F can be decisive + # Quartets must also include two pairs of characters + decisive <- concordant + discordant + + # Return the numerator and denominatory of equation 2 in + # Minh et al. 2020 + c(concordant, decisive) + } else { + c(0L, 0L) + } + }) + if (method == "split") { + quartSums <- rowSums(quarts) + ifelse(is.nan(quartSums[[2]]), NA_real_, quartSums[[1]] / quartSums[[2]]) + } else if (method == "sitemean") { + mean(ifelse(is.nan(quarts[2, ]), NA_real_, quarts[1, ] / quarts[2, ]), + na.rm = TRUE) + } else { + stop("Unrecognized method: ", method) + } + }), names(splits)) + } } #' @importFrom TreeDist Entropy @@ -252,6 +364,52 @@ PhylogeneticConcordance <- function (tree, dataset) { support[, 1] / support[, 2] } +.TreeQuarters <- function(tree) { + if (attr(tree, "order") != "preorder") { + stop("Tree must be in preorder; try `tree <- Preorder(tree)`") + } + edge <- tree[["edge"]] + parent <- edge[, 1] + child <- edge[, 2] + nTip <- NTip(tree) + nodes <- min(parent):max(parent) + parentEdge <- match(nodes, child) + descended <- DescendantTips(parent, child) + childEdges <- lapply(nodes, function(x) which(parent == x)) + daughters <- lapply(childEdges, function(e) child[e]) + + rootNode <- nTip + which(is.na(parentEdge)) + rootChildren <- child[parent == rootNode] + nonRootChildren <- setdiff(nodes, c(rootNode, rootChildren)) + + # Return: + cbind( + if (all(rootChildren > nTip)) { + ret <- integer(nTip) + groups <- childEdges[[rootChildren - nTip]] + for (i in 0:3) { + ret[descended[groups[[i + 1]], ]] <- i + } + `dimnames<-`(cbind(ret), list(TipLabels(tree), rootChildren[[1]])) + }, + `dimnames<-`(vapply(nonRootChildren, function(n) { + ret <- integer(nTip) + if (parent[[parentEdge[[n - nTip]]]] == rootNode) { + stop("Ought something to be here?") + } + ret[apply( + descended[childEdges[[parent[parentEdge[n - nTip]] - nTip]], ], + 2, + any)] <- 1L + ce <- childEdges[[n - nTip]] + # Overprint with own children + ret[descended[ce[[1]], ]] <- 2L + ret[descended[ce[[2]], ]] <- 3L + ret + }, integer(nTip)), list(TipLabels(tree), nonRootChildren)) + ) +} + #' @rdname SiteConcordance # Mutual clustering information of each split with the split implied by each character #' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo diff --git a/R/data_manipulation.R b/R/data_manipulation.R index d3d4b8b4c..109659ed3 100644 --- a/R/data_manipulation.R +++ b/R/data_manipulation.R @@ -22,7 +22,7 @@ #' #' - `informative`: logical specifying which characters contain any #' phylogenetic information. -#' +#' #' - `bootstrap`: The character vector #' \code{c("info.amounts", "split.sizes")}, indicating attributes to sample #' when bootstrapping the dataset (e.g. in Ratchet searches). diff --git a/data-raw/quartet-concordance-tests.R b/data-raw/quartet-concordance-tests.R new file mode 100644 index 000000000..24bae23dd --- /dev/null +++ b/data-raw/quartet-concordance-tests.R @@ -0,0 +1,53 @@ +library("TreeTools") +tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") +mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 2, 2, 0, + 1, 0, 0, 0, 1, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 1, 1, 2, 2, 0, + 0, 0, 1, 1, 2, 2, 3, 3, 0, + 0, 1, 2, 3, 0, 1, 2, 3, 0, + 0, 1, 2, 0, 0, 2, 2, 3, 0), 9, + dimnames = list(letters[1:9], NULL)) +dat <- MatrixToPhyDat(mataset[1:8, ]) +iqtreeDir <- stop("Path to folder that contains iqtree3.exe") +treeFile <- paste0(iqtreeDir, "/tree.nwk") +dataFile <- paste0(iqtreeDir, "/data.nex") +charFile <- paste0(iqtreeDir, "/data%i.nex") + +write.tree(tree, file = treeFile) + +# Expected output per character +# See: https://github.com/iqtree/iqtree2/issues/415 +vapply(1:9, function(i) { + charIFile <- sprintf(charFile, i) + write.nexus.data(dat[, i], file = charIFile, format = "STANDARD") + dataLines <- readLines(charIFile) + dataLines[[5]] <- sub("MISSING=? GAP=- INTERLEAVE=NO ", "", dataLines[[5]], + fixed = TRUE) + writeLines(dataLines, charIFile) + system2(paste0(iqtreeDir, "/iqtree3.exe"), + paste("-t", treeFile, "-s", charIFile, + "--scf 100000", + "-nt 4" + ), stdout = NULL) + cf <- read.tree(paste0(treeFile, ".cf.tree")) + + as.numeric(cf[["node.label"]][-(1:2)]) +}, double(5)) + +# Expected output across entire dataset +write.nexus.data(dat, file = dataFile, format = "STANDARD") +dataLines <- readLines(dataFile) +dataLines[[5]] <- sub("MISSING=? GAP=- INTERLEAVE=NO ", "", dataLines[[5]], fixed = TRUE) +writeLines(dataLines, dataFile) +system2(paste0(iqtreeDir, "iqtree2.exe"), + paste("-t", treeFile, "-s", dataFile, + "--scf 1000000", + "-nt 4" + ), stdout = NULL) +cf <- read.tree(paste0(treeFile, ".cf.tree")) +plot(cf, show.node.label = TRUE, use.edge.length = FALSE) + +as.numeric(cf[["node.label"]][-(1:2)]) diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index a56e54cb8..bbb704efa 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -345,6 +345,17 @@ @article{Kumar2012 number = {2} } +@article{Lanfear2024, + title = {The Meaning and Measure of Concordance Factors in Phylogenomics}, + author = {Lanfear, Robert and Hahn, Matthew W}, + year = {2024}, + journal = {Molecular Biology and Evolution}, + volume = {41}, + number = {11}, + pages = {msae214}, + doi = {10.1093/molbev/msae214} +} + @article{Maddison1991, title = {Null models for the number of evolutionary steps in a character on a phylogenetic tree}, author = {Maddison, Wayne P. and Slatkin, M}, diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 6793a119b..963b7414a 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -9,7 +9,7 @@ \alias{SharedPhylogeneticConcordance} \title{Calculate site concordance factor} \usage{ -QuartetConcordance(tree, dataset = NULL, weight = TRUE) +QuartetConcordance(tree, dataset = NULL, method = "split", n = 250) ClusteringConcordance(tree, dataset) @@ -28,11 +28,11 @@ Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}}. Additive (ordered) characters can be handled using \code{\link[TreeTools]{Decompose}}.} -\item{weight}{Logical specifying whether to weight sites according to the -number of quartets they are decisive for.} +\item{method}{Character vector specifying which concordance measures to +calculate. See details section for available options.} } \description{ -The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure +The site concordance factor is a measure of the strength of support that the dataset presents for a given split in a tree. } @@ -45,21 +45,38 @@ But a quartet with characters \verb{0 0 1 1} is decisive, and is concordant with any tree that groups the first two leaves together to the exclusion of the second. -By default, the reported value weights each site by the number of quartets -it is decisive for. This value can be interpreted as the proportion of -all decisive quartets that are concordant with a split. -If \code{weight = FALSE}, the reported value is the mean of the concordance +The \code{methods} argument selects from different methods for quantifying the +concordance associated with a given edge of a tree. + +If \code{method = "split"} (the default), the concordance is calculated by +counting all quartets that are decisive for a split, i.e. for the split +$A|B$, any quartet that contains two leaves from set $A$ and two from set $B$. +This value can be interpreted as the proportion of all decisive quartets +that are concordant with a split. +If \code{method = "sitemean"}, the reported value is the mean of the concordance value for each site. Consider a split associated with two sites: one that is concordant with 25\% of 96 decisive quartets, and a second that is concordant with 75\% of 4 decisive quartets. -If \code{weight = TRUE}, the split concordance will be 24 + 3 / 96 + 4 = 27\%. -If \code{weight = FALSE}, the split concordance will be mean(75\%, 25\%) = 50\%. +\code{method = "split"} returns a value of 24 + 3 / 96 + 4 = 27\%. +\code{method = "sitemean"} returns mean(75\%, 25\%) = 50\%. + +\code{method = "minh"} uses the approach of +\insertCite{Minh2020;textual}{TreeSearch} to compute the site concordance +factor. +Briefly, this interprets each edge as defining \emph{four} clades, and counts +the status of quartets that contain exactly one leaf from each of these +clades. This is a subset of the quartets considered by other methods. +\code{method = "iqtree"} uses the \insertCite{Minh2020;textual}{TreeSearch} +approach as \href{https://github.com/iqtree/iqtree2/issues/415}{implemented in IQ-TREE}. -\code{QuartetConcordance()} is computed exactly, using all quartets, where as -other implementations (e.g. IQ-TREE) follow +\code{QuartetConcordance()} is computed exactly, using all quartets. +Other implementations (e.g. IQ-TREE) follow \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample -of quartets for a faster, if potentially less accurate, computation. +of quartets for a faster computation, potentially at the expense of accuracy. + +For an overview of the use and interpretation of concordance factors, +see \insertCite{Lanfear2024}{TreeSearch}. \strong{NOTE:} These functions are under development. They are incompletely tested, and may change without notice. diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index b4ffdc786..012991acf 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -15,7 +15,181 @@ test_that("_Concordance() handles tip mismatch", { "No overlap between tree labels and dataset.") }) -test_that("QuartetConcordance() works", { +test_that(".TreeQuarters() gives complete list", { + tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") + coll <- CollapseNode(Preorder(tree), 12:13) + expect_error(.TreeQuarters(tree), "must be in preorder") + tree <- Preorder(tree) + + expect_equal( + .TreeQuarters(tree), + cbind("11" = c(a = 0, b = 1, c = 2, d = 3, e = 3, f = 3, g = 3, h = 3), + "12" = c(0, 0, 1, 2, 3, 3, 3, 3), + "13" = c(0, 0, 0, 1, 2, 2, 3, 3), + "14" = c(rep(0, 4), 2, 3, 1, 1), + "15" = c(rep(0, 4), 1, 1, 2, 3)) + ) + expect_equal(unname(.TreeQuarters(coll)), + unname(.TreeQuarters(Preorder(tree))[, c(1, 4, 5)])) +}) + +test_that("QuartetConcordance(method = minhq)", { + tree <- ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));") + tree <- Preorder(tree) + mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 2, 2, 0, + 1, 0, 0, 0, 1, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 1, 1, 2, 2, 0, + 0, 0, 1, 1, 2, 2, 3, 3, 0, + 0, 1, 2, 3, 0, 1, 2, 3, 0, + 0, 1, 2, 0, 0, 2, 2, 3, 0), 9, + dimnames = list(letters[1:9], NULL)) + dat <- MatrixToPhyDat(mataset) + + # plot(tree); nodelabels(); + expect_concordance <- function(i, expectation, iq = "minhq") { + expect_equal( + QuartetConcordance(tree, dat[, i], + method = ifelse(iq == "iq", "iqtree", "minhq")), + setNames(expectation, 11:15)) + } + # Expectations computed by working through tables manually + expect_concordance(1, c(NaN, NaN, 1, NaN, NaN)) + expect_concordance(2, c(1, NaN, NaN, NaN, NaN)) + expect_concordance(3, c(1, NaN, NaN, NaN, 1)) + expect_concordance(4, c(0, 0, 1, NaN, NaN)) + expect_concordance(5, c(0, 0, 4 / 6, 0, 1)) + expect_concordance(6, rep(NaN, 5)) + expect_concordance(7, c(1, NaN, NaN, NaN, NaN)) + expect_concordance(8, c(NaN, NaN, 0, NaN, NaN)) + expect_concordance(9, c(NaN, 0, 1 / 2, 0, NaN)) + + # Values calculated from summing results above + expect_equal(unname(QuartetConcordance(tree, dat, method = "minhq")), + c(5 + 3 + 1, + 0, + 12 + 8 + 4 + 1, + 0, + 4 + 3) / + c(5 + 3 + 4 + 3 + 1, + 4 + 3 + 2, + 12 + 8 + 6 + 2 + 2, + 6 + 2, + 4 + 3)) + + # Expectations computed by iq-tree + expect_concordance(iq = "iq", 1, c(0, 0, 1, 0, 0)) + expect_concordance(iq = "iq", 2, c(1, 0, 0, 0, 0)) + expect_concordance(iq = "iq", 3, c(0.6, 0, 0, 0, 0.5)) + expect_concordance(iq = "iq", 4, c(0, 0, 2 / 3, 0, 0)) + expect_concordance(iq = "iq", 5, c(0, 0, 1 / 3, 0, 3 / 8)) + expect_concordance(iq = "iq", 6, c(0, 0, 0, 0, 0)) + expect_concordance(iq = "iq", 7, c(1 / 5, 0, 0, 0, 0)) + expect_concordance(iq = "iq", 8, rep(0, 5)) + expect_concordance(iq = "iq", 9, c(0, 0, 1 / 12, 0, 0)) + expect_equal(unname(QuartetConcordance(tree, dat, method = "iqtree")), + c(56.7, 0, 85.4, 0, 62.5) / 100, tolerance = 0.01) + + collapsed <- CollapseNode(tree, c(12, 13)) + expect_equal( + unname(QuartetConcordance(collapsed, dat, method = "iqtree")), + unname(QuartetConcordance(tree, dat, method = "iqtree"))[-c(2, 3)] + ) +}) + +test_that("QuartetConcordance(method = minh)", { + tree <- Preorder( + ape::read.tree(text = "(a, (b, (c, (d, ((e, f), (g, h))))));")) + mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 0, + 0, 0, 1, 1, 1, 1, 2, 2, 0, + 1, 0, 0, 0, 1, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 1, 1, 2, 2, 0, + 0, 0, 1, 1, 2, 2, 3, 3, 0, + 0, 1, 2, 3, 0, 1, 2, 3, 0, + 0, 1, 2, 0, 0, 2, 2, 3, 0), 9, + dimnames = list(letters[1:9], NULL)) + dat <- MatrixToPhyDat(mataset) + + # plot(tree); nodelabels(); + expect_concordance <- function(i, expectation) { + expect_equal( + QuartetConcordance(tree, dat[, i], method = "minh", n = 1250), + setNames(expectation, 11:15), tolerance = 0.05) + } + + # Expectations computed by iq-tree + expect_concordance(1, c(0, 0, 1, 0, 0)) + expect_concordance(2, c(1, 0, 0, 0, 0)) + expect_concordance(3, c(0.6, 0, 0, 0, 0.5)) + expect_concordance(4, c(0, 0, 2 / 3, 0, 0)) + expect_concordance(5, c(0, 0, 1 / 3, 0, 3 / 8)) + expect_concordance(6, c(0, 0, 0, 0, 0)) + expect_concordance(7, c(1 / 5, 0, 0, 0, 0)) + expect_concordance(8, rep(0, 5)) + expect_concordance(9, c(0, 0, 1 / 12, 0, 0)) + expect_equal(tolerance = 0.05, + unname(QuartetConcordance(tree, dat, method = "minh", n = 1234)), + c(56.7, 0, 85.4, 0, 62.5) / 100) + + collapsed <- CollapseNode(tree, c(12, 13)) + expect_equal(tolerance = 0.05, + QuartetConcordance(collapsed, dat, method = "minh", n = 1234), + QuartetConcordance(tree, dat, method = "minh", n = 1234)[-(2:3)] + ) +}) + +test_that("QuartetConcordance() calculates correct values - weighting", { + labels <- letters[1:5] + tr <- PectinateTree(labels) + # plot(tr); nodelabels(adj = c(4, 0.5)) + + char <- MatrixToPhyDat(cbind( + c(a = 0, b = 0, c = 0, d = 1, e = 1), + c(0, 0, 1, 0, 1), + c(0, 1, 0, 0, 1) + )[, c(1:3, 1)]) + + # Is quartet concordant with character? + # Quartets labelled by omitted leaf + C <- 1 # Concordant + D <- 0 # Discordant + I <- NaN # Irrelevant + expected_concordance <- cbind( + ch1 = c(a = C, b = C, c = C, d = I, e = I), + ch2 = c(D, D, I, C, I), + ch3 = c(D, I, D, D, I), + ch4 = c(a = C, b = C, c = C, d = I, e = I)) + + conc <- vapply(labels, function(lab) { + quartet <- DropTip(tr, lab) + vapply(1:4, function(i) QuartetConcordance(quartet, char[, i]), double(1)) + }, c(ch1 = NaN, ch2 = NaN, ch3 = NaN, ch4 = NaN)) + expect_equal(conc, t(expected_concordance)) + + split_relevant <- list( + "8" = letters[3:5], + "9" = letters[1:3]) + + expect_equal( + QuartetConcordance(tr, char, method = "split"), # 8 = 0.75; 9 = 0.5 + vapply(split_relevant, function(splitQ) { + sum(expected_concordance[splitQ, ], na.rm = TRUE) / + sum(!is.nan(expected_concordance[splitQ, ])) + }, double(1)) + ) + + # Same proportions, different number of chars -> weights + char10 <- char[, rep(1:4, each = 10)] + expect_equal( + QuartetConcordance(tr, char, method = "split"), + QuartetConcordance(tr, char10, method = "split")) +}) + +test_that("QuartetConcordance() calculates correct values", { tree <- BalancedTree(8) splits <- as.Splits(tree) mataset <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1, 0, @@ -29,6 +203,7 @@ test_that("QuartetConcordance() works", { "`dataset` must be a phyDat object") dat <- MatrixToPhyDat(mataset) expect_equal(unname(QuartetConcordance(tree, dat[, 1])), rep(1, 5)) + # plot(tree); nodelabels(); expect_equal(QuartetConcordance(tree, dat[, 2]), c("10" = 1/9, "11" = 0, "12" = 0,