diff --git a/NEWS.md b/NEWS.md
index 06e8453af..a08a957e6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,6 @@
 # TreeSearch 1.7.0.9000 (development)
 
-No changes yet.
+- Fix regression: `MaximumLength()` crashed on characters with more than eight states (#203).
 
 # TreeSearch 1.7.0 (2025-08-22)
 
diff --git a/R/length_range.R b/R/length_range.R
index 21abe73e5..66a7e8d0e 100644
--- a/R/length_range.R
+++ b/R/length_range.R
@@ -128,7 +128,7 @@ MinimumLength.numeric <- function (x, compress = NA) {
     } else {
      squish <- which.max(occurrences)
      tokensUsed <- tokensUsed + 1L
-      tokens <- tokens[, tokens[!squish], drop = FALSE]
+      tokens <- tokens[, !tokens[squish, ], drop = FALSE]
    }
  }
  lastDim <- dim(tokens)
@@ -178,8 +178,13 @@ MaximumLength.numeric <- function(x, compress = NA) {
  nInapp <- sum(x == 0)
  regions <- max(1, nInapp - 1)
  steps <- 0
-  if (sum(counts > 0) > 1) {
+  if (length(counts) > 0 && sum(counts > 0) > 1) {
    nState <- floor(log2(length(counts))) + 1L
+    # Guard against overflow: with nState > 30, nToken approaches integer
+    # capacity and the nToken x nToken matrices become prohibitively large
+    if (nState > 30L) {
+      stop("MaximumLength() does not support more than 30 character states.")
+    }
    nToken <- 2 ^ nState - 1
    counts <- c(counts, double(nToken - length(counts)))
 
@@ -188,9 +193,10 @@
    tokenSums <- colSums(tokens)
    active <- c(rep(TRUE, nToken - 1), FALSE)
 
-    bitTokens <- as.raw(seq_len(nToken))
-    nonIntersect <- outer(bitTokens, bitTokens, `&`) == 00
-    unions <- matrix(tokenSums[as.integer(outer(bitTokens, bitTokens, `|`))],
+    # Use integer bitwise operations to avoid raw overflow when nToken > 255
+    bitTokens <- seq_len(nToken)
+    nonIntersect <- outer(bitTokens, bitTokens, bitwAnd) == 0L
+    unions <- matrix(tokenSums[outer(bitTokens, bitTokens, bitwOr)],
                     nToken, nToken)
    .Merge <- function(a, b) sum(2 ^ (which(tokens[, a] | tokens[, b]) - 1))
    loopCount <- 0
diff --git a/tests/testthat/test-length_range.R b/tests/testthat/test-length_range.R
index 3e18156cf..0ae93ac32 100644
--- a/tests/testthat/test-length_range.R
+++ b/tests/testthat/test-length_range.R
@@ -151,3 +151,37 @@ test_that("MaximumLength() handles inapplicable tokens", {
  expect_equal(MaximumLength("00112233{01}{23}{012}----"), 3 + 3 + 1 + 2)
 })
 
+test_that("MaximumLength() handles many states (>8) without overflow", {
+  # Regression test for crash with datasets having more than 8 character states
+  # When nToken > 255, as.raw() would overflow; bitwAnd()/bitwOr() are now used instead
+
+  # Test with 10 states (nToken = 1023)
+  # This should complete without error (the specific result is secondary)
+  # 10 distinct character states (powers of 2 for bit representation)
+  manyStates <- c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512)
+  expect_silent(result <- MaximumLength.numeric(manyStates))
+  expect_equal(result, 9) # max steps for 10 distinct tokens
+
+  # Test with a combination of states including the inapplicable token (0)
+  manyStatesWithInapp <- c(0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 0)
+  expect_silent(result <- MaximumLength.numeric(manyStatesWithInapp))
+  expect_equal(result, 9) # 9 steps + 0 extra for inapplicable regions
+
+  # Test with a large combined token value (all 10 states present in one tip)
+  # This is the scenario that triggered #203
+  # 1 = only state 0; 1023 = all states 0-9 combined (2^10 - 1)
+  largeToken <- c(1, 1023)
+  expect_silent(result <- MaximumLength.numeric(largeToken))
+  expect_equal(result, 0) # state 0 is a subset of 1023, so no extra steps needed
+
+  expect_error(expect_warning(MaximumLength.numeric(2 ^ (0:31))),
+               "does not support more than 30 character states")
+})
+
+test_that("MaximumLength() works with Vinther2008 dataset", {
+  # This dataset has 10 states (0-9), which caused the overflow
+  data("inapplicable.phyData")
+  expect_silent(result <- MaximumLength(inapplicable.phyData[["Vinther2008"]]))
+  expect_true(is.numeric(result))
+  expect_true(all(result >= 0))
+})
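
Note (illustrative sketch, not part of the patch above): the switch from `as.raw()` to `bitwAnd()`/`bitwOr()` matters because R's raw type holds a single byte, so token indices above 255 cannot be represented, whereas integer bitwise operations work up to 2^31 - 1. The lines below show the behaviour the patch works around:

    as.raw(511)          # warns "out-of-range values treated as 0" and returns 00
    bitwAnd(511L, 256L)  # 256: integer bitwise AND copes with indices above 255
    bitwOr(3L, 4L)       # 7: union of two token bit masks
    .Machine$integer.max # 2147483647 = 2^31 - 1; nToken for 31 states would hit
                         # this ceiling, hence the new cap of 30 character states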