From d5caef66b81e8d4e0a4eed61edacb40e5e6684a2 Mon Sep 17 00:00:00 2001
From: novica
Date: Thu, 4 Dec 2025 18:38:45 +0100
Subject: [PATCH 1/2] fix: vignette builds

---
 vignettes/tuningHyperparameters.Rmd | 74 +++++++++++++++++++----------
 1 file changed, 48 insertions(+), 26 deletions(-)

diff --git a/vignettes/tuningHyperparameters.Rmd b/vignettes/tuningHyperparameters.Rmd
index de9e7d4..dd9165a 100644
--- a/vignettes/tuningHyperparameters.Rmd
+++ b/vignettes/tuningHyperparameters.Rmd
@@ -53,38 +53,60 @@ Folds <- list(
 
 Now we need to define the scoring function. This function should, at a minimum, return a list with a ```Score``` element, which is the model evaluation metric we want to maximize. We can also retain other pieces of information created by the scoring function by including them as named elements of the returned list. In this case, we want to retain the optimal number of rounds determined by the ```xgb.cv```:
 
 ```{r eval = xgbAvail}
+
 scoringFunction <- function(max_depth, min_child_weight, subsample) {
-  dtrain <- xgb.DMatrix(agaricus.train$data,label = agaricus.train$label)
-
-  Pars <- list(
-      booster = "gbtree"
-    , eta = 0.01
-    , max_depth = max_depth
-    , min_child_weight = min_child_weight
-    , subsample = subsample
-    , objective = "binary:logistic"
-    , eval_metric = "auc"
+  # Coerce types explicitly
+  max_depth        <- as.integer(max_depth)[1]
+  min_child_weight <- as.numeric(min_child_weight)[1]
+  subsample        <- as.numeric(subsample)[1]
+
+  dtrain <- xgboost::xgb.DMatrix(
+    data  = agaricus.train$data,
+    label = agaricus.train$label
+  )
+
+  Pars <- list(
+    booster = "gbtree",
+    eta = 0.01,
+    max_depth = max_depth,
+    min_child_weight = min_child_weight,
+    subsample = subsample,
+    objective = "binary:logistic",
+    eval_metric = "auc"
   )
 
-  xgbcv <- xgb.cv(
-      params = Pars
-    , data = dtrain
-    , nround = 100
-    , folds = Folds
-    , prediction = TRUE
-    , showsd = TRUE
-    , early_stopping_rounds = 5
-    , maximize = TRUE
-    , verbose = 0)
-
-  return(
-    list(
-        Score = max(xgbcv$evaluation_log$test_auc_mean)
-      , nrounds = xgbcv$best_iteration
-    )
+  xgbcv <- xgboost::xgb.cv(
+    params = Pars,
+    data = dtrain,
+    nrounds = 100,       # <- canonical argument name
+    folds = Folds,
+    prediction = FALSE,  # set TRUE only if you actually need CV preds
+    showsd = TRUE,
+    early_stopping_rounds = 5,
+    maximize = TRUE,
+    verbose = 0
+  )
+
+  # Compute Score robustly (scalar)
+  score_vec <- xgbcv$evaluation_log$test_auc_mean
+  Score <- as.numeric(max(score_vec, na.rm = TRUE))[1]
+
+  # Derive a scalar "best nrounds" robustly
+  bi <- xgbcv$best_iteration
+  if (is.null(bi) || length(bi) != 1 || is.na(bi)) {
+    # fallback: the iteration where test AUC is maximized
+    bi <- which.max(score_vec)
+    if (length(bi) != 1 || is.na(bi)) bi <- 1L
+  }
+  BestNrounds <- as.integer(bi)[1]
+
+  list(
+    Score = Score,
+    BestNrounds = BestNrounds
   )
 }
+
 ```

From 12bd3cba98424fbcb7d0502bfe2e80f1d613cdf6 Mon Sep 17 00:00:00 2001
From: novica
Date: Thu, 4 Dec 2025 18:54:54 +0100
Subject: [PATCH 2/2] fix: test works

---
 tests/testthat/test-hyperparameterTuning.R | 125 +++++++++++++++------
 1 file changed, 88 insertions(+), 37 deletions(-)

diff --git a/tests/testthat/test-hyperparameterTuning.R b/tests/testthat/test-hyperparameterTuning.R
index 7b5f285..c7c9a25 100644
--- a/tests/testthat/test-hyperparameterTuning.R
+++ b/tests/testthat/test-hyperparameterTuning.R
@@ -18,52 +18,103 @@ testthat::test_that(
     , Fold3 = as.integer(seq(3,nrow(agaricus.train$data),by = 3))
   )
 
-  scoringFunction <- function(
-      max_depth
-    , max_leaves
-    , min_child_weight
-    , subsample
-    , colsample_bytree
-    , gamma
-    , lambda
-    , alpha
-  ) {
-
-    dtrain <- xgb.DMatrix(agaricus.train$data,label = agaricus.train$label)
+
+  scoringFunction <- function(max_depth, max_leaves,
+                              min_child_weight, subsample,
+                              colsample_bytree, gamma, lambda, alpha,
+                              .debug = FALSE) {
+
+    # ---- Type coercion & scalarization ----
+    max_depth        <- as.integer(max_depth)[1]
+    max_leaves       <- as.integer(max_leaves)[1]
+    min_child_weight <- as.numeric(min_child_weight)[1]
+    subsample        <- as.numeric(subsample)[1]
+    colsample_bytree <- as.numeric(colsample_bytree)[1]
+    gamma            <- as.numeric(gamma)[1]
+    lambda           <- as.numeric(lambda)[1]
+    alpha            <- as.numeric(alpha)[1]
+
+    # ---- Data (assumes 'agaricus.train' and 'Folds' exist) ----
+    dtrain <- xgboost::xgb.DMatrix(
+      data = agaricus.train$data,
+      label = agaricus.train$label
+    )
+
+    # Base params
     Pars <- list(
-        booster = "gbtree"
-      , eta = 0.001
-      , max_depth = max_depth
-      , max_leaves = max_leaves
-      , min_child_weight = min_child_weight
-      , subsample = subsample
-      , colsample_bytree = colsample_bytree
-      , gamma = gamma
-      , lambda = lambda
-      , alpha = alpha
-      , objective = "binary:logistic"
-      , eval_metric = "auc"
+      booster = "gbtree",
+      eta = 0.01,
+      max_depth = max_depth,
+      min_child_weight = min_child_weight,
+      subsample = subsample,
+      colsample_bytree = colsample_bytree,
+      gamma = gamma,
+      lambda = lambda,   # L2 reg
+      alpha = alpha,     # L1 reg
+      objective = "binary:logistic",
+      eval_metric = "auc"
     )
 
-    xgbcv <- xgb.cv(
-        params = Pars
-      , data = dtrain
-      , nround = 100
-      , folds = Folds
-      , early_stopping_rounds = 5
-      , maximize = TRUE
-      , verbose = 0
+    # If max_leaves is requested, enable histogram/lossguide (so xgboost uses it)
+    if (!is.na(max_leaves) && max_leaves > 0L) {
+      Pars$tree_method <- "hist"
+      Pars$grow_policy <- "lossguide"
+      Pars$max_leaves  <- max_leaves
+      # It's common to leave max_depth as-is; alternatively set max_depth = 0
+      # Pars$max_depth <- 0L
+    }
+
+    # ---- Safe CV wrapper ----
+    xgbcv <- try(
+      xgboost::xgb.cv(
+        params = Pars,
+        data = dtrain,
+        nrounds = 100,
+        folds = Folds,
+        prediction = FALSE,
+        showsd = TRUE,
+        early_stopping_rounds = 5,
+        maximize = TRUE,
+        verbose = 0
+      ),
+      silent = TRUE
     )
 
-    return(
-      list(
-          Score = max(xgbcv$evaluation_log$test_auc_mean)
-        , nrounds = xgbcv$best_iteration
-      )
+    # On error: return worst score but keep scalars so bayesOpt can proceed
+    if (inherits(xgbcv, "try-error")) {
+      if (isTRUE(.debug)) message("xgb.cv error: ", as.character(xgbcv))
+      return(list(Score = as.numeric(-Inf), BestNrounds = as.integer(1L)))
+    }
+
+    # ---- Scalar Score ----
+    score_vec <- as.numeric(xgbcv$evaluation_log$test_auc_mean)
+    if (!is.null(names(score_vec))) names(score_vec) <- NULL
+    Score <- as.numeric(max(score_vec, na.rm = TRUE))[1]
+
+    # ---- Scalar best nrounds ----
+    bi <- xgbcv$best_iteration
+    if (is.null(bi) || length(bi) != 1L || is.na(bi)) {
+      bi <- which.max(score_vec)
+      if (length(bi) != 1L || is.na(bi)) bi <- 1L
+    }
+    BestNrounds <- as.integer(bi)[1]
+
+    if (isTRUE(.debug)) {
+      cat(sprintf(
+        "DEBUG | Score len=%d val=%.6f | BestNrounds len=%d val=%d\n",
+        length(Score), Score, length(BestNrounds), BestNrounds
+      ))
+    }
+
+    list(
+      Score = Score,              # must be scalar
+      BestNrounds = BestNrounds   # must be scalar
    )
  }
+
+
   bounds <- list(
       max_depth = c(1L, 5L)
     , max_leaves = c(2L,25L)
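
For reference, a minimal sketch (not part of either patch) of how a scoring function with this return shape is consumed by ```ParBayesianOptimization::bayesOpt()```. It assumes ```scoringFunction``` and ```Folds``` are the three-parameter vignette versions patched above; the ```bounds``` values below are illustrative assumptions, not taken from the vignette.

```r
# Usage sketch: assumes ParBayesianOptimization and xgboost are installed,
# and that scoringFunction/Folds from the patched vignette are in scope.
library(ParBayesianOptimization)
library(xgboost)
data(agaricus.train, package = "xgboost")

# Illustrative bounds for the vignette's three tuned parameters
bounds <- list(
    max_depth = c(1L, 5L)
  , min_child_weight = c(0, 25)
  , subsample = c(0.25, 1)
)

set.seed(1234)
optObj <- bayesOpt(
    FUN = scoringFunction   # must return list(Score = <scalar>, ...)
  , bounds = bounds
  , initPoints = 4          # must exceed the number of tuned parameters
  , iters.n = 2
)

getBestPars(optObj)   # named list of the best hyperparameters found
optObj$scoreSummary   # per-evaluation table; extra returned scalars appear as columns
```

The scalar ```Score``` is what ```bayesOpt()``` maximizes; any extra named scalar the function returns, such as ```BestNrounds```, is carried along in ```scoreSummary```, which is why both patches coerce every returned element to length one.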