diff --git a/tests/testthat/test-hyperparameterTuning.R b/tests/testthat/test-hyperparameterTuning.R
index 7b5f285..c7c9a25 100644
--- a/tests/testthat/test-hyperparameterTuning.R
+++ b/tests/testthat/test-hyperparameterTuning.R
@@ -18,52 +18,103 @@ testthat::test_that(
     , Fold3 = as.integer(seq(3,nrow(agaricus.train$data),by = 3))
   )
 
-  scoringFunction <- function(
-      max_depth
-    , max_leaves
-    , min_child_weight
-    , subsample
-    , colsample_bytree
-    , gamma
-    , lambda
-    , alpha
-  ) {
-
-    dtrain <- xgb.DMatrix(agaricus.train$data,label = agaricus.train$label)
+
+  scoringFunction <- function(max_depth, max_leaves,
+                              min_child_weight, subsample,
+                              colsample_bytree, gamma, lambda, alpha,
+                              .debug = FALSE) {
+
+    # ---- Type coercion & scalarization ----
+    max_depth        <- as.integer(max_depth)[1]
+    max_leaves       <- as.integer(max_leaves)[1]
+    min_child_weight <- as.numeric(min_child_weight)[1]
+    subsample        <- as.numeric(subsample)[1]
+    colsample_bytree <- as.numeric(colsample_bytree)[1]
+    gamma            <- as.numeric(gamma)[1]
+    lambda           <- as.numeric(lambda)[1]
+    alpha            <- as.numeric(alpha)[1]
+
+    # ---- Data (assumes 'agaricus.train' and 'Folds' exist) ----
+    dtrain <- xgboost::xgb.DMatrix(
+      data = agaricus.train$data,
+      label = agaricus.train$label
+    )
+
+    # Base params
     Pars <- list(
-        booster = "gbtree"
-      , eta = 0.001
-      , max_depth = max_depth
-      , max_leaves = max_leaves
-      , min_child_weight = min_child_weight
-      , subsample = subsample
-      , colsample_bytree = colsample_bytree
-      , gamma = gamma
-      , lambda = lambda
-      , alpha = alpha
-      , objective = "binary:logistic"
-      , eval_metric = "auc"
+      booster = "gbtree",
+      eta = 0.01,
+      max_depth = max_depth,
+      min_child_weight = min_child_weight,
+      subsample = subsample,
+      colsample_bytree = colsample_bytree,
+      gamma = gamma,
+      lambda = lambda,   # L2 reg
+      alpha = alpha,     # L1 reg
+      objective = "binary:logistic",
+      eval_metric = "auc"
     )
 
-    xgbcv <- xgb.cv(
-        params = Pars
-      , data = dtrain
-      , nround = 100
-      , folds = Folds
-      , early_stopping_rounds = 5
-      , maximize = TRUE
-      , verbose = 0
+    # If max_leaves is requested, enable histogram/lossguide (so xgboost uses it)
+    if (!is.na(max_leaves) && max_leaves > 0L) {
+      Pars$tree_method <- "hist"
+      Pars$grow_policy <- "lossguide"
+      Pars$max_leaves  <- max_leaves
+      # It's common to leave max_depth as-is; alternatively set max_depth = 0
+      # Pars$max_depth <- 0L
+    }
+
+    # ---- Safe CV wrapper ----
+    xgbcv <- try(
+      xgboost::xgb.cv(
+        params = Pars,
+        data = dtrain,
+        nrounds = 100,
+        folds = Folds,
+        prediction = FALSE,
+        showsd = TRUE,
+        early_stopping_rounds = 5,
+        maximize = TRUE,
+        verbose = 0
+      ),
+      silent = TRUE
    )
 
-    return(
-      list(
-          Score = max(xgbcv$evaluation_log$test_auc_mean)
-        , nrounds = xgbcv$best_iteration
-      )
+    # On error: return worst score but keep scalars so bayesOpt can proceed
+    if (inherits(xgbcv, "try-error")) {
+      if (isTRUE(.debug)) message("xgb.cv error: ", as.character(xgbcv))
+      return(list(Score = -Inf, BestNrounds = 1L))
+    }
+
+    # ---- Scalar Score ----
+    # as.numeric() strips names, so no separate unname() step is needed
+    score_vec <- as.numeric(xgbcv$evaluation_log$test_auc_mean)
+    Score <- as.numeric(max(score_vec, na.rm = TRUE))[1]
+
+    # ---- Scalar best nrounds ----
+    bi <- xgbcv$best_iteration
+    if (is.null(bi) || length(bi) != 1L || is.na(bi)) {
+      bi <- which.max(score_vec)
+      if (length(bi) != 1L || is.na(bi)) bi <- 1L
+    }
+    BestNrounds <- as.integer(bi)[1]
+
+    if (isTRUE(.debug)) {
+      cat(sprintf(
+        "DEBUG | Score len=%d val=%.6f | BestNrounds len=%d val=%d\n",
+        length(Score), Score,
+        length(BestNrounds), BestNrounds
+      ))
+    }
+
+    list(
+      Score = Score,               # must be scalar
+      BestNrounds = BestNrounds    # must be scalar
     )
   }
+
+
   bounds <- list(
       max_depth = c(1L, 5L)
     , max_leaves = c(2L,25L)
diff --git a/vignettes/tuningHyperparameters.Rmd b/vignettes/tuningHyperparameters.Rmd
index de9e7d4..dd9165a 100644
--- a/vignettes/tuningHyperparameters.Rmd
+++ b/vignettes/tuningHyperparameters.Rmd
@@ -53,38 +53,60 @@ Folds <- list(
   , Fold3 = as.integer(seq(3,nrow(agaricus.train$data),by = 3))
 )
 
 Now we need to define the scoring function. This function should, at a minimum, return a list with a ```Score``` element, which is the model evaluation metric we want to maximize. We can also retain other pieces of information created by the scoring function by including them as named elements of the returned list. In this case, we want to retain the optimal number of rounds determined by ```xgb.cv```:
 
 ```{r eval = xgbAvail}
+
 scoringFunction <- function(max_depth, min_child_weight, subsample) {
 
-  dtrain <- xgb.DMatrix(agaricus.train$data,label = agaricus.train$label)
-
-  Pars <- list(
-      booster = "gbtree"
-    , eta = 0.01
-    , max_depth = max_depth
-    , min_child_weight = min_child_weight
-    , subsample = subsample
-    , objective = "binary:logistic"
-    , eval_metric = "auc"
+  # Coerce types explicitly
+  max_depth        <- as.integer(max_depth)[1]
+  min_child_weight <- as.numeric(min_child_weight)[1]
+  subsample        <- as.numeric(subsample)[1]
+
+  dtrain <- xgboost::xgb.DMatrix(
+    data = agaricus.train$data,
+    label = agaricus.train$label
+  )
+
+  Pars <- list(
+    booster = "gbtree",
+    eta = 0.01,
+    max_depth = max_depth,
+    min_child_weight = min_child_weight,
+    subsample = subsample,
+    objective = "binary:logistic",
+    eval_metric = "auc"
   )
 
-  xgbcv <- xgb.cv(
-      params = Pars
-    , data = dtrain
-    , nround = 100
-    , folds = Folds
-    , prediction = TRUE
-    , showsd = TRUE
-    , early_stopping_rounds = 5
-    , maximize = TRUE
-    , verbose = 0)
-
-  return(
-    list(
-        Score = max(xgbcv$evaluation_log$test_auc_mean)
-      , nrounds = xgbcv$best_iteration
-    )
+  xgbcv <- xgboost::xgb.cv(
+    params = Pars,
+    data = dtrain,
+    nrounds = 100,        # <- canonical argument name
+    folds = Folds,
+    prediction = FALSE,   # set TRUE only if you actually need CV preds
+    showsd = TRUE,
+    early_stopping_rounds = 5,
+    maximize = TRUE,
+    verbose = 0
+  )
+
+  # Compute Score robustly (scalar)
+  score_vec <- xgbcv$evaluation_log$test_auc_mean
+  Score <- as.numeric(max(score_vec, na.rm = TRUE))[1]
+
+  # Derive a scalar "best nrounds" robustly
+  bi <- xgbcv$best_iteration
+  if (is.null(bi) || length(bi) != 1 || is.na(bi)) {
+    # fallback: the iteration where test AUC is maximized
+    bi <- which.max(score_vec)
+    if (length(bi) != 1 || is.na(bi)) bi <- 1L
+  }
+  BestNrounds <- as.integer(bi)[1]
+
+  list(
+    Score = Score,
+    BestNrounds = BestNrounds
  )
 }
+
 ```
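
Both hunks exist to satisfy the same contract: `ParBayesianOptimization::bayesOpt()` expects `FUN` to return a list whose `Score` element is a scalar, with any extra named scalars (here `BestNrounds`) retained alongside it. A minimal sketch of that downstream call for the vignette's three-parameter `scoringFunction`; the `bounds` ranges, `initPoints`, and `iters.n` values below are illustrative only and are not taken from the patch:

```r
library(ParBayesianOptimization)
library(xgboost)

# Illustrative bounds for the vignette's scoringFunction; the real vignette
# defines its own ranges. Integer-valued parameters use integer bounds.
bounds <- list(
    max_depth        = c(1L, 10L)
  , min_child_weight = c(1, 25)
  , subsample        = c(0.25, 1)
)

# Assumes scoringFunction, Folds, and agaricus.train from the vignette chunk
# are already defined in the session.
optObj <- bayesOpt(
    FUN        = scoringFunction   # must return list(Score = <scalar>, ...)
  , bounds     = bounds
  , initPoints = 4                 # illustrative; must exceed length(bounds)
  , iters.n    = 3                 # illustrative
)

# Extra named scalars returned by FUN (here BestNrounds) are recorded as
# columns of the score summary, next to Score.
getBestPars(optObj)
optObj$scoreSummary$BestNrounds
```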