diff --git a/python/nvforest/nvforest/benchmark/README.md b/python/nvforest/nvforest/benchmark/README.md new file mode 100644 index 0000000..6a22048 --- /dev/null +++ b/python/nvforest/nvforest/benchmark/README.md @@ -0,0 +1,161 @@ +# nvforest Benchmark Suite + +Comprehensive benchmark comparing nvforest inference performance against native ML framework inference (sklearn, XGBoost, LightGBM). + +## Quick Start + +```bash +# Dry run - see what will be benchmarked +python -m nvforest.benchmark.benchmark run --dry-run + +# Quick test - verify setup with minimal parameters +python -m nvforest.benchmark.benchmark run --quick-test + +# Full benchmark +python -m nvforest.benchmark.benchmark run +``` + +## Usage + +### Running Benchmarks + +```bash +python -m nvforest.benchmark.benchmark run [OPTIONS] +``` + +**Options:** + +| Option | Short | Description | +|--------|-------|-------------| +| `--framework` | `-f` | Framework(s) to benchmark: `sklearn`, `xgboost`, `lightgbm`. Repeatable. Default: all available | +| `--dry-run` | `-n` | Print configuration without running | +| `--quick-test` | `-q` | Run with minimal parameters for quick verification | +| `--device` | `-d` | Device: `cpu`, `gpu`, or `both`. Default: `both` | +| `--model-type` | `-m` | Model type: `regressor`, `classifier`, or `both`. Default: `both` | +| `--output-dir` | `-o` | Output directory for results. 
Default: `benchmark/data/` | + +**Examples:** + +```bash +# Benchmark only sklearn on CPU +python -m nvforest.benchmark.benchmark run --framework sklearn --device cpu + +# Benchmark XGBoost and LightGBM classifiers only +python -m nvforest.benchmark.benchmark run -f xgboost -f lightgbm -m classifier + +# Quick test with specific framework +python -m nvforest.benchmark.benchmark run --quick-test --framework sklearn +``` + +### Analyzing Results + +```bash +python -m nvforest.benchmark.analyze RESULTS_FILE [OPTIONS] +``` + +**Options:** + +| Option | Short | Description | +|--------|-------|-------------| +| `--output` | `-o` | Output file for speedup heatmap plot | +| `--framework` | `-f` | Filter results to specific framework | +| `--device` | `-d` | Filter results to specific device (`cpu` or `gpu`) | +| `--plot-only` | | Only generate plot, skip summary | +| `--summary-only` | | Only print summary, skip plot | + +**Examples:** + +```bash +# Analyze results and generate plots +python -m nvforest.benchmark.analyze data/final_results.csv + +# Summary only for GPU results +python -m nvforest.benchmark.analyze data/final_results.csv --device gpu --summary-only +``` + +## Parameter Space + +### Full Benchmark + +| Parameter | Values | +|-----------|--------| +| `num_features` | 8, 32, 128, 512 | +| `max_depth` | 2, 4, 8, 16, 32 | +| `num_trees` | 16, 128, 1024 | +| `batch_size` | 1, 16, 128, 1024, 1,048,576, 16,777,216 | + +### Quick Test + +| Parameter | Values | +|-----------|--------| +| `num_features` | 32 | +| `max_depth` | 4 | +| `num_trees` | 16 | +| `batch_size` | 1024 | + +## Device Handling + +The `--device` parameter affects how both native frameworks and nvforest run inference: + +### nvforest +- **CPU**: Uses CPU inference backend +- **GPU**: Uses GPU inference backend with cupy arrays + +### XGBoost +- **CPU**: Standard CPU inference with DMatrix +- **GPU**: Models are trained with `device="cuda"` and use `inplace_predict` for GPU inference + +### 
LightGBM +- **CPU**: Standard CPU inference +- **GPU**: LightGBM GPU training and inference require the library to be built with GPU support: + ```bash + # Option 1: Build from source + cmake -DUSE_GPU=1 .. + + # Option 2: pip with GPU flag + pip install lightgbm --install-option=--gpu + ``` + When GPU is requested and LightGBM GPU support is available, models are trained with `device="gpu"` and inference uses the GPU-trained model. If GPU support is not available, training/inference falls back to CPU with a warning. + +### sklearn +- **CPU**: Standard CPU inference +- **GPU**: sklearn is CPU-only. For GPU benchmarks, native inference runs on CPU as a baseline. The speedup comparison reflects nvforest GPU vs sklearn CPU. + +> **Note**: When `device=gpu`, XGBoost models are trained on GPU which enables GPU-native inference. This provides a fair comparison between XGBoost GPU inference and nvforest GPU inference. + +## Output + +Results are saved as CSV files in the output directory: + +- `checkpoint_N.csv` - Periodic checkpoints during benchmark +- `final_results.csv` - Complete results + +**Columns:** + +| Column | Description | +|--------|-------------| +| `framework` | ML framework (sklearn, xgboost, lightgbm) | +| `model_type` | regressor or classifier | +| `device` | cpu or gpu | +| `num_features` | Number of input features | +| `max_depth` | Maximum tree depth | +| `num_trees` | Number of trees in ensemble | +| `batch_size` | Inference batch size | +| `native_time` | Native framework inference time (seconds) | +| `nvforest_time` | nvforest inference time (seconds) | +| `optimal_layout` | Layout selected by nvforest optimize() | +| `optimal_chunk_size` | Chunk size selected by nvforest optimize() | +| `speedup` | native_time / nvforest_time | + +## Dependencies + +Required: +- `click` +- `pandas` +- `numpy` + +Optional (for specific frameworks): +- `scikit-learn` - for sklearn benchmarks +- `xgboost` - for XGBoost benchmarks +- `lightgbm` - for LightGBM 
def plot_speedup_heatmaps(
    df: pd.DataFrame,
    title: str = "nvforest Speedups",
    output_filename: str = "speedup_heatmaps.png",
    speedup_column: str = "speedup",
) -> None:
    """
    Generate a grid of speedup heatmaps from benchmark results.

    One subplot is drawn per (max_depth, num_trees) pair; within each
    subplot, rows are feature counts and columns are batch sizes.

    Parameters
    ----------
    df : pd.DataFrame
        Benchmark results DataFrame with columns: max_depth, num_trees,
        num_features, batch_size, and the speedup column.
    title : str
        Title for the plot.
    output_filename : str
        Output file path for the plot.
    speedup_column : str
        Name of the column containing speedup values.

    Raises
    ------
    ImportError
        If matplotlib/seaborn are missing (imported lazily so the rest of
        the module works without them).
    """
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        raise ImportError(
            "matplotlib and seaborn are required for plotting. "
            "Install them with: pip install matplotlib seaborn"
        )

    # Get unique values for each parameter
    max_depth_values = sorted(df["max_depth"].unique())
    num_trees_values = sorted(df["num_trees"].unique())

    # squeeze=False guarantees a 2-D ndarray of Axes even for a 1x1 grid.
    # Previously a single-cell grid (the shape produced by --quick-test)
    # returned a bare Axes object and the reshape-based normalization
    # raised AttributeError.
    fig, axes = plt.subplots(
        nrows=len(max_depth_values),
        ncols=len(num_trees_values),
        figsize=(len(num_trees_values) * 4, len(max_depth_values) * 4),
        constrained_layout=True,
        squeeze=False,
    )

    # Shared color scale across all subplots so cells are comparable.
    min_speedup = df[speedup_column].min()
    max_speedup = df[speedup_column].max()

    for i, max_depth in enumerate(max_depth_values):
        for j, num_trees in enumerate(num_trees_values):
            ax = axes[i, j]

            # Filter data for the current max_depth and num_trees
            subset = df[
                (df["max_depth"] == max_depth) & (df["num_trees"] == num_trees)
            ]

            if subset.empty:
                ax.set_visible(False)
                continue

            # Pivot data for heatmap: features on rows, batch sizes on columns.
            heatmap_data = subset.pivot_table(
                index="num_features",
                columns="batch_size",
                values=speedup_column,
            )
            heatmap_data.columns = pd.Index(
                [f"{x:.0e}" for x in heatmap_data.columns], name="Batch Size"
            )

            # center=1.0 makes the diverging colormap pivot at "no speedup".
            sns.heatmap(
                heatmap_data,
                ax=ax,
                vmin=min_speedup,
                vmax=max_speedup,
                center=1.0,
                cmap="RdBu",
                annot=True,
                fmt=".1f",
                cbar=False,
            )

            # Label only the outer edges of the grid to reduce clutter.
            if i == 0:
                ax.set_title(f"Tree count: {num_trees}")
            if i == len(max_depth_values) - 1:
                ax.set_xlabel("Batch Size")
            else:
                ax.set_xlabel("")
            if j == 0:
                ax.set_ylabel(f"Maximum Depth: {max_depth}\nFeature Count")
            else:
                ax.set_ylabel("")

    fig.suptitle(title, fontsize=24)
    plt.savefig(output_filename, dpi=300)
    plt.close()
    print(f"Saved plot to {output_filename}")


def generate_summary_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate summary statistics from benchmark results.

    Parameters
    ----------
    df : pd.DataFrame
        Benchmark results DataFrame.

    Returns
    -------
    pd.DataFrame
        Summary statistics grouped by framework, model_type, and device,
        with multi-level columns (metric, statistic), rounded to 3 places.
    """
    summary = (
        df.groupby(["framework", "model_type", "device"])
        .agg(
            {
                "speedup": ["mean", "median", "min", "max", "std"],
                "native_time": ["mean", "sum"],
                "nvforest_time": ["mean", "sum"],
            }
        )
        .round(3)
    )

    return summary


def print_summary(df: pd.DataFrame) -> None:
    """Print a human-readable summary of benchmark results to stdout."""
    print("\n" + "=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)

    # Overall stats
    print(f"\nTotal benchmark runs: {len(df)}")
    print(f"Frameworks tested: {', '.join(df['framework'].unique())}")
    print(f"Model types: {', '.join(df['model_type'].unique())}")
    print(f"Devices: {', '.join(df['device'].unique())}")

    # Speedup stats, broken down per framework and device.
    print("\n" + "-" * 40)
    print("SPEEDUP STATISTICS (native / nvforest)")
    print("-" * 40)

    for framework in df["framework"].unique():
        print(f"\n{framework.upper()}:")
        fw_df = df[df["framework"] == framework]

        for device in fw_df["device"].unique():
            dev_df = fw_df[fw_df["device"] == device]
            # NaN speedups mark runs where one side failed; exclude them.
            valid_speedups = dev_df["speedup"].dropna()

            if len(valid_speedups) > 0:
                print(f"  {device}:")
                print(f"    Mean speedup:   {valid_speedups.mean():.2f}x")
                print(f"    Median speedup: {valid_speedups.median():.2f}x")
                print(f"    Min speedup:    {valid_speedups.min():.2f}x")
                print(f"    Max speedup:    {valid_speedups.max():.2f}x")

    # Best configurations (nlargest silently ignores NaN rows).
    print("\n" + "-" * 40)
    print("TOP 5 CONFIGURATIONS BY SPEEDUP")
    print("-" * 40)

    top_5 = df.nlargest(5, "speedup")[
        [
            "framework",
            "model_type",
            "device",
            "num_trees",
            "max_depth",
            "batch_size",
            "speedup",
        ]
    ]
    print(top_5.to_string(index=False))

    print("\n" + "=" * 60)
@click.command()
@click.argument("results_file", type=click.Path(exists=True))
@click.option(
    "--output",
    "-o",
    default=None,
    help="Output file for speedup heatmap plot.",
)
@click.option(
    "--framework",
    "-f",
    default=None,
    help="Filter results to specific framework.",
)
@click.option(
    "--device",
    "-d",
    default=None,
    type=click.Choice(["cpu", "gpu"]),
    help="Filter results to specific device.",
)
@click.option(
    "--plot-only",
    is_flag=True,
    help="Only generate plot, skip summary output.",
)
@click.option(
    "--summary-only",
    is_flag=True,
    help="Only print summary, skip plot generation.",
)
def analyze(
    results_file: str,
    output: Optional[str],
    framework: Optional[str],
    device: Optional[str],
    plot_only: bool,
    summary_only: bool,
):
    """Analyze benchmark results from RESULTS_FILE."""
    df = pd.read_csv(results_file)

    # Narrow the data set to the requested framework/device, if any.
    for column, wanted in (("framework", framework), ("device", device)):
        if wanted:
            df = df[df[column] == wanted]

    if df.empty:
        raise click.ClickException("No data matches the specified filters.")

    # Text summary unless the caller asked for the plot alone.
    if not plot_only:
        print_summary(df)

    if summary_only:
        return

    # Derive a default plot path next to the input file.
    target = output or f"{os.path.splitext(results_file)[0]}_speedup.png"

    title_parts = ["nvforest Speedup vs Native Inference"]
    if framework:
        title_parts.append(f"({framework})")
    if device:
        title_parts.append(f"[{device.upper()}]")

    try:
        plot_speedup_heatmaps(
            df, title=" ".join(title_parts), output_filename=target
        )
    except ImportError as e:
        # Plotting deps are optional; only fail hard when plotting was
        # the sole requested action.
        if plot_only:
            raise click.ClickException(str(e))
        print(f"\nNote: {e}")


if __name__ == "__main__":
    analyze()
+# SPDX-License-Identifier: Apache-2.0 +# + +""" +Comprehensive benchmark comparing nvforest against native ML framework inference. + +Supports sklearn, XGBoost, and LightGBM models with both regressor and classifier +variants on CPU and GPU devices. + +Usage: + python -m nvforest.benchmark.benchmark run + python -m nvforest.benchmark.benchmark run --framework sklearn --framework xgboost + python -m nvforest.benchmark.benchmark run --dry-run + python -m nvforest.benchmark.benchmark run --quick-test +""" + +import gc +import logging +import os +import sys +from dataclasses import dataclass +from datetime import datetime +from time import perf_counter +from typing import Any, Callable, Optional + +import click +import numpy as np +import pandas as pd +import treelite + +import nvforest + +# Conditional imports for ML frameworks +try: + from sklearn.datasets import make_classification, make_regression + from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + + SKLEARN_AVAILABLE = True +except ImportError: + SKLEARN_AVAILABLE = False + +try: + import xgboost as xgb + + XGBOOST_AVAILABLE = True +except ImportError: + XGBOOST_AVAILABLE = False + +try: + import lightgbm as lgb + + LIGHTGBM_AVAILABLE = True +except ImportError: + LIGHTGBM_AVAILABLE = False + + +# Constants +DEFAULT_WARMUP_CYCLES = 3 +DEFAULT_BENCHMARK_CYCLES = 5 +MAX_BATCH_SIZE = 16_777_216 +TRAIN_SAMPLES = 10_000 +RANDOM_STATE = 0 + +DATA_DIR = os.path.join(os.path.dirname(__file__), "data") + +# Parameter spaces +FULL_VALUES = { + "num_features": [8, 32, 128, 512], + "max_depth": [2, 4, 8, 16, 32], + "num_trees": [16, 128, 1024], + "batch_size": [1, 16, 128, 1024, 1_048_576, MAX_BATCH_SIZE], +} + +QUICK_TEST_VALUES = { + "num_features": [32], + "max_depth": [4], + "num_trees": [16], + "batch_size": [1024], +} + + +def get_logger(): + """Configure and return the benchmark logger.""" + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) + 
# Optional framework detection. Each library is imported behind a guard so
# the benchmark degrades gracefully when one is not installed; the flags
# gate construction of the FRAMEWORKS registry below.
try:
    from sklearn.datasets import make_classification, make_regression
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False

try:
    import xgboost as xgb

    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    import lightgbm as lgb

    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False


def get_logger() -> logging.Logger:
    """Return the module benchmark logger, configured exactly once.

    The logger emits INFO records to stderr and does not propagate to the
    root logger. A handler guard makes the function idempotent: previously
    every call attached a fresh StreamHandler (duplicating each log line),
    and ``logging.basicConfig`` mutated the *root* logger globally even
    though ``propagate = False`` meant that configuration was never used.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # idempotent: no duplicate handlers on re-call
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)
    logger.propagate = False
    return logger


LOGGER = get_logger()


@dataclass
class FrameworkConfig:
    """Configuration describing how to benchmark one ML framework."""

    # Framework identifier ("sklearn", "xgboost", "lightgbm").
    name: str
    # Estimator classes; None only when the library is unavailable.
    regressor_class: Optional[type]
    classifier_class: Optional[type]
    # (model, path) -> None: persist a trained model to disk.
    save_model: Callable[[Any, str], None]
    # (path, device) -> model: reload the persisted model for inference.
    load_native: Callable[[str, str], Any]
    # (model, X, device) -> predictions.
    predict_native: Callable[[Any, np.ndarray, str], np.ndarray]
    # File extension used when saving the model.
    model_extension: str
    # Whether the framework can run inference natively on GPU.
    supports_gpu_native: bool = False


def _save_sklearn_model(model: Any, path: str) -> None:
    """Save sklearn model as a treelite checkpoint."""
    treelite.sklearn.import_model(model).serialize(path)


def _load_sklearn_model(path: str, device: str = "cpu") -> Any:
    """Load sklearn model - returns None as we use the trained model directly.

    Note: sklearn doesn't support GPU, device parameter is ignored.
    """
    return None


def _predict_sklearn(
    model: Any, X: np.ndarray, device: str = "cpu"
) -> np.ndarray:
    """Run sklearn prediction (CPU only).

    If X is a cupy array it is copied back to host memory first, since
    sklearn estimators only accept numpy input.
    """
    if hasattr(X, "get"):  # cupy array -> host numpy copy
        X = X.get()
    return model.predict(X)


def _save_xgboost_model(model: Any, path: str) -> None:
    """Save XGBoost model (format inferred by XGBoost from the extension)."""
    booster = model.get_booster() if hasattr(model, "get_booster") else model
    booster.save_model(path)


def _load_xgboost_model(path: str, device: str = "cpu") -> Any:
    """Load XGBoost booster from file, moving it to CUDA when requested."""
    booster = xgb.Booster()
    booster.load_model(path)
    if device == "gpu":
        booster.set_param({"device": "cuda"})
    return booster


def _predict_xgboost(
    model: Any, X: np.ndarray, device: str = "cpu"
) -> np.ndarray:
    """Run XGBoost prediction on the specified device."""
    if isinstance(model, xgb.Booster):
        # For GPU, use inplace_predict which is faster and handles GPU data
        if device == "gpu":
            return model.inplace_predict(X)
        dmatrix = xgb.DMatrix(X)
        return model.predict(dmatrix)
    # Scikit-learn API wrapper
    return model.predict(X)


def _save_lightgbm_model(model: Any, path: str) -> None:
    """Save LightGBM model in text format."""
    booster = model.booster_ if hasattr(model, "booster_") else model
    booster.save_model(path)


def _load_lightgbm_model(path: str, device: str = "cpu") -> Any:
    """Load LightGBM booster from file.

    Note: LightGBM GPU inference requires the library to be built with GPU
    support; the device parameter has no effect at load time.
    """
    return lgb.Booster(model_file=path)


def _check_lightgbm_gpu_available() -> bool:
    """Best-effort check whether LightGBM was built with GPU support.

    NOTE(review): constructing a Dataset with GPU params does not actually
    validate GPU support, so this effectively always returns True and the
    real failure is caught at predict time — confirm whether this helper
    should be wired in or removed.
    """
    try:
        params = {"device": "gpu", "gpu_platform_id": 0, "gpu_device_id": 0}
        _ = lgb.Dataset([], params=params)
        return True  # Assume available; fails gracefully during training/predict
    except Exception:
        return False


# Cache for LightGBM GPU availability (set on first predict failure).
_LIGHTGBM_GPU_CHECKED = False
_LIGHTGBM_GPU_AVAILABLE = False


def _predict_lightgbm(
    model: Any, X: np.ndarray, device: str = "cpu"
) -> np.ndarray:
    """Run LightGBM prediction on the specified device.

    LightGBM GPU inference requires the library to be built with GPU
    support. When GPU prediction fails, a warning is logged once and the
    call falls back to CPU inference.
    """
    global _LIGHTGBM_GPU_CHECKED, _LIGHTGBM_GPU_AVAILABLE

    # LightGBM requires numpy arrays (doesn't support cupy directly)
    if hasattr(X, "get"):  # cupy array
        X = X.get()

    if device == "gpu":
        try:
            # Works if the model was trained with GPU and the library was
            # built with GPU support.
            return model.predict(X, num_threads=1)
        except lgb.basic.LightGBMError as e:
            if not _LIGHTGBM_GPU_CHECKED:
                _LIGHTGBM_GPU_CHECKED = True
                _LIGHTGBM_GPU_AVAILABLE = False
                LOGGER.warning(
                    f"LightGBM GPU inference failed: {e}. "
                    "Ensure LightGBM is built with GPU support. "
                    "Falling back to CPU inference."
                )
            return model.predict(X)

    return model.predict(X)


# Registry of benchmarkable frameworks, keyed by name. Only frameworks
# whose import succeeded above are registered.
FRAMEWORKS: dict[str, FrameworkConfig] = {}

if SKLEARN_AVAILABLE:
    FRAMEWORKS["sklearn"] = FrameworkConfig(
        name="sklearn",
        regressor_class=RandomForestRegressor,
        classifier_class=RandomForestClassifier,
        save_model=_save_sklearn_model,
        load_native=_load_sklearn_model,
        predict_native=_predict_sklearn,
        model_extension=".tl",
        supports_gpu_native=False,
    )

if XGBOOST_AVAILABLE:
    FRAMEWORKS["xgboost"] = FrameworkConfig(
        name="xgboost",
        regressor_class=xgb.XGBRegressor,
        classifier_class=xgb.XGBClassifier,
        save_model=_save_xgboost_model,
        load_native=_load_xgboost_model,
        predict_native=_predict_xgboost,
        model_extension=".ubj",
        supports_gpu_native=True,
    )

if LIGHTGBM_AVAILABLE:
    FRAMEWORKS["lightgbm"] = FrameworkConfig(
        name="lightgbm",
        regressor_class=lgb.LGBMRegressor,
        classifier_class=lgb.LGBMClassifier,
        save_model=_save_lightgbm_model,
        load_native=_load_lightgbm_model,
        predict_native=_predict_lightgbm,
        model_extension=".txt",
        supports_gpu_native=True,
    )
# Timing and data-generation defaults (duplicated locally so this section
# is self-contained; values match the module-level constants).
DEFAULT_WARMUP_CYCLES = 3
DEFAULT_BENCHMARK_CYCLES = 5
RANDOM_STATE = 0

# Same underlying logger instance as the one configured by get_logger().
LOGGER = logging.getLogger(__name__)


def run_inference_benchmark(
    predict_fn: Callable,
    X: np.ndarray,
    batch_size: int,
    warmup_cycles: int = DEFAULT_WARMUP_CYCLES,
    benchmark_cycles: int = DEFAULT_BENCHMARK_CYCLES,
) -> float:
    """
    Run inference benchmark and return minimum elapsed time.

    Parameters
    ----------
    predict_fn : Callable
        Function that takes a batch and returns predictions.
    X : np.ndarray
        Full dataset to sample batches from.
    batch_size : int
        Size of each batch. If larger than ``X.shape[0]`` the whole
        dataset is used as a single (shorter) batch.
    warmup_cycles : int
        Number of warmup iterations (untimed).
    benchmark_cycles : int
        Number of timed iterations.

    Returns
    -------
    float
        Minimum elapsed time across benchmark cycles.
    """
    # Clamp to at least one batch: a batch_size larger than the dataset
    # previously made this 0 and crashed with ZeroDivisionError on the
    # modulo below; now it degrades to timing one full-dataset batch.
    available_batch_count = max(X.shape[0] // batch_size, 1)

    # Warmup (rotating through batches so caches are exercised realistically)
    for cycle in range(warmup_cycles):
        batch_index = cycle % available_batch_count
        batch = X[batch_index * batch_size : (batch_index + 1) * batch_size]
        predict_fn(batch)

    # Benchmark: keep the best (minimum) time to reduce scheduling noise.
    # NOTE(review): if predict_fn launches asynchronous GPU work (e.g. via
    # cupy), wall-clock timing here may under-measure unless the callee
    # synchronizes — confirm for GPU backends.
    elapsed = float("inf")
    for cycle in range(warmup_cycles, warmup_cycles + benchmark_cycles):
        batch_index = cycle % available_batch_count
        batch = X[batch_index * batch_size : (batch_index + 1) * batch_size]
        begin = perf_counter()
        predict_fn(batch)
        end = perf_counter()
        elapsed = min(elapsed, end - begin)

    # Log throughput in the most readable unit.
    throughput = batch_size / elapsed
    throughput_gb = throughput * X.shape[1] * X.dtype.itemsize / 1e9
    if throughput_gb >= 0.01:
        LOGGER.info(f"    Throughput: {throughput_gb:.2f} GB/s")
    elif throughput_gb * 1000 >= 0.01:
        LOGGER.info(f"    Throughput: {throughput_gb * 1000:.2f} MB/s")
    else:
        LOGGER.info(f"    Throughput: {throughput_gb * 1e6:.2f} KB/s")

    return elapsed


def write_checkpoint(
    results: dict, data_dir: str, final: bool = False
) -> None:
    """Write results to a checkpoint CSV in ``data_dir``.

    Non-final checkpoints rotate through ``checkpoint_0.csv`` ..
    ``checkpoint_5.csv`` (function-attribute counter) so disk usage stays
    bounded; ``final=True`` writes ``final_results.csv`` instead.
    """
    if not hasattr(write_checkpoint, "counter"):
        write_checkpoint.counter = 0

    MAX_CHECKPOINTS = 6
    filename = (
        "final_results.csv"
        if final
        else f"checkpoint_{write_checkpoint.counter}.csv"
    )

    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(data_dir, filename), index=False)

    write_checkpoint.counter = (write_checkpoint.counter + 1) % MAX_CHECKPOINTS


def train_model(
    framework: "FrameworkConfig",
    model_type: str,
    num_trees: int,
    max_depth: int,
    X_train: np.ndarray,
    y_train: np.ndarray,
    device: str = "cpu",
) -> Any:
    """Train a model with the given framework and parameters.

    Parameters
    ----------
    framework : FrameworkConfig
        Framework configuration.
    model_type : str
        'regressor' or 'classifier'.
    num_trees : int
        Number of trees/estimators.
    max_depth : int
        Maximum tree depth.
    X_train : np.ndarray
        Training features.
    y_train : np.ndarray
        Training labels.
    device : str
        Device to use for training ('cpu' or 'gpu'). Only affects XGBoost
        and LightGBM; sklearn always trains on CPU.

    Raises
    ------
    ValueError
        If ``framework.name`` is not one of the supported frameworks.
    """
    model_class = (
        framework.regressor_class
        if model_type == "regressor"
        else framework.classifier_class
    )

    if framework.name == "sklearn":
        model = model_class(
            n_estimators=num_trees, max_depth=max_depth, n_jobs=-1
        )
    elif framework.name == "xgboost":
        # Use GPU for training if requested - enables GPU inference
        xgb_device = "cuda" if device == "gpu" else "cpu"
        model = model_class(
            n_estimators=num_trees,
            max_depth=max_depth,
            tree_method="hist",
            device=xgb_device,
            n_jobs=-1,
        )
    elif framework.name == "lightgbm":
        # LightGBM GPU training requires the library built with GPU support.
        if device == "gpu":
            model = model_class(
                n_estimators=num_trees,
                max_depth=max_depth,
                device="gpu",
                gpu_platform_id=0,
                gpu_device_id=0,
                n_jobs=1,  # GPU mode uses single thread
                verbose=-1,
            )
        else:
            model = model_class(
                n_estimators=num_trees,
                max_depth=max_depth,
                n_jobs=-1,
                verbose=-1,
            )
    else:
        raise ValueError(f"Unknown framework: {framework.name}")

    model.fit(X_train, y_train)
    return model


def generate_data(
    num_features: int,
    num_samples: int,
    model_type: str,
    random_state: int = RANDOM_STATE,
) -> tuple[np.ndarray, np.ndarray]:
    """Generate synthetic data for benchmarking.

    Returns float32 features; labels are float32 for regression and int32
    for classification. Requires scikit-learn at call time.
    """
    if model_type == "regressor":
        X, y = make_regression(
            n_samples=num_samples,
            n_features=num_features,
            random_state=random_state,
        )
    else:
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            n_informative=min(num_features, 10),
            n_redundant=0,
            random_state=random_state,
        )

    return X.astype("float32"), y.astype(
        "float32" if model_type == "regressor" else "int32"
    )
def print_dry_run_info(
    frameworks: list[str],
    model_types: list[str],
    devices: list[str],
    param_values: dict,
) -> None:
    """Print the benchmark configuration and total run count (dry run)."""
    print("\nBenchmark Configuration:")
    print(f"  Frameworks: {', '.join(frameworks)}")
    print(f"  Model types: {', '.join(model_types)}")
    print(f"  Devices: {', '.join(devices)}")
    print("\nParameter space:")
    for key, values in param_values.items():
        print(f"  {key}: {values}")

    # Calculate total runs: full Cartesian product of all dimensions.
    total_params = (
        len(param_values["num_features"])
        * len(param_values["max_depth"])
        * len(param_values["num_trees"])
        * len(param_values["batch_size"])
    )
    total_runs = (
        len(frameworks) * len(model_types) * len(devices) * total_params
    )

    print(f"\nTotal benchmark configurations: {total_runs}")
    print(
        f"  = {len(frameworks)} frameworks x {len(model_types)} model_types x "
        f"{len(devices)} devices x {total_params} parameter combinations"
    )


def run_benchmark_suite(
    frameworks: list[str],
    model_types: list[str],
    devices: list[str],
    param_values: dict,
    data_dir: str,
) -> None:
    """
    Run the full benchmark suite.

    For every (model_type, num_features, max_depth, num_trees, framework,
    device, batch_size) combination: trains a model, benchmarks native
    inference and nvforest inference, and appends one row to the results.
    Checkpoints are written after each tree/depth combination and the full
    results are written at the end.

    Parameters
    ----------
    frameworks : list[str]
        List of framework names to benchmark.
    model_types : list[str]
        List of model types ('regressor', 'classifier').
    devices : list[str]
        List of devices ('cpu', 'gpu').
    param_values : dict
        Dictionary of parameter names to lists of values.
    data_dir : str
        Directory to save results.
    """
    os.makedirs(data_dir, exist_ok=True)

    results = {
        "framework": [],
        "model_type": [],
        "device": [],
        "num_features": [],
        "max_depth": [],
        "num_trees": [],
        "batch_size": [],
        "native_time": [],
        "nvforest_time": [],
        "optimal_layout": [],
        "optimal_chunk_size": [],
        "speedup": [],
    }

    for model_type in model_types:
        for num_features in param_values["num_features"]:
            # Generate data once per feature count
            X, y = generate_data(num_features, MAX_BATCH_SIZE, model_type)
            X_train, y_train = X[:TRAIN_SAMPLES], y[:TRAIN_SAMPLES]

            for max_depth in param_values["max_depth"]:
                for num_trees in param_values["num_trees"]:
                    for framework_name in frameworks:
                        framework = FRAMEWORKS[framework_name]

                        for device in devices:
                            # For GPU-capable frameworks, training on GPU
                            # enables GPU-native inference.
                            train_device = (
                                device
                                if framework.supports_gpu_native
                                else "cpu"
                            )

                            cur_time = datetime.now().strftime("%H:%M:%S")
                            LOGGER.info(
                                f"{cur_time}: Training {framework_name} {model_type} "
                                f"({num_trees} trees, depth {max_depth}, {num_features} features) "
                                f"[train_device={train_device}]"
                            )

                            try:
                                model = train_model(
                                    framework,
                                    model_type,
                                    num_trees,
                                    max_depth,
                                    X_train,
                                    y_train,
                                    device=train_device,
                                )
                            except Exception as e:
                                LOGGER.warning(
                                    f"Failed to train {framework_name} model: {e}"
                                )
                                continue

                            # Save model so nvforest can load the same artifact.
                            model_path = os.path.join(
                                data_dir,
                                f"model_{framework_name}_{device}{framework.model_extension}",
                            )
                            framework.save_model(model, model_path)

                            # Load native model with device support
                            if framework.name == "sklearn":
                                native_model = model  # sklearn uses trained model directly
                            else:
                                native_model = framework.load_native(
                                    model_path, device
                                )

                            for batch_size in param_values["batch_size"]:
                                # Skip large batch + large feature combinations
                                if (
                                    batch_size == MAX_BATCH_SIZE
                                    and num_features == 512
                                ):
                                    continue

                                LOGGER.info(
                                    f"  {framework_name}/{model_type}/{device} "
                                    f"batch_size={batch_size}"
                                )

                                # Prepare data for device.
                                # NOTE(review): cp.asarray(X) is re-run per
                                # batch size; hoisting it above this loop
                                # would avoid repeated host->device copies.
                                if device == "gpu":
                                    try:
                                        import cupy as cp

                                        X_device = cp.asarray(X)
                                    except ImportError:
                                        LOGGER.warning(
                                            "cupy not available, skipping GPU benchmark"
                                        )
                                        continue
                                else:
                                    X_device = X

                                # Native benchmark: GPU-capable frameworks get
                                # device data, others always get host data.
                                native_data = (
                                    X_device
                                    if framework.supports_gpu_native
                                    else X
                                )
                                LOGGER.info(
                                    f"    Running native inference (device={device}, gpu_native={framework.supports_gpu_native})..."
                                )
                                try:
                                    native_time = run_inference_benchmark(
                                        lambda batch,
                                        m=native_model,
                                        d=device: framework.predict_native(
                                            m, batch, d
                                        ),
                                        native_data,
                                        batch_size,
                                    )
                                except Exception as e:
                                    LOGGER.warning(
                                        f"Native inference failed: {e}"
                                    )
                                    native_time = float("nan")

                                # nvforest benchmark
                                LOGGER.info(
                                    "    Running nvforest inference..."
                                )
                                # Pre-bind so the cleanup below never hits an
                                # unbound name when load_model/optimize raise
                                # (previously `del nvforest_model` crashed the
                                # whole suite with NameError in that case).
                                nvforest_model = None
                                try:
                                    nvforest_model = nvforest.load_model(
                                        model_path,
                                        device=device,
                                        precision="single",
                                    )

                                    # Let nvforest pick layout/chunk size for
                                    # this batch shape.
                                    batch = (
                                        X_device[:batch_size]
                                        if device == "gpu"
                                        else X[:batch_size]
                                    )
                                    nvforest_model = nvforest_model.optimize(
                                        data=batch
                                    )
                                    optimal_layout = nvforest_model.layout
                                    optimal_chunk_size = (
                                        nvforest_model.default_chunk_size
                                    )

                                    nvforest_time = run_inference_benchmark(
                                        lambda batch,
                                        m=nvforest_model: m.predict(batch),
                                        X_device if device == "gpu" else X,
                                        batch_size,
                                    )
                                except Exception as e:
                                    LOGGER.warning(
                                        f"nvforest inference failed: {e}"
                                    )
                                    nvforest_time = float("nan")
                                    optimal_layout = None
                                    optimal_chunk_size = None

                                # Record results
                                results["framework"].append(framework_name)
                                results["model_type"].append(model_type)
                                results["device"].append(device)
                                results["num_features"].append(num_features)
                                results["max_depth"].append(max_depth)
                                results["num_trees"].append(num_trees)
                                results["batch_size"].append(batch_size)
                                results["native_time"].append(native_time)
                                results["nvforest_time"].append(nvforest_time)
                                results["optimal_layout"].append(
                                    optimal_layout
                                )
                                results["optimal_chunk_size"].append(
                                    optimal_chunk_size
                                )
                                results["speedup"].append(
                                    native_time / nvforest_time
                                    if nvforest_time
                                    and not np.isnan(nvforest_time)
                                    else float("nan")
                                )

                                # Clean up GPU memory (guard: model may be
                                # None if nvforest loading failed above).
                                if (
                                    device == "gpu"
                                    and nvforest_model is not None
                                ):
                                    del nvforest_model
                                    gc.collect()

                            # Clean up native model after batch_size loop
                            if native_model is not model:
                                del native_model

                            # Clean up trained model after device
                            del model
                            gc.collect()

                    # Write checkpoint after each tree/depth combination
                    write_checkpoint(results, data_dir)

            # Clean up data
            del X, y, X_train, y_train
            gc.collect()

    # Write final results
    write_checkpoint(results, data_dir, final=True)
    LOGGER.info(
        f"Benchmark complete. Results saved to {data_dir}/final_results.csv"
    )
@click.group()
def cli():
    """Benchmark suite for nvforest."""
    pass


@cli.command()
@click.option(
    "--framework",
    "-f",
    "frameworks",
    multiple=True,
    type=click.Choice(["sklearn", "xgboost", "lightgbm"]),
    help="Framework(s) to benchmark. Can be specified multiple times. Default: all available.",
)
@click.option(
    "--dry-run",
    "-n",
    is_flag=True,
    help="Print benchmark configuration without running.",
)
@click.option(
    "--quick-test",
    "-q",
    is_flag=True,
    help="Run with minimal parameters to verify setup.",
)
@click.option(
    "--device",
    "-d",
    "devices",
    multiple=True,
    type=click.Choice(["cpu", "gpu", "both"]),
    default=["both"],
    help="Device(s) to benchmark on. Default: both.",
)
@click.option(
    "--model-type",
    "-m",
    "model_types",
    multiple=True,
    type=click.Choice(["regressor", "classifier", "both"]),
    default=["both"],
    help="Model type(s) to benchmark. Default: both.",
)
@click.option(
    "--output-dir",
    "-o",
    default=None,
    help="Output directory for results. Default: benchmark/data/",
)
def run(
    frameworks: tuple[str, ...],
    dry_run: bool,
    quick_test: bool,
    devices: tuple[str, ...],
    model_types: tuple[str, ...],
    output_dir: Optional[str],
):
    """Run the benchmark suite."""
    # Resolve frameworks
    if not frameworks:
        # Robustness fix: previously an empty FRAMEWORKS registry (no ML
        # libraries installed) silently produced a zero-work benchmark run.
        if not FRAMEWORKS:
            raise click.ClickException(
                "No ML frameworks available. Install scikit-learn, "
                "xgboost, or lightgbm."
            )
        frameworks = tuple(FRAMEWORKS.keys())
    else:
        # Validate frameworks are available
        unavailable = [f for f in frameworks if f not in FRAMEWORKS]
        if unavailable:
            raise click.ClickException(
                f"Framework(s) not available: {', '.join(unavailable)}. "
                f"Install the required packages."
            )

    # Resolve devices: expand 'both' and de-duplicate preserving order.
    device_list = []
    for d in devices:
        if d == "both":
            device_list.extend(["cpu", "gpu"])
        else:
            device_list.append(d)
    device_list = list(dict.fromkeys(device_list))

    # Resolve model types the same way.
    model_type_list = []
    for m in model_types:
        if m == "both":
            model_type_list.extend(["regressor", "classifier"])
        else:
            model_type_list.append(m)
    model_type_list = list(dict.fromkeys(model_type_list))

    # Select parameter space
    param_values = QUICK_TEST_VALUES if quick_test else FULL_VALUES

    # Resolve output directory
    if output_dir is None:
        output_dir = DATA_DIR

    if dry_run:
        print_dry_run_info(
            list(frameworks), model_type_list, device_list, param_values
        )
        return

    LOGGER.info(f"Available frameworks: {', '.join(FRAMEWORKS.keys())}")
    LOGGER.info(f"Selected frameworks: {', '.join(frameworks)}")
    LOGGER.info(f"Devices: {', '.join(device_list)}")
    LOGGER.info(f"Model types: {', '.join(model_type_list)}")
    LOGGER.info(f"Quick test: {quick_test}")
    LOGGER.info(f"Output directory: {output_dir}")

    run_benchmark_suite(
        frameworks=list(frameworks),
        model_types=model_type_list,
        devices=device_list,
        param_values=param_values,
        data_dir=output_dir,
    )


if __name__ == "__main__":
    cli()
b/python/nvforest/nvforest/benchmark/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/python/nvforest/tests/test_benchmark.py b/python/nvforest/tests/test_benchmark.py new file mode 100644 index 0000000..8678d4e --- /dev/null +++ b/python/nvforest/tests/test_benchmark.py @@ -0,0 +1,713 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +import os +import tempfile + +import numpy as np +import pytest +from click.testing import CliRunner + +# Import benchmark components +from nvforest.benchmark.benchmark import ( + FRAMEWORKS, + FULL_VALUES, + QUICK_TEST_VALUES, + FrameworkConfig, + cli, + generate_data, + print_dry_run_info, + run_inference_benchmark, + train_model, + write_checkpoint, +) + + +class TestFrameworkConfig: + """Tests for FrameworkConfig dataclass.""" + + @pytest.mark.unit + def test_sklearn_config_exists(self): + """Test that sklearn framework config is available.""" + assert "sklearn" in FRAMEWORKS + config = FRAMEWORKS["sklearn"] + assert config.name == "sklearn" + assert config.model_extension == ".tl" + assert config.supports_gpu_native is False + + @pytest.mark.unit + def test_framework_config_has_required_fields(self): + """Test that all framework configs have required fields.""" + for name, config in FRAMEWORKS.items(): + assert isinstance(config, FrameworkConfig) + assert config.name == name + assert config.regressor_class is not None + assert config.classifier_class is not None + assert callable(config.save_model) + assert callable(config.load_native) + assert callable(config.predict_native) + assert isinstance(config.model_extension, str) + + +class TestParameterSpace: + """Tests for parameter space configurations.""" + + @pytest.mark.unit + def test_full_values_structure(self): + """Test that FULL_VALUES has expected keys.""" + expected_keys = { + "num_features", + "max_depth", + "num_trees", + "batch_size", + } + assert set(FULL_VALUES.keys()) == 
expected_keys + + @pytest.mark.unit + def test_quick_test_values_structure(self): + """Test that QUICK_TEST_VALUES has expected keys.""" + expected_keys = { + "num_features", + "max_depth", + "num_trees", + "batch_size", + } + assert set(QUICK_TEST_VALUES.keys()) == expected_keys + + @pytest.mark.unit + def test_quick_test_values_are_subset(self): + """Test that quick test values are subsets of full values.""" + for key in QUICK_TEST_VALUES: + for val in QUICK_TEST_VALUES[key]: + assert val in FULL_VALUES[key], ( + f"{val} not in FULL_VALUES[{key}]" + ) + + @pytest.mark.unit + def test_quick_test_is_minimal(self): + """Test that quick test has minimal parameter space.""" + for key in QUICK_TEST_VALUES: + assert len(QUICK_TEST_VALUES[key]) == 1 + + +class TestGenerateData: + """Tests for data generation.""" + + @pytest.mark.unit + def test_generate_regression_data(self): + """Test generating regression data.""" + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + assert X.shape == (100, 10) + assert y.shape == (100,) + assert X.dtype == np.float32 + assert y.dtype == np.float32 + + @pytest.mark.unit + def test_generate_classification_data(self): + """Test generating classification data.""" + X, y = generate_data( + num_features=10, num_samples=100, model_type="classifier" + ) + assert X.shape == (100, 10) + assert y.shape == (100,) + assert X.dtype == np.float32 + assert y.dtype == np.int32 + + @pytest.mark.unit + def test_generate_data_reproducible(self): + """Test that data generation is reproducible with same seed.""" + X1, y1 = generate_data( + num_features=10, + num_samples=100, + model_type="regressor", + random_state=42, + ) + X2, y2 = generate_data( + num_features=10, + num_samples=100, + model_type="regressor", + random_state=42, + ) + np.testing.assert_array_equal(X1, X2) + np.testing.assert_array_equal(y1, y2) + + +class TestTrainModel: + """Tests for model training.""" + + @pytest.mark.unit + def 
test_train_sklearn_regressor(self): + """Test training sklearn regressor.""" + if "sklearn" not in FRAMEWORKS: + pytest.skip("sklearn not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + model = train_model( + FRAMEWORKS["sklearn"], + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + ) + assert hasattr(model, "predict") + preds = model.predict(X[:10]) + assert preds.shape == (10,) + + @pytest.mark.unit + def test_train_sklearn_classifier(self): + """Test training sklearn classifier.""" + if "sklearn" not in FRAMEWORKS: + pytest.skip("sklearn not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="classifier" + ) + model = train_model( + FRAMEWORKS["sklearn"], + model_type="classifier", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + ) + assert hasattr(model, "predict") + preds = model.predict(X[:10]) + assert preds.shape == (10,) + + @pytest.mark.unit + def test_train_model_with_device_param(self): + """Test that train_model accepts device parameter.""" + if "sklearn" not in FRAMEWORKS: + pytest.skip("sklearn not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + # sklearn ignores device, but should accept the parameter + model = train_model( + FRAMEWORKS["sklearn"], + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + device="cpu", + ) + assert hasattr(model, "predict") + + +class TestRunInferenceBenchmark: + """Tests for inference benchmarking.""" + + @pytest.mark.unit + def test_benchmark_returns_float(self): + """Test that benchmark returns a float timing.""" + X = np.random.rand(100, 10).astype(np.float32) + + def dummy_predict(batch): + return batch.sum(axis=1) + + elapsed = run_inference_benchmark( + predict_fn=dummy_predict, + X=X, + batch_size=10, + warmup_cycles=1, + benchmark_cycles=2, + ) + assert isinstance(elapsed, float) + assert elapsed > 0 
+ + @pytest.mark.unit + def test_benchmark_with_small_batch(self): + """Test benchmark with small batch size.""" + X = np.random.rand(50, 5).astype(np.float32) + + def dummy_predict(batch): + return batch.mean(axis=1) + + elapsed = run_inference_benchmark( + predict_fn=dummy_predict, + X=X, + batch_size=5, + warmup_cycles=1, + benchmark_cycles=1, + ) + assert elapsed > 0 + + +class TestWriteCheckpoint: + """Tests for checkpoint writing.""" + + @pytest.mark.unit + def test_write_checkpoint_creates_file(self): + """Test that checkpoint writing creates a file.""" + with tempfile.TemporaryDirectory() as tmpdir: + results = { + "framework": ["sklearn"], + "device": ["cpu"], + "speedup": [1.5], + } + write_checkpoint(results, tmpdir, final=False) + + # Check that a checkpoint file was created + files = os.listdir(tmpdir) + assert any(f.startswith("checkpoint_") for f in files) + + @pytest.mark.unit + def test_write_final_results(self): + """Test writing final results file.""" + with tempfile.TemporaryDirectory() as tmpdir: + results = { + "framework": ["sklearn", "xgboost"], + "device": ["cpu", "gpu"], + "speedup": [1.5, 2.0], + } + write_checkpoint(results, tmpdir, final=True) + + assert "final_results.csv" in os.listdir(tmpdir) + + +class TestPrintDryRunInfo: + """Tests for dry run info printing.""" + + @pytest.mark.unit + def test_print_dry_run_info_no_error(self, capsys): + """Test that dry run info prints without error.""" + print_dry_run_info( + frameworks=["sklearn"], + model_types=["regressor"], + devices=["cpu"], + param_values=QUICK_TEST_VALUES, + ) + captured = capsys.readouterr() + assert "Benchmark Configuration" in captured.out + assert "sklearn" in captured.out + assert "regressor" in captured.out + assert "cpu" in captured.out + + +class TestCLI: + """Tests for CLI commands.""" + + @pytest.mark.unit + def test_cli_dry_run(self): + """Test CLI dry run option.""" + runner = CliRunner() + result = runner.invoke(cli, ["run", "--dry-run"]) + assert 
result.exit_code == 0 + assert "Benchmark Configuration" in result.output + + @pytest.mark.unit + def test_cli_dry_run_with_framework(self): + """Test CLI dry run with specific framework.""" + runner = CliRunner() + result = runner.invoke( + cli, ["run", "--dry-run", "--framework", "sklearn"] + ) + assert result.exit_code == 0 + assert "sklearn" in result.output + + @pytest.mark.unit + def test_cli_dry_run_quick_test(self): + """Test CLI dry run with quick test.""" + runner = CliRunner() + result = runner.invoke(cli, ["run", "--dry-run", "--quick-test"]) + assert result.exit_code == 0 + # Quick test has fewer configurations + assert "Total benchmark configurations: 1" in result.output + + @pytest.mark.unit + def test_cli_dry_run_single_device(self): + """Test CLI dry run with single device.""" + runner = CliRunner() + result = runner.invoke(cli, ["run", "--dry-run", "--device", "cpu"]) + assert result.exit_code == 0 + assert "cpu" in result.output + + @pytest.mark.unit + def test_cli_dry_run_single_model_type(self): + """Test CLI dry run with single model type.""" + runner = CliRunner() + result = runner.invoke( + cli, ["run", "--dry-run", "--model-type", "regressor"] + ) + assert result.exit_code == 0 + assert "regressor" in result.output + + @pytest.mark.unit + def test_cli_invalid_framework(self): + """Test CLI with invalid framework.""" + runner = CliRunner() + result = runner.invoke( + cli, ["run", "--dry-run", "--framework", "invalid"] + ) + assert result.exit_code != 0 + + @pytest.mark.unit + def test_cli_multiple_frameworks(self): + """Test CLI with multiple frameworks.""" + runner = CliRunner() + result = runner.invoke( + cli, ["run", "--dry-run", "-f", "sklearn", "-f", "xgboost"] + ) + # May fail if xgboost not installed, but should not error on parsing + if result.exit_code == 0: + assert "sklearn" in result.output + + +class TestDeviceHandling: + """Tests for device-aware inference.""" + + @pytest.mark.unit + def 
test_sklearn_predict_native_accepts_device(self): + """Test that sklearn predict_native accepts device parameter.""" + if "sklearn" not in FRAMEWORKS: + pytest.skip("sklearn not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + model = train_model( + FRAMEWORKS["sklearn"], + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + ) + + framework = FRAMEWORKS["sklearn"] + # Should work with device parameter + preds = framework.predict_native(model, X[:10], "cpu") + assert preds.shape == (10,) + + @pytest.mark.unit + def test_sklearn_load_native_accepts_device(self): + """Test that sklearn load_native accepts device parameter.""" + if "sklearn" not in FRAMEWORKS: + pytest.skip("sklearn not available") + + framework = FRAMEWORKS["sklearn"] + # sklearn load_native returns None (uses trained model directly) + result = framework.load_native("dummy_path", "cpu") + assert result is None + + @pytest.mark.unit + def test_xgboost_predict_native_accepts_device(self): + """Test that xgboost predict_native accepts device parameter.""" + if "xgboost" not in FRAMEWORKS: + pytest.skip("xgboost not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + model = train_model( + FRAMEWORKS["xgboost"], + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + device="cpu", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + framework = FRAMEWORKS["xgboost"] + model_path = os.path.join( + tmpdir, f"model{framework.model_extension}" + ) + framework.save_model(model, model_path) + + # Load and predict with device parameter + native_model = framework.load_native(model_path, "cpu") + preds = framework.predict_native(native_model, X[:10], "cpu") + assert preds.shape == (10,) + + @pytest.mark.unit + def test_lightgbm_predict_native_accepts_device(self): + """Test that lightgbm predict_native accepts device parameter.""" + if "lightgbm" not 
in FRAMEWORKS: + pytest.skip("lightgbm not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + model = train_model( + FRAMEWORKS["lightgbm"], + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + framework = FRAMEWORKS["lightgbm"] + model_path = os.path.join( + tmpdir, f"model{framework.model_extension}" + ) + framework.save_model(model, model_path) + + # Load and predict with device parameter + native_model = framework.load_native(model_path, "cpu") + preds = framework.predict_native(native_model, X[:10], "cpu") + assert preds.shape == (10,) + + @pytest.mark.unit + def test_lightgbm_predict_native_gpu_fallback(self): + """Test that lightgbm predict_native handles GPU request gracefully.""" + if "lightgbm" not in FRAMEWORKS: + pytest.skip("lightgbm not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + # Train on CPU (GPU training requires special build) + model = train_model( + FRAMEWORKS["lightgbm"], + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + device="cpu", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + framework = FRAMEWORKS["lightgbm"] + model_path = os.path.join( + tmpdir, f"model{framework.model_extension}" + ) + framework.save_model(model, model_path) + + # Load and predict - should work even with gpu device request + # (will fall back to CPU if GPU not available) + native_model = framework.load_native(model_path, "gpu") + preds = framework.predict_native(native_model, X[:10], "gpu") + assert preds.shape == (10,) + + @pytest.mark.unit + def test_framework_supports_gpu_native_flag(self): + """Test that supports_gpu_native flag is set correctly.""" + if "sklearn" in FRAMEWORKS: + assert FRAMEWORKS["sklearn"].supports_gpu_native is False + if "xgboost" in FRAMEWORKS: + assert FRAMEWORKS["xgboost"].supports_gpu_native is True + if 
"lightgbm" in FRAMEWORKS: + assert FRAMEWORKS["lightgbm"].supports_gpu_native is True + + @pytest.mark.unit + def test_lightgbm_train_with_gpu_device_param(self): + """Test that LightGBM train_model accepts GPU device parameter. + + Note: Actual GPU training requires LightGBM built with GPU support. + This test verifies the code path works (may fall back to CPU). + """ + if "lightgbm" not in FRAMEWORKS: + pytest.skip("lightgbm not available") + + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + + # This may raise an error if LightGBM GPU is not available, + # which is expected behavior - the training code should handle it + try: + model = train_model( + FRAMEWORKS["lightgbm"], + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + device="gpu", + ) + assert hasattr(model, "predict") + except Exception as e: + # Expected if LightGBM GPU support is not available + assert "GPU" in str(e).upper() or "gpu" in str(e).lower() + + +class TestEndToEnd: + """End-to-end tests for benchmark functionality.""" + + @pytest.mark.unit + def test_sklearn_model_save_load(self): + """Test saving and loading sklearn model.""" + if "sklearn" not in FRAMEWORKS: + pytest.skip("sklearn not available") + + import nvforest + + with tempfile.TemporaryDirectory() as tmpdir: + framework = FRAMEWORKS["sklearn"] + X, y = generate_data( + num_features=10, num_samples=100, model_type="regressor" + ) + + # Train model + model = train_model( + framework, + model_type="regressor", + num_trees=5, + max_depth=3, + X_train=X, + y_train=y, + ) + + # Save model + model_path = os.path.join( + tmpdir, f"model{framework.model_extension}" + ) + framework.save_model(model, model_path) + + assert os.path.exists(model_path) + + # Load with nvforest + nvforest_model = nvforest.load_model(model_path, device="cpu") + assert nvforest_model is not None + + # Verify prediction works + preds = nvforest_model.predict(X[:10]) + assert preds.shape[0] == 10 
+ + +class TestAnalyze: + """Tests for the analyze module.""" + + @pytest.mark.unit + def test_generate_summary_stats(self): + """Test generating summary statistics.""" + import pandas as pd + + from nvforest.benchmark.analyze import generate_summary_stats + + df = pd.DataFrame( + { + "framework": ["sklearn", "sklearn", "xgboost", "xgboost"], + "model_type": [ + "regressor", + "regressor", + "regressor", + "regressor", + ], + "device": ["cpu", "cpu", "cpu", "cpu"], + "speedup": [1.5, 2.0, 1.8, 2.2], + "native_time": [0.1, 0.2, 0.15, 0.25], + "nvforest_time": [0.05, 0.1, 0.08, 0.12], + } + ) + + summary = generate_summary_stats(df) + assert summary is not None + assert len(summary) == 2 # Two frameworks + + @pytest.mark.unit + def test_print_summary(self, capsys): + """Test printing summary.""" + import pandas as pd + + from nvforest.benchmark.analyze import print_summary + + df = pd.DataFrame( + { + "framework": ["sklearn", "sklearn"], + "model_type": ["regressor", "regressor"], + "device": ["cpu", "cpu"], + "num_trees": [10, 20], + "max_depth": [4, 4], + "batch_size": [1024, 1024], + "speedup": [1.5, 2.0], + "native_time": [0.1, 0.2], + "nvforest_time": [0.05, 0.1], + } + ) + + print_summary(df) + captured = capsys.readouterr() + assert "BENCHMARK SUMMARY" in captured.out + assert "sklearn" in captured.out.upper() + + @pytest.mark.unit + def test_analyze_cli_file_not_found(self): + """Test analyze CLI with non-existent file.""" + from nvforest.benchmark.analyze import analyze + + runner = CliRunner() + result = runner.invoke(analyze, ["nonexistent.csv"]) + assert result.exit_code != 0 + + @pytest.mark.unit + def test_analyze_cli_with_results_file(self): + """Test analyze CLI with valid results file.""" + import pandas as pd + + from nvforest.benchmark.analyze import analyze + + with tempfile.TemporaryDirectory() as tmpdir: + # Create a sample results file + df = pd.DataFrame( + { + "framework": ["sklearn", "sklearn"], + "model_type": ["regressor", "regressor"], + 
"device": ["cpu", "cpu"], + "num_features": [32, 32], + "max_depth": [4, 8], + "num_trees": [16, 16], + "batch_size": [1024, 1024], + "native_time": [0.1, 0.2], + "nvforest_time": [0.05, 0.1], + "optimal_layout": ["depth_first", "breadth_first"], + "optimal_chunk_size": [8, 16], + "speedup": [2.0, 2.0], + } + ) + results_path = os.path.join(tmpdir, "results.csv") + df.to_csv(results_path, index=False) + + runner = CliRunner() + result = runner.invoke(analyze, [results_path, "--summary-only"]) + assert result.exit_code == 0 + assert "BENCHMARK SUMMARY" in result.output + + @pytest.mark.unit + def test_analyze_cli_with_filter(self): + """Test analyze CLI with framework filter.""" + import pandas as pd + + from nvforest.benchmark.analyze import analyze + + with tempfile.TemporaryDirectory() as tmpdir: + df = pd.DataFrame( + { + "framework": ["sklearn", "xgboost"], + "model_type": ["regressor", "regressor"], + "device": ["cpu", "cpu"], + "num_features": [32, 32], + "max_depth": [4, 4], + "num_trees": [16, 16], + "batch_size": [1024, 1024], + "native_time": [0.1, 0.15], + "nvforest_time": [0.05, 0.08], + "optimal_layout": ["depth_first", "depth_first"], + "optimal_chunk_size": [8, 8], + "speedup": [2.0, 1.875], + } + ) + results_path = os.path.join(tmpdir, "results.csv") + df.to_csv(results_path, index=False) + + runner = CliRunner() + result = runner.invoke( + analyze, + [results_path, "--summary-only", "--framework", "sklearn"], + ) + assert result.exit_code == 0 + assert "Total benchmark runs: 1" in result.output