Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions python/nvforest/nvforest/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def apply(
"""
pass

@property
@abstractmethod
def num_features(self) -> int:
    """Number of input features (columns) the forest model expects per row."""
    pass

@property
@abstractmethod
def num_outputs(self) -> int:
Expand Down
16 changes: 16 additions & 0 deletions python/nvforest/nvforest/_forest_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,10 @@ def apply(
) -> DataType:
return self.forest.apply(X, chunk_size=chunk_size)

@property
def num_features(self) -> int:
    """Number of input features the wrapped forest expects per row."""
    underlying = self.forest
    return underlying.num_features

@property
def num_outputs(self) -> int:
return self.forest.num_outputs
Expand Down Expand Up @@ -488,6 +492,10 @@ def apply(
) -> DataType:
return self.forest.apply(X, chunk_size=chunk_size)

@property
def num_features(self) -> int:
    """Number of input features the wrapped forest expects per row."""
    underlying = self.forest
    return underlying.num_features

@property
def num_outputs(self) -> int:
return self.forest.num_outputs
Expand Down Expand Up @@ -603,6 +611,10 @@ def apply(
) -> DataType:
return self.forest.apply(X, chunk_size=chunk_size)

@property
def num_features(self) -> int:
    """Number of input features the wrapped forest expects per row."""
    underlying = self.forest
    return underlying.num_features

@property
def num_outputs(self) -> int:
return self.forest.num_outputs
Expand Down Expand Up @@ -706,6 +718,10 @@ def apply(
) -> DataType:
return self.forest.apply(X, chunk_size=chunk_size)

@property
def num_features(self) -> int:
    """Number of input features the wrapped forest expects per row."""
    underlying = self.forest
    return underlying.num_features

@property
def num_outputs(self) -> int:
return self.forest.num_outputs
Expand Down
12 changes: 12 additions & 0 deletions python/nvforest/nvforest/detail/forest_inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -355,12 +355,22 @@ class ForestInferenceImpl:
def elem_postprocessing(self) -> str:
return self.impl.elem_postprocessing()

def _validate_input_dims(self, X: DataType) -> None:
    """Reject inputs that are not 2-D or whose column count disagrees with the model.

    Raises:
        ValueError: if ``X`` is not 2-D, or its second dimension does not
            equal ``self.num_features`` (message names expected vs actual).
    """
    ndim = len(X.shape)
    if ndim != 2:
        raise ValueError("Expected a 2D array for X")
    expected = self.num_features
    actual = X.shape[1]
    if actual != expected:
        raise ValueError(
            f"Expected {expected} features in the input "
            f"but X has {actual} features"
        )

def predict(
self,
X: DataType,
*,
chunk_size: Optional[int] = None,
) -> DataType:
self._validate_input_dims(X)
# Returns probabilities if the model is a classifier
return self.impl.predict(
X, chunk_size=(chunk_size or self.default_chunk_size)
Expand All @@ -372,6 +382,7 @@ class ForestInferenceImpl:
*,
chunk_size: Optional[int] = None,
) -> DataType:
self._validate_input_dims(X)
chunk_size = (chunk_size or self.default_chunk_size)
return self.impl.predict(
X, predict_type="per_tree", chunk_size=chunk_size
Expand All @@ -383,6 +394,7 @@ class ForestInferenceImpl:
*,
chunk_size: Optional[int] = None,
) -> DataType:
self._validate_input_dims(X)
chunk_size = (chunk_size or self.default_chunk_size)
return self.impl.predict(
X, predict_type="leaf_id", chunk_size=chunk_size
Expand Down
26 changes: 26 additions & 0 deletions python/nvforest/tests/test_nvforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,3 +856,29 @@ def test_wide_data():
# Inference should run without crashing
fm = nvforest.load_from_sklearn(clf)
_ = fm.predict(X)


@pytest.mark.parametrize("input_size", [4, 6], ids=["too_narrow", "too_wide"])
@pytest.mark.parametrize(
    "predict_func",
    [
        nvforest.CPUForestInferenceClassifier.predict,
        nvforest.CPUForestInferenceClassifier.predict_per_tree,
        nvforest.CPUForestInferenceClassifier.apply,
    ],
    ids=["predict", "predict_per_tree", "apply"],
)
def test_incorrect_data_shape(input_size, predict_func):
    """Inference on X with the wrong number of columns must raise ValueError.

    The error message must name both the expected and the actual feature
    counts so the regex below pins the full expected-vs-actual detail,
    matching the exact wording raised by ``_validate_input_dims``.
    """
    n_rows = 50
    n_features = 5
    X = np.random.normal(size=(n_rows, n_features)).astype(np.float32)
    y = np.asarray([0, 1] * (n_rows // 2), dtype=np.int32)

    clf = RandomForestClassifier(max_features="sqrt", n_estimators=10)
    clf.fit(X, y)

    fm = nvforest.load_from_sklearn(clf, device="cpu")

    assert fm.num_features == n_features
    X_test = np.zeros((1, input_size))
    with pytest.raises(
        ValueError,
        match=rf"Expected {n_features} features in the input "
        rf"but X has {input_size} features",
    ):
        _ = predict_func(fm, X_test)
Comment on lines +882 to +884
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Assert the received feature count in the error message too.

Right now any message that starts with Expected 5 features passes. Matching the actual width (input_size) here would enforce the expected-vs-actual detail that users need when debugging shape errors.

Suggested assertion tightening
-    with pytest.raises(ValueError, match=f"Expected {n_features} features"):
+    with pytest.raises(
+        ValueError,
+        match=rf"Expected {n_features} features.*but X has {input_size}",
+    ):

As per coding guidelines "Error messages should be helpful and include expected vs actual values".

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
with pytest.raises(ValueError, match=f"Expected {n_features} features"):
X_test = np.zeros((1, input_size))
_ = predict_func(fm, X_test)
with pytest.raises(
ValueError,
match=rf"Expected {n_features} features.*but X has {input_size}",
):
X_test = np.zeros((1, input_size))
_ = predict_func(fm, X_test)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@python/nvforest/tests/test_nvforest.py` around lines 882 - 884, The test
currently only asserts the error starts with "Expected {n_features} features";
update the pytest.raises match to require the actual received feature count
(input_size) too so the exception message includes both expected and actual
counts. Locate the block using predict_func, fm, input_size and n_features and
change the match to assert something like "Expected {n_features} features, got
{input_size}" (or the project's chosen wording) so the raised ValueError
contains expected vs actual values.