26 changes: 19 additions & 7 deletions src/protify/main.py
@@ -69,6 +69,7 @@ def parse_arguments():
     parser.add_argument("--scikit_cv", type=int, default=3, help="Number of cross-validation folds for scikit model.")
     parser.add_argument("--scikit_random_state", type=int, default=None, help="Random state for scikit model (if None, uses global seed).")
     parser.add_argument("--scikit_model_name", type=str, default=None, help="Name of the scikit model to use.")
+    parser.add_argument("--scikit_model_args", type=str, default=None, help="JSON string of hyperparameters to use (skips tuning). E.g. '{\"n_estimators\": 500, \"max_depth\": 7}'")
     parser.add_argument("--use_scikit", action="store_true", default=False, help="Use scikit model (default: False).")
     parser.add_argument("--n_jobs", type=int, default=1, help="Number of processes to use in scikit.") # TODO integrate with GUI and main

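A usage note on the new flag: the JSON value needs single quotes on the shell so its inner double quotes survive, and argparse passes it through as a plain string. A minimal sketch of the round-trip (the command in the comment is hypothetical; only the flag names come from this diff):

import json

# Hypothetical invocation:
#   python src/protify/main.py --use_scikit \
#       --scikit_model_name RandomForestClassifier \
#       --scikit_model_args '{"n_estimators": 500, "max_depth": 7}'
# The string is later decoded into the kwargs dict that gets splatted
# into the model constructor.
model_args = json.loads('{"n_estimators": 500, "max_depth": 7}')
assert model_args == {"n_estimators": 500, "max_depth": 7}
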
@@ -653,14 +654,25 @@ def run_scikit_scheme(self):
         for data_name, dataset in self.datasets.items():
             ### find best scikit model and parameters via cross validation and lazy predict
             X_train, y_train, X_valid, y_valid, X_test, y_test, label_type = self.prepare_scikit_dataset(model_name, dataset)
-            if label_type == 'singlelabel':
-                results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
-            elif label_type == 'regression':
-                results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
+
+            # If a specific model is specified, skip LazyPredict and go straight to that model
+            if self.scikit_args.model_name is not None:
+                print_message(f"Skipping LazyPredict, using specified model: {self.scikit_args.model_name}")
+                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, model_results=None)
             else:
-                raise ValueError(f'Label type {label_type} not supported')
-            ### train and evaluate best model
-            results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, results)
+                # Find best model via LazyPredict
+                if label_type == 'singlelabel':
+                    results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
+                elif label_type == 'regression':
+                    results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
+                else:
+                    raise ValueError(f'Label type {label_type} not supported')
+                # Train and evaluate best model with optimal hyperparameters
+                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, results)
+
+            # Log the results for plotting
+            metrics_dict = {'test_mcc': results.final_scores} if isinstance(results.final_scores, (int, float)) else results.final_scores
+            self.log_metrics(data_name, model_name, metrics_dict, split_name='test')

     @log_method_calls
     def generate_plots(self):
123 changes: 93 additions & 30 deletions src/protify/probes/lazy_predict.py
@@ -55,7 +55,29 @@
     "LinearSVC",
     "Perceptron",
     "MLPClassifier",
-    "SGDClassifier"
+    "SGDClassifier",
+    # O(n²) memory models - too slow for large datasets
+    "LabelPropagation",
+    "LabelSpreading",
+    "SVC",
+    "NuSVC",
+    # Sequential ensemble models - slow for large datasets
+    "AdaBoostClassifier",
+    "BaggingClassifier",
+    # O(n×m) prediction time - slow for large test sets
+    "KNeighborsClassifier",
+    # Unbounded tree depth - very slow on high-dim data
+    "DecisionTreeClassifier",
+    "ExtraTreeClassifier",
+    "ExtraTreesClassifier",
+    # Fails on negative values after StandardScaler
+    "CategoricalNB",
+    # O(d²) or O(d³) - slow on high-dimensional data (4608 features)
+    "LinearDiscriminantAnalysis",
+    "QuadraticDiscriminantAnalysis",
+    # Requires estimator argument
+    "FixedThresholdClassifier",
+    "TunedThresholdClassifierCV",
 ]

 removed_regressors = [
@@ -82,7 +104,16 @@
     "LassoLarsCV",
     "ElasticNetCV",
     "LinearSVR",
-    "LassoLarsIC"
+    "LassoLarsIC",
+    # Sequential ensemble models - slow for large datasets
+    "AdaBoostRegressor",
+    "BaggingRegressor",
+    # O(n×m) prediction time - slow for large test sets
+    "KNeighborsRegressor",
+    # Unbounded tree depth - very slow on high-dim data
+    "DecisionTreeRegressor",
+    "ExtraTreeRegressor",
+    "ExtraTreesRegressor",
 ]

 # Tuple of (name, class)
@@ -176,6 +207,16 @@
 CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier))
 # CLASSIFIERS.append(('CatBoostClassifier',catboost.CatBoostClassifier))

+# Update dicts with XGB and LGBM
+CLASSIFIER_DICT["XGBClassifier"] = xgboost.XGBClassifier
+CLASSIFIER_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
+REGRESSOR_DICT["XGBRegressor"] = xgboost.XGBRegressor
+REGRESSOR_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor
+ALL_MODEL_DICT["XGBClassifier"] = xgboost.XGBClassifier
+ALL_MODEL_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
+ALL_MODEL_DICT["XGBRegressor"] = xgboost.XGBRegressor
+ALL_MODEL_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor
+
 numeric_transformer = Pipeline(
     steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
 )
@@ -309,6 +350,13 @@ def fit(self, X_train, X_test, y_train, y_test):
                 ("categorical_high", categorical_transformer_high, categorical_high),
             ]
         )
+
+        # Precompute preprocessing once for all models (major optimization for large datasets)
+        print_message("Preprocessing data once for all models...")
+        preprocess_start = time.time()
+        X_train_transformed = preprocessor.fit_transform(X_train)
+        X_test_transformed = preprocessor.transform(X_test)
+        print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

         if self.classifiers == "all":
             self.classifiers = CLASSIFIERS
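A note on the hunk above: previously each candidate model carried its own Pipeline, so the ColumnTransformer was refit once per model; fitting it a single time and sharing the transformed arrays removes that repeated cost. A minimal sketch of the pattern with a StandardScaler standing in for the full preprocessor (data and models are illustrative, not the project's code):

import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier

rng = np.random.default_rng(0)
X_train, y_train = rng.random((1000, 50)), rng.integers(0, 2, 1000)
X_test = rng.random((200, 50))

scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train)  # fit statistics on train only
X_test_t = scaler.transform(X_test)        # reuse the same statistics

for model in (LogisticRegression(max_iter=1000), RidgeClassifier()):
    start = time.time()
    model.fit(X_train_t, y_train)          # every model shares the arrays
    model.predict(X_test_t)
    print(f"{type(model).__name__}: {time.time() - start:.2f}s")
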
@@ -328,23 +376,25 @@
         total_start = time.time()

         for name, model in tqdm(self.classifiers, desc="Training classifiers"):
+            print_message(f"Starting {name}...")
             start = time.time()
             try:
+                # Build model kwargs
+                model_kwargs = {}
                 if "random_state" in model().get_params().keys():
-                    pipe = Pipeline(
-                        steps=[
-                            ("preprocessor", preprocessor),
-                            ("classifier", model(random_state=self.random_state)),
-                        ]
-                    )
-                else:
-                    pipe = Pipeline(
-                        steps=[("preprocessor", preprocessor), ("classifier", model())]
-                    )
-
-                pipe.fit(X_train, y_train)
-                self.models[name] = pipe
-                y_pred = pipe.predict(X_test)
+                    model_kwargs["random_state"] = self.random_state
+                # Enable parallelization for models that support it
+                if "n_jobs" in model().get_params().keys():
+                    model_kwargs["n_jobs"] = -1
+                # Enable verbose for boosting models to show iteration progress
+                if name in ("XGBClassifier", "LGBMClassifier"):
+                    model_kwargs["verbose"] = 1
+
+                # Train directly on preprocessed data (no Pipeline needed)
+                clf = model(**model_kwargs)
+                clf.fit(X_train_transformed, y_train)
+                self.models[name] = clf
+                y_pred = clf.predict(X_test_transformed)
                 accuracy = accuracy_score(y_test, y_pred, normalize=True)
                 b_accuracy = balanced_accuracy_score(y_test, y_pred)
                 f1 = f1_score(y_test, y_pred, average="weighted")
@@ -362,6 +412,8 @@
                 ROC_AUC.append(roc_auc)
                 F1.append(f1)
                 TIME.append(fit_time)
+
+                print_message(f"  {name} completed in {fit_time:.1f}s | Acc: {accuracy:.3f} | F1: {f1:.3f}")

                 if self.custom_metric is not None:
                     custom_metric = self.custom_metric(y_test, y_pred)
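A note on the model_kwargs construction above: it leans on the scikit-learn convention that get_params() on a default-constructed estimator lists every constructor argument, so support for random_state and n_jobs can be probed without hardcoding per-model knowledge. A small sketch (the two estimators are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

for model_cls in (RandomForestClassifier, GaussianNB):
    params = model_cls().get_params()
    kwargs = {}
    if "random_state" in params:
        kwargs["random_state"] = 42  # reproducibility where supported
    if "n_jobs" in params:
        kwargs["n_jobs"] = -1        # all cores where supported
    print(model_cls.__name__, kwargs)
    # RandomForestClassifier picks up both; GaussianNB gets an empty dict.
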
@@ -548,6 +600,13 @@ def fit(self, X_train, X_test, y_train, y_test):
                 ("categorical_high", categorical_transformer_high, categorical_high),
             ]
         )
+
+        # Precompute preprocessing once for all models (major optimization for large datasets)
+        print_message("Preprocessing data once for all models...")
+        preprocess_start = time.time()
+        X_train_transformed = preprocessor.fit_transform(X_train)
+        X_test_transformed = preprocessor.transform(X_test)
+        print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

         if self.regressors == "all":
             self.regressors = REGRESSORS
@@ -567,23 +626,25 @@
         total_start = time.time()

         for name, model in tqdm(self.regressors, desc="Training regressors"):
+            print_message(f"Starting {name}...")
             start = time.time()
             try:
+                # Build model kwargs
+                model_kwargs = {}
                 if "random_state" in model().get_params().keys():
-                    pipe = Pipeline(
-                        steps=[
-                            ("preprocessor", preprocessor),
-                            ("regressor", model(random_state=self.random_state)),
-                        ]
-                    )
-                else:
-                    pipe = Pipeline(
-                        steps=[("preprocessor", preprocessor), ("regressor", model())]
-                    )
-
-                pipe.fit(X_train, y_train)
-                self.models[name] = pipe
-                y_pred = pipe.predict(X_test)
+                    model_kwargs["random_state"] = self.random_state
+                # Enable parallelization for models that support it
+                if "n_jobs" in model().get_params().keys():
+                    model_kwargs["n_jobs"] = -1
+                # Enable verbose for boosting models to show iteration progress
+                if name in ("XGBRegressor", "LGBMRegressor"):
+                    model_kwargs["verbose"] = 1
+
+                # Train directly on preprocessed data (no Pipeline needed)
+                reg = model(**model_kwargs)
+                reg.fit(X_train_transformed, y_train)
+                self.models[name] = reg
+                y_pred = reg.predict(X_test_transformed)

                 r_squared = r2_score(y_test, y_pred)
                 adj_rsquared = adjusted_rsquared(
@@ -597,6 +658,8 @@
                 ADJR2.append(adj_rsquared)
                 RMSE.append(rmse)
                 TIME.append(fit_time)
+
+                print_message(f"  {name} completed in {fit_time:.1f}s | R²: {r_squared:.3f} | RMSE: {rmse:.3f}")

                 if self.custom_metric:
                     custom_metric = self.custom_metric(y_test, y_pred)
63 changes: 52 additions & 11 deletions src/protify/probes/scikit_classes.py
@@ -34,18 +34,31 @@ def __init__(
         random_state: Optional[int] = None,
         # Specific model arguments (optional)
         model_name: Optional[str] = None,
+        scikit_model_name: Optional[str] = None,  # CLI arg name
+        scikit_model_args: Optional[str] = None,  # CLI arg - JSON string
         model_args: Optional[Dict[str, Any]] = None,
         production_model: bool = False,
         **kwargs,
     ):
+        import json
         # Tuning arguments
         self.n_iter = n_iter
         self.cv = cv
         self.random_state = random_state or get_global_seed()

-        # Specific model arguments
-        self.model_name = model_name
-        self.model_args = model_args if model_args is not None else {}
+        # Specific model arguments - scikit_model_name takes precedence (CLI arg)
+        self.model_name = scikit_model_name or model_name
+
+        # Parse scikit_model_args JSON string if provided (CLI), otherwise use model_args dict
+        if scikit_model_args is not None:
+            try:
+                self.model_args = json.loads(scikit_model_args)
+                print_message(f"Using pre-specified hyperparameters (skipping tuning): {self.model_args}")
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Failed to parse --scikit_model_args JSON: {e}")
+        else:
+            self.model_args = model_args if model_args is not None else {}
+
         self.production_model = production_model

@@ -93,19 +106,25 @@ def _tune_hyperparameters(
         """
         param_distributions = HYPERPARAMETER_DISTRIBUTIONS.get(model_name, {})
         if not param_distributions:
+            print_message(f"No hyperparameter distributions defined for {model_name}, using defaults")
             return model_class(), {}

+        print_message(f"Running RandomizedSearchCV with {self.args.n_iter} iterations, {self.args.cv}-fold CV...")
+        print_message(f"Hyperparameter search space: {list(param_distributions.keys())}")
+
         random_search = RandomizedSearchCV(
             model_class(),
             param_distributions=param_distributions,
             n_iter=self.args.n_iter,
             scoring=custom_scorer,
             cv=self.args.cv,
             random_state=self.args.random_state,
-            n_jobs=self.n_jobs
+            n_jobs=self.n_jobs,
+            verbose=2  # Show progress for each fit
         )

         random_search.fit(X_train, y_train)
+        print_message(f"Best CV score: {random_search.best_score_:.4f}")
         return random_search.best_estimator_, random_search.best_params_

     def find_best_regressor(
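The search above draws n_iter parameter settings from per-model distributions keyed by model name. HYPERPARAMETER_DISTRIBUTIONS itself is defined elsewhere and not shown in this diff; a hypothetical entry, in the mix of lists and scipy distributions that RandomizedSearchCV accepts, might look like:

from scipy.stats import randint, uniform

# Hypothetical shape of one entry; the real table lives outside this diff.
HYPERPARAMETER_DISTRIBUTIONS = {
    "RandomForestClassifier": {
        "n_estimators": randint(100, 1000),  # sampled each iteration
        "max_depth": [None, 5, 10, 20],      # lists are sampled uniformly
        "max_features": uniform(0.1, 0.8),   # continuous range [0.1, 0.9)
    },
}
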
@@ -140,7 +159,8 @@ def find_best_regressor(

         # Get best model name and class
         best_model_name = initial_scores.index[0]
-        best_model_class = regressor.models[best_model_name].named_steps['regressor'].__class__
+        # Models are now stored directly (not as Pipeline) after optimization
+        best_model_class = regressor.models[best_model_name].__class__
         print_message(f"Best model name: {best_model_name}")
         print_message(f"Best model class: {best_model_class}")
         print_message(f"Initial scores: \n{initial_scores}")
@@ -202,7 +222,8 @@ def find_best_classifier(

         # Get best model name and class
         best_model_name = initial_scores.index[0]
-        best_model_class = classifier.models[best_model_name].named_steps['classifier'].__class__
+        # Models are now stored directly (not as Pipeline) after optimization
+        best_model_class = classifier.models[best_model_name].__class__
         print_message(f"Best model name: {best_model_name}")
         print_message(f"Best model class: {best_model_class}")
         print_message(f"Initial scores: \n{initial_scores}")
@@ -307,16 +328,36 @@ def run_specific_model(
                 raise ValueError(f"Model {model_name} not supported")

             model_class = ALL_MODEL_DICT[model_name]
-            cls = model_class(**self.args.model_args)
-            cls.fit(X_train, y_train)
-            final_scores = scorer(cls, X_test, y_test)
+
+            # Skip tuning if model_args is already provided
+            if self.args.model_args:
+                print_message(f"Skipping hyperparameter tuning - using provided args: {self.args.model_args}")
+                best_model = model_class(**self.args.model_args)
+                best_params = self.args.model_args
+            else:
+                # Run hyperparameter tuning
+                print_message(f"Tuning hyperparameters for {model_name}")
+                best_model, best_params = self._tune_hyperparameters(
+                    model_class,
+                    model_name,
+                    X_train,
+                    y_train,
+                    scorer
+                )
+                print_message(f"Best parameters: {best_params}")
+
+            # Train final model with best parameters
+            print_message(f"Training final model with best parameters")
+            best_model.fit(X_train, y_train)
+            final_scores = scorer(best_model, X_test, y_test)
+            print_message(f"Final scores: {final_scores}")

             return ModelResults(
                 initial_scores=None,
                 best_model_name=model_name,
-                best_params=None,
+                best_params=best_params,
                 final_scores=final_scores,
-                best_model=cls
+                best_model=best_model
             )
         else:
             raise ValueError("Either model_name must be specified in args or model_results must be provided")