diff --git a/src/protify/main.py b/src/protify/main.py
index 5a49503..1f142cc 100644
--- a/src/protify/main.py
+++ b/src/protify/main.py
@@ -69,6 +69,7 @@ def parse_arguments():
     parser.add_argument("--scikit_cv", type=int, default=3, help="Number of cross-validation folds for scikit model.")
     parser.add_argument("--scikit_random_state", type=int, default=None, help="Random state for scikit model (if None, uses global seed).")
     parser.add_argument("--scikit_model_name", type=str, default=None, help="Name of the scikit model to use.")
+    parser.add_argument("--scikit_model_args", type=str, default=None, help="JSON string of hyperparameters to use (skips tuning). E.g. '{\"n_estimators\": 500, \"max_depth\": 7}'")
     parser.add_argument("--use_scikit", action="store_true", default=False, help="Use scikit model (default: False).")
     parser.add_argument("--n_jobs", type=int, default=1, help="Number of processes to use in scikit.") # TODO integrate with GUI and main
@@ -653,14 +654,25 @@ def run_scikit_scheme(self):
         for data_name, dataset in self.datasets.items():
             ### find best scikit model and parameters via cross validation and lazy predict
             X_train, y_train, X_valid, y_valid, X_test, y_test, label_type = self.prepare_scikit_dataset(model_name, dataset)
-            if label_type == 'singlelabel':
-                results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
-            elif label_type == 'regression':
-                results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
+
+            # If a specific model is specified, skip LazyPredict and go straight to that model
+            if self.scikit_args.model_name is not None:
+                print_message(f"Skipping LazyPredict, using specified model: {self.scikit_args.model_name}")
+                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, model_results=None)
             else:
-                raise ValueError(f'Label type {label_type} not supported')
-            ### train and evaluate best model
-            results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, results)
+                # Find best model via LazyPredict
+                if label_type == 'singlelabel':
+                    results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
+                elif label_type == 'regression':
+                    results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
+                else:
+                    raise ValueError(f'Label type {label_type} not supported')
+                # Train and evaluate best model with optimal hyperparameters
+                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, results)
+
+            # Log the results for plotting
+            metrics_dict = {'test_mcc': results.final_scores} if isinstance(results.final_scores, (int, float)) else results.final_scores
+            self.log_metrics(data_name, model_name, metrics_dict, split_name='test')
 
     @log_method_calls
     def generate_plots(self):
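For reviewers: a minimal, self-contained sketch of how the new flag is intended to flow from the CLI into a hyperparameter dict. The argument names match this patch; the inline argv and the assert are illustrative only, not Protify's real entry point.

```python
import argparse
import json

# Hypothetical stand-in for parse_arguments(): only the two relevant flags.
parser = argparse.ArgumentParser()
parser.add_argument("--scikit_model_name", type=str, default=None)
parser.add_argument("--scikit_model_args", type=str, default=None)

# Equivalent to:
#   python main.py --scikit_model_name XGBClassifier \
#                  --scikit_model_args '{"n_estimators": 500, "max_depth": 7}'
args = parser.parse_args([
    "--scikit_model_name", "XGBClassifier",
    "--scikit_model_args", '{"n_estimators": 500, "max_depth": 7}',
])

# Downstream (scikit_classes.py) the JSON string becomes a kwargs dict.
model_args = json.loads(args.scikit_model_args)
assert model_args == {"n_estimators": 500, "max_depth": 7}
```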
"ExtraTreesClassifier", + # Fails on negative values after StandardScaler + "CategoricalNB", + # O(d²) or O(d³) - slow on high-dimensional data (4608 features) + "LinearDiscriminantAnalysis", + "QuadraticDiscriminantAnalysis", + # Requires estimator argument + "FixedThresholdClassifier", + "TunedThresholdClassifierCV", ] removed_regressors = [ @@ -82,7 +104,16 @@ "LassoLarsCV", "ElasticNetCV", "LinearSVR", - "LassoLarsIC" + "LassoLarsIC", + # Sequential ensemble models - slow for large datasets + "AdaBoostRegressor", + "BaggingRegressor", + # O(n×m) prediction time - slow for large test sets + "KNeighborsRegressor", + # Unbounded tree depth - very slow on high-dim data + "DecisionTreeRegressor", + "ExtraTreeRegressor", + "ExtraTreesRegressor", ] # Tuple of (name, class) @@ -176,6 +207,16 @@ CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier)) # CLASSIFIERS.append(('CatBoostClassifier',catboost.CatBoostClassifier)) +# Update dicts with XGB and LGBM +CLASSIFIER_DICT["XGBClassifier"] = xgboost.XGBClassifier +CLASSIFIER_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier +REGRESSOR_DICT["XGBRegressor"] = xgboost.XGBRegressor +REGRESSOR_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor +ALL_MODEL_DICT["XGBClassifier"] = xgboost.XGBClassifier +ALL_MODEL_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier +ALL_MODEL_DICT["XGBRegressor"] = xgboost.XGBRegressor +ALL_MODEL_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor + numeric_transformer = Pipeline( steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())] ) @@ -309,6 +350,13 @@ def fit(self, X_train, X_test, y_train, y_test): ("categorical_high", categorical_transformer_high, categorical_high), ] ) + + # Precompute preprocessing once for all models (major optimization for large datasets) + print_message("Preprocessing data once for all models...") + preprocess_start = time.time() + X_train_transformed = preprocessor.fit_transform(X_train) + X_test_transformed = preprocessor.transform(X_test) + print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s") if self.classifiers == "all": self.classifiers = CLASSIFIERS @@ -328,23 +376,25 @@ def fit(self, X_train, X_test, y_train, y_test): total_start = time.time() for name, model in tqdm(self.classifiers, desc="Training classifiers"): + print_message(f"Starting {name}...") start = time.time() try: + # Build model kwargs + model_kwargs = {} if "random_state" in model().get_params().keys(): - pipe = Pipeline( - steps=[ - ("preprocessor", preprocessor), - ("classifier", model(random_state=self.random_state)), - ] - ) - else: - pipe = Pipeline( - steps=[("preprocessor", preprocessor), ("classifier", model())] - ) - - pipe.fit(X_train, y_train) - self.models[name] = pipe - y_pred = pipe.predict(X_test) + model_kwargs["random_state"] = self.random_state + # Enable parallelization for models that support it + if "n_jobs" in model().get_params().keys(): + model_kwargs["n_jobs"] = -1 + # Enable verbose for boosting models to show iteration progress + if name in ("XGBClassifier", "LGBMClassifier"): + model_kwargs["verbose"] = 1 + + # Train directly on preprocessed data (no Pipeline needed) + clf = model(**model_kwargs) + clf.fit(X_train_transformed, y_train) + self.models[name] = clf + y_pred = clf.predict(X_test_transformed) accuracy = accuracy_score(y_test, y_pred, normalize=True) b_accuracy = balanced_accuracy_score(y_test, y_pred) f1 = f1_score(y_test, y_pred, average="weighted") @@ -362,6 +412,8 @@ def fit(self, X_train, X_test, y_train, y_test): 
diff --git a/src/protify/probes/scikit_classes.py b/src/protify/probes/scikit_classes.py
index 68dbfd2..9407dd3 100644
--- a/src/protify/probes/scikit_classes.py
+++ b/src/protify/probes/scikit_classes.py
@@ -34,18 +34,31 @@ def __init__(
         random_state: Optional[int] = None,
         # Specific model arguments (optional)
         model_name: Optional[str] = None,
+        scikit_model_name: Optional[str] = None,  # CLI arg name
+        scikit_model_args: Optional[str] = None,  # CLI arg - JSON string
         model_args: Optional[Dict[str, Any]] = None,
         production_model: bool = False,
         **kwargs,
     ):
+        import json
         # Tuning arguments
         self.n_iter = n_iter
         self.cv = cv
         self.random_state = random_state or get_global_seed()
-        # Specific model arguments
-        self.model_name = model_name
-        self.model_args = model_args if model_args is not None else {}
+        # Specific model arguments - scikit_model_name takes precedence (CLI arg)
+        self.model_name = scikit_model_name or model_name
+
+        # Parse scikit_model_args JSON string if provided (CLI), otherwise use model_args dict
+        if scikit_model_args is not None:
+            try:
+                self.model_args = json.loads(scikit_model_args)
+                print_message(f"Using pre-specified hyperparameters (skipping tuning): {self.model_args}")
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Failed to parse --scikit_model_args JSON: {e}")
+        else:
+            self.model_args = model_args if model_args is not None else {}
+
         self.production_model = production_model
@@ -93,8 +106,12 @@ def _tune_hyperparameters(
         """
         param_distributions = HYPERPARAMETER_DISTRIBUTIONS.get(model_name, {})
         if not param_distributions:
+            print_message(f"No hyperparameter distributions defined for {model_name}, using defaults")
             return model_class(), {}
 
+        print_message(f"Running RandomizedSearchCV with {self.args.n_iter} iterations, {self.args.cv}-fold CV...")
+        print_message(f"Hyperparameter search space: {list(param_distributions.keys())}")
+
         random_search = RandomizedSearchCV(
             model_class(),
             param_distributions=param_distributions,
@@ -102,10 +119,12 @@
             scoring=custom_scorer,
             cv=self.args.cv,
             random_state=self.args.random_state,
-            n_jobs=self.n_jobs
+            n_jobs=self.n_jobs,
+            verbose=2  # Show progress for each fit
         )
 
         random_search.fit(X_train, y_train)
+        print_message(f"Best CV score: {random_search.best_score_:.4f}")
         return random_search.best_estimator_, random_search.best_params_
 
     def find_best_regressor(
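For reference, a standalone version of the RandomizedSearchCV call configured above. The model and search space are toy stand-ins; in the patch the distributions come from HYPERPARAMETER_DISTRIBUTIONS and the knobs mirror `self.args`.

```python
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=300, n_features=20, random_state=0)

search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions={"n_estimators": randint(50, 300),
                         "max_depth": randint(3, 12)},
    n_iter=5,         # self.args.n_iter in the patch
    cv=3,             # self.args.cv
    random_state=0,   # self.args.random_state
    n_jobs=-1,
    verbose=2,        # one log line per fit, as enabled above
)
search.fit(X, y)
print(f"Best CV score: {search.best_score_:.4f}", search.best_params_)
```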
@@ -140,7 +159,8 @@
         # Get best model name and class
         best_model_name = initial_scores.index[0]
-        best_model_class = regressor.models[best_model_name].named_steps['regressor'].__class__
+        # Models are now stored directly (not as Pipeline) after optimization
+        best_model_class = regressor.models[best_model_name].__class__
         print_message(f"Best model name: {best_model_name}")
         print_message(f"Best model class: {best_model_class}")
         print_message(f"Initial scores: \n{initial_scores}")
@@ -202,7 +222,8 @@ def find_best_classifier(
         # Get best model name and class
         best_model_name = initial_scores.index[0]
-        best_model_class = classifier.models[best_model_name].named_steps['classifier'].__class__
+        # Models are now stored directly (not as Pipeline) after optimization
+        best_model_class = classifier.models[best_model_name].__class__
         print_message(f"Best model name: {best_model_name}")
         print_message(f"Best model class: {best_model_class}")
         print_message(f"Initial scores: \n{initial_scores}")
@@ -307,16 +328,36 @@ def run_specific_model(
             raise ValueError(f"Model {model_name} not supported")
 
         model_class = ALL_MODEL_DICT[model_name]
-        cls = model_class(**self.args.model_args)
-        cls.fit(X_train, y_train)
-        final_scores = scorer(cls, X_test, y_test)
+
+        # Skip tuning if model_args is already provided
+        if self.args.model_args:
+            print_message(f"Skipping hyperparameter tuning - using provided args: {self.args.model_args}")
+            best_model = model_class(**self.args.model_args)
+            best_params = self.args.model_args
+        else:
+            # Run hyperparameter tuning
+            print_message(f"Tuning hyperparameters for {model_name}")
+            best_model, best_params = self._tune_hyperparameters(
+                model_class,
+                model_name,
+                X_train,
+                y_train,
+                scorer
+            )
+            print_message(f"Best parameters: {best_params}")
+
+        # Train final model with best parameters
+        print_message(f"Training final model with best parameters")
+        best_model.fit(X_train, y_train)
+        final_scores = scorer(best_model, X_test, y_test)
+        print_message(f"Final scores: {final_scores}")
 
         return ModelResults(
             initial_scores=None,
             best_model_name=model_name,
-            best_params=None,
+            best_params=best_params,
             final_scores=final_scores,
-            best_model=cls
+            best_model=best_model
         )
     else:
         raise ValueError("Either model_name must be specified in args or model_results must be provided")