From 1931b5da28a021522973f0a2f123b9f2b4c446f5 Mon Sep 17 00:00:00 2001 From: Aidan Borkan <134334100+aidanborkan@users.noreply.github.com> Date: Fri, 15 Mar 2024 15:16:19 -0600 Subject: [PATCH] Update train_model.py: select the protein's _proteomics column with a regex filter instead of an exact column-name lookup --- predict_protein/train_model.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/predict_protein/train_model.py b/predict_protein/train_model.py index 4f88316..04433ba 100644 --- a/predict_protein/train_model.py +++ b/predict_protein/train_model.py @@ -133,8 +133,16 @@ def get_train_test(self, :return: x_train, x_test, y_train, y_test """ - y_df = self.df[[protein_to_do + '_proteomics']] - y_df = y_df.dropna(subset=[protein_to_do + '_proteomics']) + #y_df = self.df[[protein_to_do + '_proteomics']] + #AB 03.15 + + #y_df = y_df.dropna(subset=[protein_to_do + '_proteomics']) + #AB: use a REGEX + regex_pattern = protein_to_do + '_proteomics$' + y_df = self.df.filter(regex=regex_pattern) + # After filtering, the column names in y_df will be exactly those that matched the regex, + # so you can safely call dropna() on the entire DataFrame without specifying a subset. + y_df = y_df.dropna() # skip proteins with fewer than 20 samples # 2021-11-12 this should be filtered at the protein step (y_df) rather than the