From 1931b5da28a021522973f0a2f123b9f2b4c446f5 Mon Sep 17 00:00:00 2001
From: Aidan Borkan <134334100+aidanborkan@users.noreply.github.com>
Date: Fri, 15 Mar 2024 15:16:19 -0600
Subject: [PATCH] Update train_model.py

Replace the exact '<protein>_proteomics' column lookup with a regex-based column filter.
---
 predict_protein/train_model.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/predict_protein/train_model.py b/predict_protein/train_model.py
index 4f88316..04433ba 100644
--- a/predict_protein/train_model.py
+++ b/predict_protein/train_model.py
@@ -133,8 +133,16 @@ def get_train_test(self,
         :return:  x_train, x_test, y_train, y_test
         """
 
-        y_df = self.df[[protein_to_do + '_proteomics']]
-        y_df = y_df.dropna(subset=[protein_to_do + '_proteomics'])
+        #y_df = self.df[[protein_to_do + '_proteomics']]
+                           #AB 03.15
+                           
+        #y_df = y_df.dropna(subset=[protein_to_do + '_proteomics'])
+        #AB: use a REGEX
+        regex_pattern = protein_to_do + '_proteomics$'
+        y_df = self.df.filter(regex=regex_pattern)
+        # y_df now holds only the column(s) whose names match the regex, so dropna() needs no
+        # subset. The pattern is unanchored at the start: if several columns match, any NaN drops the row.
+        y_df = y_df.dropna()
 
         # skip proteins with fewer than 20 samples
         # 2021-11-12 this should be filtered at the protein step (y_df) rather than the