From e5b48521442e799452333c8b7fa0f4519cf7f7f0 Mon Sep 17 00:00:00 2001
From: "Adelson D. de Araujo jr"
Date: Sat, 30 Aug 2025 21:52:11 -0300
Subject: [PATCH 1/2] Feat: Add initial unit tests for Handler.__init__ in model_selection.py.

---
 tests/test_data/dummy_config.json | 15 +++++++++++
 tests/test_data/dummy_data.csv    |  5 ++++
 tests/test_model_selection.py     | 44 +++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 tests/test_data/dummy_config.json
 create mode 100644 tests/test_data/dummy_data.csv
 create mode 100644 tests/test_model_selection.py

diff --git a/tests/test_data/dummy_config.json b/tests/test_data/dummy_config.json
new file mode 100644
index 0000000..f4a79b2
--- /dev/null
+++ b/tests/test_data/dummy_config.json
@@ -0,0 +1,15 @@
+{
+    "dataset_name": "dummy_dataset",
+    "code_name": "code",
+    "codes": ["A", "B", "C"],
+    "default_code": "A",
+    "language_featurizer": "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3",
+    "sent_hl_units": 10,
+    "sent_dropout": 0.5,
+    "consent_hl_units": 5,
+    "lags": 2,
+    "max_epochs": 1,
+    "callback_patience": 1,
+    "learning_rate": 0.001,
+    "batch_size": 32
+}
\ No newline at end of file
diff --git a/tests/test_data/dummy_data.csv b/tests/test_data/dummy_data.csv
new file mode 100644
index 0000000..dba40ec
--- /dev/null
+++ b/tests/test_data/dummy_data.csv
@@ -0,0 +1,5 @@
+dialog_id,username,text,code,timestamp
+dialog_1,user_a,hello,A,2023-01-01
+dialog_1,user_b,hi there,B,2023-01-01
+dialog_2,user_a,how are you,C,2023-01-02
+dialog_2,user_b,fine thanks,A,2023-01-02
\ No newline at end of file
diff --git a/tests/test_model_selection.py b/tests/test_model_selection.py
new file mode 100644
index 0000000..77668ee
--- /dev/null
+++ b/tests/test_model_selection.py
@@ -0,0 +1,44 @@
+import unittest
+import os
+import pandas as pd
+from consent.model_selection import Handler
+from consent import Config
+
+class TestHandler(unittest.TestCase):
+
+    def setUp(self):
+        self.dummy_config_path = "tests/test_data/dummy_config.json"
+        self.dummy_data_path = "tests/test_data/dummy_data.csv"
+
+    def test_init_with_config_file_path(self):
+        handler = Handler(
+            data_file_path=self.dummy_data_path,
+            config_file_path=self.dummy_config_path
+        )
+        self.assertIsInstance(handler.config, Config)
+        self.assertIsInstance(handler.data_df, pd.DataFrame)
+        self.assertEqual(handler.config.dataset_name, "dummy_dataset")
+        self.assertEqual(handler.data_df.shape, (4, 5))
+        self.assertIn("dialog_id", handler.data_df.columns)
+        self.assertIn("code", handler.data_df.columns)
+
+    def test_init_missing_config_and_load_model(self):
+        with self.assertRaises(AssertionError) as cm:
+            Handler(data_file_path=self.dummy_data_path)
+        self.assertIn("Either provide a `config_file_path` or `load_model`.", str(cm.exception))
+
+    def test_init_invalid_config_file_path(self):
+        invalid_path = "tests/test_data/non_existent_config.json"
+        with self.assertRaises(FileNotFoundError):
+            Handler(
+                data_file_path=self.dummy_data_path,
+                config_file_path=invalid_path
+            )
+
+    def test_init_invalid_data_file_path(self):
+        invalid_path = "tests/test_data/non_existent_data.csv"
+        with self.assertRaises(FileNotFoundError):
+            Handler(
+                data_file_path=invalid_path,
+                config_file_path=self.dummy_config_path
+            )
\ No newline at end of file
From 2d35f816e82586ff5f2e9d54f97bb225ffabd3e9 Mon Sep 17 00:00:00 2001
From: "Adelson D. de Araujo jr"
Date: Sat, 30 Aug 2025 22:26:54 -0300
Subject: [PATCH 2/2] Feat: Add comprehensive unit tests for Handler.hyperparameter_tuning.

---
Reviewer notes (this section is ignored by `git am`):
 - The debug print() that this commit added to train_test_split in
   consent/utils.py has been dropped; the diffstat below reflects that.
   The utils.py post-image blob id on its "index" line was NOT regenerated.
 - This series was recovered from a whitespace-collapsed copy. Indentation
   and blank-line placement in context lines were inferred from the hunk
   line counts; if a hunk is rejected, retry with
   `git apply --ignore-whitespace` and confirm against the working tree.
 - Possible follow-ups, not changed here: the fixture paths in
   tests/test_model_selection.py are relative to the current working
   directory (and `import os` is unused); load_data still uses a bare
   `except:`.

 consent/model_selection.py              |   3 +-
 consent/utils.py                        |   4 +-
 tests/test_data/dummy_hyper_config.json |  15 +++
 tests/test_model_selection.py           | 163 +++++++++++++++++++++++-
 4 files changed, 180 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_data/dummy_hyper_config.json

diff --git a/consent/model_selection.py b/consent/model_selection.py
index 5c61a24..87dc654 100644
--- a/consent/model_selection.py
+++ b/consent/model_selection.py
@@ -189,8 +189,7 @@ def hyperparameter_tuning(self,
             # generate predictions
             predictions = test_data.groupby('dialog_id')\
                 .apply(consent.predict_sequence)
-            results = pd.concat(predictions.apply(pd.DataFrame).values)\
-                .reset_index()
+            results = predictions.reset_index()
             consent.cache = {}  # clear cache for the next model
             # Evaluation metrics
             m = self.compute_metrics(results['code'],
diff --git a/consent/utils.py b/consent/utils.py
index 9747acd..83f4b08 100644
--- a/consent/utils.py
+++ b/consent/utils.py
@@ -7,9 +7,9 @@
 
 def load_data(file_path, code_name):
     try:
-        data_df = pd.read_csv(file_path, index_col=0)
+        data_df = pd.read_csv(file_path)
     except:
-        data_df = pd.read_csv(file_path, index_col=0, sep='|')
+        data_df = pd.read_csv(file_path, sep='|')
 
     # Requires columns = username, text/message, group/dialog_id,
     data_df = data_df.rename(columns={
diff --git a/tests/test_data/dummy_hyper_config.json b/tests/test_data/dummy_hyper_config.json
new file mode 100644
index 0000000..f87dc72
--- /dev/null
+++ b/tests/test_data/dummy_hyper_config.json
@@ -0,0 +1,15 @@
+{
+    "dataset_name": "dummy_dataset",
+    "code_name": "code",
+    "codes": ["A", "B", "C"],
+    "default_code": "A",
+    "language_featurizer": "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3",
+    "sent_hl_units": [10, 20],
+    "sent_dropout": [0.5, 0.6],
+    "consent_hl_units": 5,
+    "lags": 2,
+    "max_epochs": 1,
+    "callback_patience": 1,
+    "learning_rate": 0.001,
+    "batch_size": 32
+}
\ No newline at end of file
diff --git a/tests/test_model_selection.py b/tests/test_model_selection.py
index 77668ee..e1e685f 100644
--- a/tests/test_model_selection.py
+++ b/tests/test_model_selection.py
@@ -3,12 +3,14 @@
 import pandas as pd
 from consent.model_selection import Handler
 from consent import Config
+from unittest import mock
 
 class TestHandler(unittest.TestCase):
 
     def setUp(self):
         self.dummy_config_path = "tests/test_data/dummy_config.json"
         self.dummy_data_path = "tests/test_data/dummy_data.csv"
+        self.dummy_hyper_config_path = "tests/test_data/dummy_hyper_config.json"
 
     def test_init_with_config_file_path(self):
         handler = Handler(
@@ -41,4 +43,163 @@ def test_init_invalid_data_file_path(self):
             Handler(
                 data_file_path=invalid_path,
                 config_file_path=self.dummy_config_path
-            )
\ No newline at end of file
+            )
+
+    @mock.patch('consent.model_selection.ParameterSampler')
+    @mock.patch('consent.model_selection.ConSent')
+    @mock.patch('consent.wandb')
+    def test_hyperparameter_tuning_parses_experiment_grid(self, mock_wandb, mock_ConSent, mock_ParameterSampler):
+        # Setup Handler with hyperparameter config
+        handler = Handler(
+            data_file_path=self.dummy_data_path,
+            config_file_path=self.dummy_hyper_config_path
+        )
+
+        # Configure the mock ConSent instance
+        # Create a dummy DataFrame that predict_sequence would return
+        dummy_predictions_df = pd.DataFrame({
+            'username': ['u1'],
+            'text': ['t1'],
+            'code': ['A'],
+            'sent_code': ['A'],
+            'consent_code': ['A']
+        })
+        # Set the return value for the predict_sequence method of the mock ConSent instance
+        mock_ConSent.return_value.predict_sequence.return_value = dummy_predictions_df
+
+        # Mock the return value of ParameterSampler to control iterations
+        mock_ParameterSampler.return_value = iter([
+            {'sent_hl_units': 10, 'sent_dropout': 0.5}  # Just one iteration for this test
+        ])
+
+        # Call hyperparameter_tuning
+        handler.hyperparameter_tuning(n_iter=1, val_size=0.5)
+
+        # Assert that ParameterSampler was called with the correct param_distributions
+        expected_param_distributions = {
+            'sent_hl_units': [10, 20],
+            'sent_dropout': [0.5, 0.6]
+        }
+        mock_ParameterSampler.assert_called_once_with(
+            param_distributions=expected_param_distributions,
+            n_iter=1,
+            random_state=handler.random_state
+        )
+
+    @mock.patch('consent.model_selection.ParameterSampler')
+    @mock.patch('consent.model_selection.ConSent')
+    @mock.patch('consent.wandb')
+    def test_hyperparameter_tuning_n_iter_and_combinations(self, mock_wandb, mock_ConSent, mock_ParameterSampler):
+        # Setup Handler with hyperparameter config
+        handler = Handler(
+            data_file_path=self.dummy_data_path,
+            config_file_path=self.dummy_hyper_config_path
+        )
+
+        # Configure the mock ConSent instance
+        dummy_predictions_df = pd.DataFrame({
+            'username': ['u1'], 'text': ['t1'], 'code': ['A'],
+            'sent_code': ['A'], 'consent_code': ['A']
+        })
+        mock_ConSent.return_value.predict_sequence.return_value = dummy_predictions_df
+
+        # Define the expected parameter combinations
+        expected_combinations = [
+            {'sent_hl_units': 10, 'sent_dropout': 0.5},
+            {'sent_hl_units': 10, 'sent_dropout': 0.6},
+            {'sent_hl_units': 20, 'sent_dropout': 0.5},
+            {'sent_hl_units': 20, 'sent_dropout': 0.6},
+        ]
+        # Mock the return value of ParameterSampler to control iterations
+        mock_ParameterSampler.return_value = iter(expected_combinations)
+
+        # Call hyperparameter_tuning with n_iter=-1 to test all combinations
+        handler.hyperparameter_tuning(n_iter=-1, val_size=0.5)
+
+        # Assert that ParameterSampler was called with the correct param_distributions
+        expected_param_distributions = {
+            'sent_hl_units': [10, 20],
+            'sent_dropout': [0.5, 0.6]
+        }
+        mock_ParameterSampler.assert_called_once_with(
+            param_distributions=expected_param_distributions,
+            n_iter=len(expected_combinations),  # Should be 4 combinations
+            random_state=handler.random_state
+        )
+        # Assert that ConSent.train was called for each combination
+        self.assertEqual(mock_ConSent.return_value.train.call_count, len(expected_combinations))
+        # Assert that wandb.log was called for each combination
+        self.assertEqual(mock_ConSent.return_value.wandb_run.log.call_count, len(expected_combinations))
+
+    @mock.patch('consent.model_selection.ParameterSampler')
+    @mock.patch('consent.model_selection.ConSent')
+    @mock.patch('consent.wandb')
+    @mock.patch('consent.model_selection.Handler.compute_metrics')  # Mock compute_metrics method
+    @mock.patch('consent.model_selection.utils.train_test_split')  # Mock train_test_split globally
+    def test_hyperparameter_tuning_calls_train_and_evaluation(self, mock_train_test_split, mock_compute_metrics, mock_wandb, mock_ConSent, mock_ParameterSampler):
+        # Define the expected train_data and test_data that would be returned by train_test_split
+        expected_train_data = pd.DataFrame({
+            'dialog_id': ['dialog_1', 'dialog_1'],
+            'username': ['user_a', 'user_b'],
+            'text': ['hello', 'hi there'],
+            'code': ['A', 'B'],
+            'timestamp': ['2023-01-01', '2023-01-01']
+        })
+        expected_test_data = pd.DataFrame({
+            'dialog_id': ['dialog_2', 'dialog_2'],
+            'username': ['user_a', 'user_b'],
+            'text': ['how are you', 'fine thanks'],
+            'code': ['C', 'A'],
+            'timestamp': ['2023-01-02', '2023-01-02']
+        })
+        mock_train_test_split.return_value = (expected_train_data, expected_test_data)
+
+        # Setup Handler with hyperparameter config
+        handler = Handler(
+            data_file_path=self.dummy_data_path,
+            config_file_path=self.dummy_hyper_config_path
+        )
+
+        # Configure the mock ConSent instance
+        dummy_predictions_df = pd.DataFrame({
+            'username': ['u1', 'u2'],
+            'text': ['t1', 't2'],
+            'code': ['C', 'A'],  # These should match the 'code' column of expected_test_data
+            'sent_code': ['C', 'A'],
+            'consent_code': ['C', 'A']
+        })
+        mock_ConSent.return_value.predict_sequence.return_value = dummy_predictions_df
+
+        # Mock the return value of ParameterSampler to control iterations
+        expected_combinations = [
+            {'sent_hl_units': 10, 'sent_dropout': 0.5},
+            {'sent_hl_units': 20, 'sent_dropout': 0.6},
+        ]
+        mock_ParameterSampler.return_value = iter(expected_combinations)
+
+        # Mock the return value of compute_metrics
+        mock_compute_metrics.return_value = {'accuracy': 0.8}
+
+        # Call hyperparameter_tuning
+        handler.hyperparameter_tuning(n_iter=len(expected_combinations), val_size=0.5)
+
+        # Assert that ConSent.train was called for each combination
+        self.assertEqual(mock_ConSent.return_value.train.call_count, len(expected_combinations))
+        # Assert that compute_metrics was called for each combination
+        self.assertEqual(mock_compute_metrics.call_count, len(expected_combinations))
+
+        # Assert train was called with the mocked train_data
+        actual_train_data_call = mock_ConSent.return_value.train.call_args[0][0]
+        pd.testing.assert_frame_equal(actual_train_data_call, expected_train_data)
+        self.assertEqual(mock_ConSent.return_value.train.call_args[1]['tf_verbosity'], 2)  # tf_verbosity
+
+        # Assert compute_metrics was called with the correct arguments
+        actual_y_true_call = mock_compute_metrics.call_args[0][0]
+        actual_y_pred_call = mock_compute_metrics.call_args[0][1]
+        actual_config_call = mock_compute_metrics.call_args[0][2]
+
+        pd.testing.assert_series_equal(actual_y_true_call, expected_test_data['code'])
+        pd.testing.assert_series_equal(actual_y_pred_call, dummy_predictions_df['consent_code'])
+        self.assertIsInstance(actual_config_call, Config)
+        # You might want to assert specific attributes of the config if needed
+        self.assertEqual(actual_config_call.dataset_name, "dummy_dataset")
\ No newline at end of file