@type_operator(FIELD_DATAFRAME)
def contains_all(self, other_value: dict):
    """Check that the comparator values are all contained in the target.

    Two modes are supported, chosen from the data itself:

    * Row-wise: when ``self.is_column_of_iterables`` reports that both the
      target column and the comparator column hold iterables, every row is
      checked on its own and one boolean per row is produced.
    * Dataset-wide: otherwise a single boolean is produced -- whether the
      set of comparator values is a subset of the unique target values.

    Args:
        other_value: operator parameters. The keys read here are
            ``target``, ``comparator`` and ``value_is_literal``
            (``False`` when absent).

    Returns:
        The value built by ``self.value.convert_to_series`` from either the
        per-row list of booleans or the single dataset-wide boolean.
    """
    target = self.replace_prefix(other_value.get("target"))
    value_is_literal: bool = other_value.get("value_is_literal", False)
    comparator = other_value.get("comparator")

    # A list comparator is handled by flatten_list below and must not be
    # used to index the dataset. Any other comparator is normalised once,
    # up front, so the row-wise and dataset-wide paths look up the same
    # column name.
    comparator_is_list = isinstance(comparator, list)
    if not comparator_is_list:
        comparator = self.replace_prefix(comparator)

    # Fetch the target column once instead of once per row.
    target_column = self.value[target]

    if (
        not comparator_is_list
        and self.is_column_of_iterables(target_column)
        and self.is_column_of_iterables(self.value[comparator])
    ):
        # NOTE(review): assumes get_comparator_data returns something that
        # supports positional ``.iloc`` access here -- confirm for
        # value_is_literal=True.
        comparison_data = self.get_comparator_data(comparator, value_is_literal)
        results = []
        for row_index in range(len(target_column)):
            target_val = target_column.iloc[row_index]
            comp_val = comparison_data.iloc[row_index]
            results.append(all(is_in(item, target_val) for item in comp_val))
    else:
        if comparator_is_list:
            # get column as array of values
            values = flatten_list(self.value, comparator)
        else:
            values = self.value[comparator].unique()
        results = set(values).issubset(set(target_column.unique()))
    return self.value.convert_to_series(results)
class SplitBy(BaseOperation):
    """Operation that splits every value of a dataset column on a delimiter."""

    def _execute_operation(self):
        """Split the target column into per-row token lists.

        Reads the column name from ``self.params.target`` and the separator
        from ``self.params.delimiter``.

        Returns:
            The result of ``.str.split`` on the target column of
            ``self.evaluation_dataset``.

        Raises:
            ValueError: if the target or the delimiter is missing or empty.
        """
        if not all((self.params.target, self.params.delimiter)):
            raise ValueError(
                f"name and delimiter are required params for operation {self.params.operation_name}"
            )

        # regex=False: by default pandas interprets any pattern longer than
        # one character as a regular expression, so a delimiter such as "||"
        # or ".." would not be split on literally.
        # NOTE(review): assumes the column is a pandas Series and
        # pandas >= 1.4 (where ``regex`` was added) -- confirm.
        return self.evaluation_dataset[self.params.target].str.split(
            self.params.delimiter, regex=False
        )
term_pref_term=operation.get("term_pref_term"), namespace=operation.get("namespace"), value_is_reference=operation.get("value_is_reference", False), + delimiter=operation.get("delimiter"), ) # execute operation diff --git a/resources/schema/Operations.json b/resources/schema/Operations.json index 2cf1ce254..7e9013d75 100644 --- a/resources/schema/Operations.json +++ b/resources/schema/Operations.json @@ -234,6 +234,13 @@ "required": ["id", "operator"], "type": "object" }, + { + "properties": { + "operator": { "const": "split_by" } + }, + "required": ["id", "operator", "delimiter", "name"], + "type": "object" + }, { "properties": { "operator": { "const": "study_domains" } diff --git a/resources/schema/Operations.md b/resources/schema/Operations.md index a73a63614..6214da388 100644 --- a/resources/schema/Operations.md +++ b/resources/schema/Operations.md @@ -1268,3 +1268,15 @@ Operations: ``` Note that a local XSD file is required for validation. The file must be stored in the folder indicated by the value of the `LOCAL_XSD_FILE_DIR` default file path and the mapping between the namespace and the local XSD file's `sub-folder/name` must be included in the value of the `LOCAL_XSD_FILE_MAP` default file path. + +### split_by + +Splits a dataset column by a given delimiter + +```yaml +Operations: + - name: PPSPEC + delimiter: ; + id: $ppspec_value + operator: split_by +``` diff --git a/resources/schema/Operator.md b/resources/schema/Operator.md index 3810a2841..377ff3882 100644 --- a/resources/schema/Operator.md +++ b/resources/schema/Operator.md @@ -746,6 +746,22 @@ True if all values in `value` are contained within the variable `name`. 
- "Unplanned Treatment" ``` +The operator also supports lists: + +```yaml +- name: "$spec_codelist" + operator: "contains_all" + value: "$ppspec_value" +``` + +Where: + +| $spec_codelist | $ppspec_value | +| :-------------------------- | :----------------: | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1", "CODE2"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE2", "CODE3"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1"] | + ### not_contains_all Complement of `contains_all` @@ -762,6 +778,22 @@ Complement of `contains_all` - "Unplanned Treatment" ``` +The operator also supports lists: + +```yaml +- name: "$spec_codelist" + operator: "not_contains_all" + value: "$ppspec_value" +``` + +Where: + +| $spec_codelist | $ppspec_value | +| :-------------------------- | :----------------: | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1", "CODE2"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE2", "CODE3"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1"] | + ### shares_at_least_one_element_with Will raise an issue if at least one of the values in `name` is the same as one of the values in `value`. See [shares_no_elements_with](#shares_no_elements_with). 
@pytest.mark.regression
class TestColumnConsistsOfDelimitedCodelists(unittest.TestCase):
    """End-to-end regression tests for rule CDISC.SENDIG.SEND282.

    Each test runs the ``core validate`` CLI against a CoreIssue890 dataset,
    opens the generated Excel report and checks the reported issues. The
    report is always deleted afterwards, even when an assertion fails, so a
    stale report cannot be picked up by a later test.
    """

    @staticmethod
    def _run_validation(dataset_file_name: str) -> str:
        """Run the validator on a CoreIssue890 dataset and return the report path."""
        command = [
            f"{get_python_executable()}",
            "-m",
            "core",
            "validate",
            "-s",
            "send",
            "-v",
            "1-0",
            "-dp",
            os.path.join("tests", "resources", "CoreIssue890", dataset_file_name),
            "-lr",
            os.path.join("tests", "resources", "CoreIssue890", "Rule.yml"),
            "-ct",
            "sendct-2025-09-26",
        ]
        subprocess.run(command, check=True)

        # The last report name in sort order is taken as the one just created.
        excel_files = [
            file
            for file in os.listdir()
            if file.startswith("CORE-Report-") and file.endswith(".xlsx")
        ]
        return sorted(excel_files)[-1]

    @staticmethod
    def _remove_report(excel_file_path: str) -> None:
        """Delete the generated report if it is still on disk."""
        if os.path.exists(excel_file_path):
            os.remove(excel_file_path)

    @staticmethod
    def _data_rows(sheet) -> list:
        """Return the sheet rows after the header, dropping fully empty rows."""
        rows = [row for row in sheet.iter_rows(values_only=True)][1:]
        return [row for row in rows if any(row)]

    @staticmethod
    def _column_values(column) -> list:
        """Return the non-None cell values of a column, skipping the header cell."""
        return [cell.value for cell in column[1:] if cell.value is not None]

    def test_positive_dataset(self):
        excel_file_path = self._run_validation(
            "unit-test-coreid-SENDIG282-positive.json"
        )
        try:
            workbook = openpyxl.load_workbook(excel_file_path)

            # Go to the "Issue Details" sheet
            sheet = workbook[issue_datails_sheet]
            record_values = self._column_values(sheet[issue_sheet_record_column])
            variables_values = self._column_values(sheet[issue_sheet_variable_column])
            values_column_values = self._column_values(
                sheet[issue_sheet_values_column]
            )
            rules_values = self._data_rows(workbook[rules_report_sheet])

            # The rule must have been executed and must not report any issue
            assert rules_values[0][0] == "CDISC.SENDIG.SEND282"
            assert len(record_values) == 0
            assert len(variables_values) == 0
            assert len(values_column_values) == 0
        finally:
            self._remove_report(excel_file_path)

    def test_negative_dataset(self):
        excel_file_path = self._run_validation(
            "unit-test-coreid-SENDIG282-negative.json"
        )
        try:
            workbook = openpyxl.load_workbook(excel_file_path)

            # --- Dataset Details ---
            dataset_values = self._data_rows(workbook["Dataset Details"])
            assert len(dataset_values) > 0
            assert dataset_values[0][0] == "pp.xpt"
            assert dataset_values[0][1] == "Pharmacokinetics Parameters"
            assert dataset_values[0][-1] == 4

            # --- Issue Summary ---
            summary_values = self._data_rows(workbook["Issue Summary"])
            assert len(summary_values) > 0
            assert summary_values[0][0] == "pp.xpt"
            assert summary_values[0][1] == "CDISC.SENDIG.SEND282"
            assert summary_values[0][3] == 2

            # --- Issue Details ---
            details_values = self._data_rows(workbook["Issue Details"])
            assert all(row[0] == "CDISC.SENDIG.SEND282" for row in details_values)
            assert len(details_values) == 2

            # --- Rules Report ---
            rules_values = self._data_rows(workbook["Rules Report"])
            assert len(rules_values) > 0
            assert rules_values[0][0] == "CDISC.SENDIG.SEND282"
        finally:
            self._remove_report(excel_file_path)
If multiple specimen types are used for + a calculation (e.g., serum and urine for creatinine + clearance), then refer to Section 4.3.6.2 for guidance on how + to populate." + Document: IG v3.1.1 + Item: PPSPEC CDISC Notes + Section: 6.3.12 + Origin: SEND Conformance Rules + Rule Identifier: + Id: "SEND282" + Version: "1" + Version: "5.0" + Version: "3.1.1" + - Name: SENDIG-DART + References: + - Citations: + - Cited Guidance: + "Defines the type of specimen used for a measurement. Examples: + SERUM, PLASMA, URINE. If multiple specimen types are used for + a calculation (e.g., serum and urine for creatinine + clearance), then refer to Section 4.3.6.2 for guidance on how + to populate." + Document: IG v3.1 + Item: PPSPEC CDISC Notes + Section: 6.3.12 + Origin: SEND Conformance Rules + Rule Identifier: + Id: "SEND282" + Version: "1" + Version: "5.0" + Version: "1.1" + - Name: SENDIG-DART + References: + - Citations: + - Cited Guidance: + "Defines the type of specimen used for a measurement. Examples: + SERUM, PLASMA, URINE. If multiple specimen types are used for + a calculation (e.g., serum and urine for creatinine + clearance), then refer to Section 4.3.6.2 for guidance on how + to populate." + Document: IG v3.1.1 + Item: PPSPEC CDISC Notes + Section: 6.3.12 + Origin: SEND Conformance Rules + Rule Identifier: + Id: "SEND282" + Version: "1" + Version: "5.0" + Version: "1.2" + - Name: SENDIG-GENETOX + References: + - Citations: + - Cited Guidance: + "Defines the type of specimen used for a measurement. Examples: + SERUM, PLASMA, URINE. If multiple specimen types are used for + a calculation (e.g., serum and urine for creatinine + clearance), then refer to Section 4.3.6.2 for guidance on how + to populate." 
+ Document: IG v3.1.1 + Item: PPSPEC CDISC Notes + Section: 6.3.12 + Origin: SEND Conformance Rules + Rule Identifier: + Id: "SEND282" + Version: "1" + Version: "5.0" + Version: "1.0" +Check: + all: + - name: $spec_codelist + operator: not_contains_all + value: $ppspec_value +Core: + Id: CDISC.SENDIG.SEND282 + Status: Draft + Version: "1" +Operations: + - codelists: + - SPEC + level: term + id: $spec_codelist + operator: codelist_terms + returntype: value + - name: PPSPEC + delimiter: ; + id: $ppspec_value + operator: split_by +Description: If multiple specimen types are used for a calculation (e.g., serum + and urine for creatinine clearance), then this field should be populated with + values from the (SPEC) Controlled Terminology codelist delimited by a + semicolon. +Executability: Fully Executable +Outcome: + Message: The multiple specimens are not correctly separated by a semicolon + Output Variables: + - $ppspec_value + - $spec_codelist +Rule Type: Record Data +Scope: + Classes: + Include: + - ALL + Domains: + Include: + - ALL +Sensitivity: Record diff --git a/tests/resources/CoreIssue890/unit-test-coreid-SENDIG282-negative.json b/tests/resources/CoreIssue890/unit-test-coreid-SENDIG282-negative.json new file mode 100644 index 000000000..9d62965bd --- /dev/null +++ b/tests/resources/CoreIssue890/unit-test-coreid-SENDIG282-negative.json @@ -0,0 +1,82 @@ +{ + "datasets": [ + { + "filename": "pp.xpt", + "label": "Pharmacokinetics Parameters", + "domain": "PP", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 12 + }, + { + "name": "DOMAIN", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "USUBJID", + "label": "Unique Subject Identifier", + "type": "Char", + "length": 20 + }, + { + "name": "PPSEQ", + "label": "Sequence Number", + "type": "Num", + "length": 20 + }, + { + "name": "PPTESTCD", + "label": "Parameter Short Name", + "type": "Char", + "length": 20 + }, + { + "name": 
"PPTEST", + "label": "Parameter Name", + "type": "Char", + "length": 20 + }, + { + "name": "PPSPEC", + "label": "Specimen Material Type", + "type": "Char", + "length": 20 + } + ], + "records": { + "STUDYID": [ + "CDISCPILOT01", + "CDISCPILOT01", + "CDISCPILOT01", + "CDISCPILOT01" + ], + "DOMAIN": ["PP", "PP", "PP", "PP"], + "USUBJID": ["CDISC001", "CDISC001", "CDISC001", "CDISC001"], + "PPSEQ": [1, 2, 3, 4], + "PPTESTCD": ["TMAX", "TMAX", "TMAX", "TMAX"], + "PPTEST": [ + "Time of CMAX ", + "Time of CMAX ", + "Time of CMAX ", + "Time of CMAX " + ], + "PPSPEC": [ + "ABDOMINAL WALL", + "ADIPOSE TISSUE, BROWN", + "ABDOMINAL WALL|ADIPOSE TISSUE, BROWN", + "ABDOMINAL WALL;ADIPOSE TISSUE, BROWN&AIR SAC" + ] + } + } + ], + "standard": { + "product": "sendig", + "version": "3-1" + }, + "codelists": [] +} diff --git a/tests/resources/CoreIssue890/unit-test-coreid-SENDIG282-positive.json b/tests/resources/CoreIssue890/unit-test-coreid-SENDIG282-positive.json new file mode 100644 index 000000000..c93389420 --- /dev/null +++ b/tests/resources/CoreIssue890/unit-test-coreid-SENDIG282-positive.json @@ -0,0 +1,82 @@ +{ + "datasets": [ + { + "filename": "pp.xpt", + "label": "Pharmacokinetics Parameters", + "domain": "PP", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 12 + }, + { + "name": "DOMAIN", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "USUBJID", + "label": "Unique Subject Identifier", + "type": "Char", + "length": 20 + }, + { + "name": "PPSEQ", + "label": "Sequence Number", + "type": "Num", + "length": 20 + }, + { + "name": "PPTESTCD", + "label": "Parameter Short Name", + "type": "Char", + "length": 20 + }, + { + "name": "PPTEST", + "label": "Parameter Name", + "type": "Char", + "length": 20 + }, + { + "name": "PPSPEC", + "label": "Specimen Material Type", + "type": "Char", + "length": 20 + } + ], + "records": { + "STUDYID": [ + "CDISCPILOT01", + "CDISCPILOT01", + "CDISCPILOT01", 
+ "CDISCPILOT01" + ], + "DOMAIN": ["PP", "PP", "PP", "PP"], + "USUBJID": ["CDISC001", "CDISC001", "CDISC001", "CDISC001"], + "PPSEQ": [1, 2, 3, 4], + "PPTESTCD": ["TMAX", "TMAX", "TMAX", "TMAX"], + "PPTEST": [ + "Time of CMAX ", + "Time of CMAX ", + "Time of CMAX ", + "Time of CMAX " + ], + "PPSPEC": [ + "ABDOMINAL WALL", + "ADIPOSE TISSUE, BROWN", + "ABDOMINAL WALL;ADIPOSE TISSUE, BROWN", + "ABDOMINAL WALL;ADIPOSE TISSUE, BROWN;AIR SAC" + ] + } + } + ], + "standard": { + "product": "sendig", + "version": "3-1" + }, + "codelists": [] +} diff --git a/tests/unit/test_check_operators/test_containment_checks.py b/tests/unit/test_check_operators/test_containment_checks.py index e18b57be7..4f32770d7 100644 --- a/tests/unit/test_check_operators/test_containment_checks.py +++ b/tests/unit/test_check_operators/test_containment_checks.py @@ -124,6 +124,54 @@ def test_does_not_contain_case_insensitive( DaskDataset, False, ), + ( + { + "target": [ + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ], + "VAR2": [ + ["TISSUE"], + ["TISSUE", "BROWN"], + ["TISSUE", "BROWN", "AIR SAC"], + ["ADIPOSE TISSUE", "AIR SAC"], + ], + }, + "VAR2", + PandasDataset, + [ + True, + True, + True, + True, + ], + ), + ( + { + "target": [ + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ["TISSUE", "ADIPOSE TISSUE", "BROWN", "AIR SAC"], + ], + "VAR2": [ + ["TISSUE"], + ["TISSUE", "BROWNNNN"], + ["TISSUE", "BROWN", "AIR SAC"], + ["ADIPOSE TISSUE", "AIR SAC", "UNKNOWN"], + ], + }, + "VAR2", + PandasDataset, + [ + True, + False, + True, + False, + ], + ), ], ) def test_contains_all(data, comparator, dataset_type, expected_result): @@ -150,6 +198,54 @@ def test_contains_all(data, comparator, dataset_type, expected_result): PandasDataset, 
def test_split_by(operation_params: OperationParams):
    """SplitBy turns each ';'-delimited cell of the target column into a list."""
    expected = pd.Series(
        [
            ["ABDOMINAL WALL"],
            ["ABDOMINAL WALL", "ADIPOSE TISSUE, BROWN"],
            ["ABDOMINAL WALL", "ADIPOSE TISSUE, BROWN", "AIR SAC"],
        ],
    )
    raw_values = [
        "ABDOMINAL WALL",
        "ABDOMINAL WALL;ADIPOSE TISSUE, BROWN",
        "ABDOMINAL WALL;ADIPOSE TISSUE, BROWN;AIR SAC",
    ]
    source_dataset = PandasDataset.from_dict({"target": raw_values})

    # Services required by the operation constructor.
    config = ConfigService()
    cache = CacheServiceFactory(config).get_cache_service()
    data_service = DataServiceFactory(config, cache).get_data_service()

    operation_params.delimiter = ";"
    outcome = SplitBy(
        operation_params,
        source_dataset,
        cache,
        data_service,
    ).execute()

    split_column = outcome[operation_params.operation_id]
    assert split_column.equals(expected)