From ac5ed38e5296129b82390e9c50c27b1c4c5e7c03 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 4 Nov 2025 16:50:32 +0100 Subject: [PATCH 1/7] ignore not existing domains in the operations --- cdisc_rules_engine/utilities/rule_processor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index de335af09..eb1777b3c 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -44,7 +44,6 @@ from cdisc_rules_engine.interfaces.data_service_interface import ( DataServiceInterface, ) -from cdisc_rules_engine.exceptions.custom_exceptions import DomainNotFoundError class RuleProcessor: @@ -480,10 +479,11 @@ def _execute_operation( ), ) if domain_details is None: - raise DomainNotFoundError( - f"Operation {operation_params.operation_name} requires Domain " - f"{operation_params.domain} but Domain not found in dataset" + logger.warning( + f"Domain {operation_params.domain} doesn't exist in the dataset" ) + operation_params.dataframe[operation_params.operation_id] = None + return operation_params.dataframe filename = get_dataset_name_from_details(domain_details) file_path: str = os.path.join( get_directory_path(operation_params.dataset_path), From bdc4f3c1f7acd8ee6a7952eb98fccb420ecd0f5d Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 4 Nov 2025 18:11:07 +0100 Subject: [PATCH 2/7] change log level --- cdisc_rules_engine/utilities/rule_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index eb1777b3c..c3fdba712 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -479,7 +479,7 @@ def _execute_operation( ), ) if domain_details is None: - logger.warning( + logger.info( f"Domain {operation_params.domain} doesn't exist in the dataset" ) operation_params.dataframe[operation_params.operation_id] = None From 7cc8439fff9658f7a902fd0201a0bb58a7c9d977 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Thu, 6 Nov 2025 00:31:59 +0100 Subject: [PATCH 3/7] add regression tests for core issue 720 --- .../test_Issues/test_CoreIssue720.py | 149 ++++++++++++++++++ .../CoreIssue720/Invalid_datasets.json | 149 ++++++++++++++++++ tests/resources/CoreIssue720/Rule.yml | 66 ++++++++ .../CoreIssue720/Valid_datasets.json | 149 ++++++++++++++++++ 4 files changed, 513 insertions(+) create mode 100644 tests/QARegressionTests/test_Issues/test_CoreIssue720.py create mode 100644 tests/resources/CoreIssue720/Invalid_datasets.json create mode 100644 tests/resources/CoreIssue720/Rule.yml create mode 100644 tests/resources/CoreIssue720/Valid_datasets.json diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue720.py b/tests/QARegressionTests/test_Issues/test_CoreIssue720.py new file mode 100644 index 000000000..239d2959c --- /dev/null +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue720.py @@ -0,0 +1,149 @@ +import os +import subprocess +import unittest + +import openpyxl +import pytest +from conftest import get_python_executable +from QARegressionTests.globals import ( + issue_datails_sheet, + rules_report_sheet, +) + + +@pytest.mark.regression +class TestCoreIssue720(unittest.TestCase): + def test_negative_dataset(self): + """Negative scenario: SPECIES missing -> expect one populated issue.""" + command = [ + f"{get_python_executable()}", + "-m", + "core", + "validate", + "-s", + "usdm", + "-v", + "4-0", + "-dp", + os.path.join("tests", "resources", "CoreIssue720", "Invalid_datasets.json"), + "-lr", + os.path.join("tests", "resources", "CoreIssue720", "Rule.yml"), + ] + subprocess.run(command, check=True) + + excel_files = [ + f + for f in os.listdir() + if f.startswith("CORE-Report-") and f.endswith(".xlsx") + ] + assert excel_files, "No CORE report generated" + excel_file_path = sorted(excel_files)[-1] + wb = openpyxl.load_workbook(excel_file_path) + + # Conformance Details + assert "Conformance Details" in wb.sheetnames + conf_rows = [ + row for row in wb["Conformance Details"].iter_rows(values_only=True) + ] + assert conf_rows[6][0] == "Standard" + assert conf_rows[6][1] == "USDM" + assert conf_rows[7][0] == "Version" + assert conf_rows[7][1] == "V4.0" + + # Entity Details + entity_rows = [row for row in wb["Entity Details"].iter_rows(values_only=True)][ + 1: + ] + entity_rows = [r for r in entity_rows if any(r)] + assert len(entity_rows) >= 2 + assert entity_rows[0][0] == "DM" and entity_rows[0][1] == 4 + assert entity_rows[1][0] == "TS" and entity_rows[1][1] == 4 + + # Issue Summary + summary_rows = [row for row in wb["Issue Summary"].iter_rows(values_only=True)][ + 1: + ] + summary_rows = [r for r in summary_rows if any(r)] + assert summary_rows[0][0] == "dm.xpt" + assert summary_rows[0][1] == "CDISC.SENDIG.105" + assert summary_rows[0][3] == 1 + + # Issue Details + details_rows = [ + row for row in wb[issue_datails_sheet].iter_rows(values_only=True) + ][1:] + details_rows = [r for r in details_rows if any(r)] + assert details_rows, "Issue Details should have at least one populated row" + first_issue = details_rows[0] + assert first_issue[0] == "CDISC.SENDIG.105" + assert first_issue[1] == "SEND105" + assert "SPECIES in not present" in first_issue[2] + assert first_issue[3] == "fully executable" + assert first_issue[4] == "dm.xpt" + assert first_issue[7] == "$distinct_tsparmcd, $distinct_txparmcd, SPECIES" + assert "Not in dataset" in first_issue[8] + + # Rules Report + rules_rows = [ + row for row in wb[rules_report_sheet].iter_rows(values_only=True) + ][1:] + rules_rows = [r for r in rules_rows if any(r)] + assert rules_rows, "Rules Report must have at least one populated row" + assert rules_rows[0][0] == "CDISC.SENDIG.105" + assert rules_rows[0][-1] == "SUCCESS" + + if os.path.exists(excel_file_path): + os.remove(excel_file_path) + + def test_positive_dataset(self): + """Positive scenario: SPECIES present -> expect no issue rows.""" + command = [ + f"{get_python_executable()}", + "-m", + "core", + "validate", + "-s", + "usdm", + "-v", + "4-0", + "-dp", + os.path.join("tests", "resources", "CoreIssue720", "Valid_datasets.json"), + "-lr", + os.path.join("tests", "resources", "CoreIssue720", "Rule.yml"), + ] + subprocess.run(command, check=True) + + excel_files = [ + f + for f in os.listdir() + if f.startswith("CORE-Report-") and f.endswith(".xlsx") + ] + assert excel_files, "No CORE report generated" + excel_file_path = sorted(excel_files)[-1] + wb = openpyxl.load_workbook(excel_file_path) + + # Issue Summary empty + summary_rows = [row for row in wb["Issue Summary"].iter_rows(values_only=True)][ + 1: + ] + summary_rows = [r for r in summary_rows if any(r)] + assert summary_rows == [] + + # Issue Details empty + details_rows = [ + row for row in wb[issue_datails_sheet].iter_rows(values_only=True) + ][1:] + details_rows = [r for r in details_rows if any(r)] + assert details_rows == [] + + # Rules Report has success row + rules_rows = [ + row for row in wb[rules_report_sheet].iter_rows(values_only=True) + ][1:] + rules_rows = [r for r in rules_rows if any(r)] + assert len(rules_rows) == 1 + assert rules_rows[0][0] == "CDISC.SENDIG.105" + assert rules_rows[0][-1] == "SUCCESS" + + if os.path.exists(excel_file_path): + os.remove(excel_file_path) diff --git a/tests/resources/CoreIssue720/Invalid_datasets.json b/tests/resources/CoreIssue720/Invalid_datasets.json new file mode 100644 index 000000000..e703c0a62 --- /dev/null +++ b/tests/resources/CoreIssue720/Invalid_datasets.json @@ -0,0 +1,149 @@ +{ + "datasets": [ + { + "filename": "dm.xpt", + "label": "Demographics", + "domain": "DM", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 12 + }, + { + "name": "DOMAIN", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "USUBJID", + "label": "Unique Subject Identifier", + "type": "Char", + "length": 20 + }, + { + "name": "SUBJID", + "label": "Subject Identifier for the Study", + "type": "Char", + "length": 12 + }, + { + "name": "RFSTDTC", + "label": "Subject Reference Start Date/Time", + "type": "Char", + "length": 10 + }, + { + "name": "RFENDTC", + "label": "Subject Reference End Date/Time", + "type": "Char", + "length": 10 + }, + { + "name": "RFXSTDTC", + "label": "Date/Time of First Study Treatment", + "type": "Char", + "length": 10 + }, + { + "name": "RFXENDTC", + "label": "Date/Time of Last Study Treatment", + "type": "Char", + "length": 10 + }, + { + "name": "ARMCD", + "label": "Planned Arm Code", + "type": "Char", + "length": 8 + }, + { + "name": "ARM", + "label": "Description of Planned Arm", + "type": "Char", + "length": 28 + }, + { + "name": "SETCD", + "label": "Set Code", + "type": "Char", + "length": 8 + } + ], + "records": { + "STUDYID": ["CDISCCORE01", "CDISCCORE01", "CDISCCORE01", "CDISCCORE01"], + "DOMAIN": ["DM", "DM", "DM", "DM"], + "USUBJID": [ + "015246-099-0000-00000", + "015246-099-0000-00001", + "015246-099-0000-00002", + "015246-099-0000-00003" + ], + "SUBJID": [ + "099000000000", + "099000000001", + "099000000002", + "099000000003" + ], + "RFSTDTC": ["", "", "", "2018-05-08T11:20"], + "RFENDTC": ["", "", "", "2018-08-29T10:35"], + "RFXSTDTC": ["", "", "", "2018-05-08T11:20"], + "RFXENDTC": ["", "", "", "2018-08-29T10:35"], + "ARMCD": ["", "", "", "IKD"], + "ARM": ["IKd", "", "", "IKd"], + "SETCD": ["SET1", "SET22", "SET333", "SET12345"] + } + }, + { + "filename": "ts.xpt", + "label": "Trial Summary", + "domain": "TS", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 12 + }, + { + "name": "DOMAIN", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "TSSEQ", + "label": "Sequence Number ", + "type": "Char", + "length": 8 + }, + { + "name": "TSGRPID", + "label": "Group Identifier", + "type": "Char", + "length": 8 + }, + { + "name": "TSPARMCD", + "label": "Trial Summary Parameter Short Name ", + "type": "Char", + "length": 8 + } + ], + "records": { + "STUDYID": ["CDISCCORE01", "CDISCCORE01", "CDISCCORE01", "CDISCCORE01"], + "DOMAIN": ["TS", "TS", "TS", "TS"], + "TSSEQ": ["1", "2", "3", "4"], + "TSGRPID": ["", "", "", ""], + "TSPARMCD": ["SSTYP", "SPECIESS", "ROUTE", "STITLE"] + } + } + ], + "standard": { + "product": "sendig", + "version": "3-1" + }, + "codelists": [] +} diff --git a/tests/resources/CoreIssue720/Rule.yml b/tests/resources/CoreIssue720/Rule.yml new file mode 100644 index 000000000..832e63d3b --- /dev/null +++ b/tests/resources/CoreIssue720/Rule.yml @@ -0,0 +1,66 @@ +Authorities: + - Organization: CDISC + Standards: + - Name: SENDIG + References: + - Citations: + - Cited Guidance: + If this variable is excluded in the DM domain, the information + must be present at a higher level (either Trial Sets or Trial + Summary). + Document: IG v3.1 + Item: Specification + Section: 5.1.1 + Origin: SEND Conformance Rules + Rule Identifier: + Id: "SEND105" + Version: "2" + Version: "5.0" + Version: "3.1" +Check: + all: + - name: $distinct_tsparmcd + operator: does_not_contain + value: SPECIES + value_is_literal: true + - name: $distinct_txparmcd + operator: does_not_contain + value: SPECIES + value_is_literal: true + - name: SPECIES + operator: not_exists +Core: + Id: CDISC.SENDIG.105 + Status: Draft + Version: "1" +Description: Raise an error when the variable SPECIES is not present in DM, and + there is no record TSPARMCD=SPECIES in TS and there is no record + TXPARMCD=SPECIES in TX +Executability: Fully Executable +Operations: + # gets the distinct values of TSPARMCD + - domain: TS + id: $distinct_tsparmcd + name: TSPARMCD + operator: distinct + - domain: TX + id: $distinct_txparmcd + name: TXPARMCD + operator: distinct +Outcome: + Message: + SPECIES in not present in DM and there is no record TSPARMCD=SPECIES in + TS and there is no record TXPARMCD=SPECIES in TX + Output Variables: + - $distinct_tsparmcd + - $distinct_txparmcd + - SPECIES +Rule Type: Record Data +Scope: + Classes: + Include: + - ALL + Domains: + Include: + - DM +Sensitivity: Dataset diff --git a/tests/resources/CoreIssue720/Valid_datasets.json b/tests/resources/CoreIssue720/Valid_datasets.json new file mode 100644 index 000000000..a6617d702 --- /dev/null +++ b/tests/resources/CoreIssue720/Valid_datasets.json @@ -0,0 +1,149 @@ +{ + "datasets": [ + { + "filename": "dm.xpt", + "label": "Demographics", + "domain": "DM", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 12 + }, + { + "name": "DOMAIN", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "USUBJID", + "label": "Unique Subject Identifier", + "type": "Char", + "length": 20 + }, + { + "name": "SUBJID", + "label": "Subject Identifier for the Study", + "type": "Char", + "length": 12 + }, + { + "name": "RFSTDTC", + "label": "Subject Reference Start Date/Time", + "type": "Char", + "length": 10 + }, + { + "name": "RFENDTC", + "label": "Subject Reference End Date/Time", + "type": "Char", + "length": 10 + }, + { + "name": "RFXSTDTC", + "label": "Date/Time of First Study Treatment", + "type": "Char", + "length": 10 + }, + { + "name": "RFXENDTC", + "label": "Date/Time of Last Study Treatment", + "type": "Char", + "length": 10 + }, + { + "name": "ARMCD", + "label": "Planned Arm Code", + "type": "Char", + "length": 8 + }, + { + "name": "ARM", + "label": "Description of Planned Arm", + "type": "Char", + "length": 28 + }, + { + "name": "SETCD", + "label": "Set Code", + "type": "Char", + "length": 8 + } + ], + "records": { + "STUDYID": ["CDISCCORE01", "CDISCCORE01", "CDISCCORE01", "CDISCCORE01"], + "DOMAIN": ["DM", "DM", "DM", "DM"], + "USUBJID": [ + "015246-099-0000-00000", + "015246-099-0000-00001", + "015246-099-0000-00002", + "015246-099-0000-00003" + ], + "SUBJID": [ + "099000000000", + "099000000001", + "099000000002", + "099000000003" + ], + "RFSTDTC": ["", "", "", "2018-05-08T11:20"], + "RFENDTC": ["", "", "", "2018-08-29T10:35"], + "RFXSTDTC": ["", "", "", "2018-05-08T11:20"], + "RFXENDTC": ["", "", "", "2018-08-29T10:35"], + "ARMCD": ["", "", "", "IKD"], + "ARM": ["IKd", "", "", "IKd"], + "SETCD": ["SET1", "SET22", "SET333", "SET12345"] + } + }, + { + "filename": "ts.xpt", + "label": "Trial Summary", + "domain": "TS", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 12 + }, + { + "name": "DOMAIN", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "TSSEQ", + "label": "Sequence Number ", + "type": "Char", + "length": 8 + }, + { + "name": "TSGRPID", + "label": "Group Identifier", + "type": "Char", + "length": 8 + }, + { + "name": "TSPARMCD", + "label": "Trial Summary Parameter Short Name ", + "type": "Char", + "length": 8 + } + ], + "records": { + "STUDYID": ["CDISCCORE01", "CDISCCORE01", "CDISCCORE01", "CDISCCORE01"], + "DOMAIN": ["TS", "TS", "TS", "TS"], + "TSSEQ": ["1", "2", "3", "4"], + "TSGRPID": ["", "", "", ""], + "TSPARMCD": ["SSTYP", "SPECIES", "ROUTE", "STITLE"] + } + } + ], + "standard": { + "product": "sendig", + "version": "3-1" + }, + "codelists": [] +} From 703f3b08d5adc3fbef956b70112cc770b5be2433 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Fri, 7 Nov 2025 10:42:44 +0100 Subject: [PATCH 4/7] add unit test for handling operations on nonexistent domains --- .../test_utilities/test_rule_processor.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py index 7c7bfaea1..0ac4cb582 100644 --- a/tests/unit/test_utilities/test_rule_processor.py +++ b/tests/unit/test_utilities/test_rule_processor.py @@ -1155,3 +1155,41 @@ def test_duplicate_for_targets(): assert ( len([cond for cond in check[1] if cond["value"]["target"] == target]) == 1 ) + + +def test_operation_nonexistent_domain_sets_none(mock_data_service, caplog): + """Operation on a domain not present in datasets should create a column with None values + and log domain-missing message.""" + caplog.set_level("INFO") + df = PandasDataset.from_dict({"DOMAIN": ["LB", "LB"], "LBSEQ": [1, 2]}) + rule = { + "operations": [ + {"operator": "distinct", "domain": "AE", "name": "AESEQ", "id": "$ae_ids"} + ] + } + processor = RuleProcessor(mock_data_service, InMemoryCacheService()) + # Use SDTMDatasetMetadata objects (dict caused AttributeError: unsplit_name) + datasets_metadata = [ + SDTMDatasetMetadata(name="LB", filename="lb.xpt", first_record={"DOMAIN": "LB"}) + ] + result = processor.perform_rule_operations( + rule=rule, + dataset=df.copy(), + domain="LB", + datasets=datasets_metadata, + dataset_path="lb.xpt", + standard="sdtmig", + standard_version="3-1-2", + standard_substandard=None, + ) + # Assert new column created + assert "$ae_ids" in result.columns + # All values should be None + assert result["$ae_ids"].isnull().all() + # Original data preserved + assert result["LBSEQ"].tolist() == [1, 2] + # Log contains domain missing message + assert any( + "Domain AE" in rec.message and "doesn't exist" in rec.message + for rec in caplog.records + ) From aa936e0627cfa7357acbd64bdae9f64ad8459504 Mon Sep 17 00:00:00 2001 From: Samuel Johnson <96841389+SFJohnson24@users.noreply.github.com> Date: Mon, 10 Nov 2025 18:00:52 -0500 Subject: [PATCH 5/7] Cg0288 (#1419) * attribute incorrectly named * tests, operator working * docs --------- Co-authored-by: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> --- .../check_operators/dataframe_operators.py | 6 + cdisc_rules_engine/models/operation_params.py | 1 - .../operations/get_codelist_attributes.py | 182 ++++++-- cdisc_rules_engine/rules_engine.py | 1 - .../utilities/rule_processor.py | 7 +- resources/schema/Operations.md | 13 +- .../test_get_codelist_attributes.py | 402 ++++++++++++++++-- 7 files changed, 530 insertions(+), 82 deletions(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 0e8eda382..9faea2be4 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -683,6 +683,12 @@ def is_contained_by(self, other_value): elif self.is_column_of_iterables(comparison_data): results = vectorized_is_in(target_data, comparison_data) else: + if isinstance(comparison_data, pd.Series): + comparison_data = comparison_data.apply( + lambda x: list(x) if isinstance(x, set) else x + ) + elif isinstance(comparison_data, set): + comparison_data = list(comparison_data) results = target_data.isin(comparison_data) return self.value.convert_to_series(results) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 55907a959..3f264531f 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -36,7 +36,6 @@ class OperationParams: codelists: list = None ct_attribute: str = None ct_package_types: List[str] = None - ct_package: list = None ct_packages: list = None ct_version: str = None ct_package_type: str = None diff --git a/cdisc_rules_engine/operations/get_codelist_attributes.py b/cdisc_rules_engine/operations/get_codelist_attributes.py index bf887a504..23aea7f74 100644 --- a/cdisc_rules_engine/operations/get_codelist_attributes.py +++ b/cdisc_rules_engine/operations/get_codelist_attributes.py @@ -3,6 +3,28 @@ from cdisc_rules_engine.models.dataset import DaskDataset +def _get_ct_package_dask( + row, ct_target, ct_version, ct_packages, standard, substandard +): + if pd.isna(row[ct_version]) or str(row[ct_version]).strip() == "": + return "" + target_val = str(row[ct_target]).strip() if pd.notna(row[ct_target]) else "" + if target_val in ("CDISC", "CDISC CT"): + std = standard.lower() + if "tig" in std: + std = substandard.lower() + if "adam" in std: + prefix = "adamct" + elif "send" in std: + prefix = "sendct" + else: + prefix = "sdtmct" + pkg = f"{prefix}-{row[ct_version]}" + else: + pkg = f"{target_val}-{row[ct_version]}" + return pkg if pkg in ct_packages else "" + + class CodeListAttributes(BaseOperation): """ A class for fetching codelist attributes for a trial summary domain. @@ -42,23 +64,59 @@ def _get_codelist_attributes(self): ct_name = "CT_PACKAGE" # a column for controlled term package names # Get controlled term attribute column name specified in rule ct_attribute = self.params.ct_attribute - + ct_target = self.params.target + ct_version = self.params.ct_version + ct_packages = self.params.ct_packages + df = self.params.dataframe # 2.0 build codelist from cache # ------------------------------------------------------------------- ct_cache = self._get_ct_from_library_metadata( ct_key=ct_name, ct_val=ct_attribute ) - # 3.0 get dataset records - # ------------------------------------------------------------------- - ct_data = self._get_ct_from_dataset(ct_key=ct_name, ct_val=ct_attribute) + def get_ct_package(row): + if pd.isna(row[ct_version]) or str(row[ct_version]).strip() == "": + return "" + target_val = str(row[ct_target]).strip() if pd.notna(row[ct_target]) else "" + # Handle CDISC CT packages + if target_val in ("CDISC", "CDISC CT"): + standard = self.params.standard.lower() + if "tig" in standard: + # use substandard for relevant TIG CT + standard = self.params.standard_substandard.lower() + if "adam" in standard: + prefix = "adamct" + elif "send" in standard: + prefix = "sendct" + else: + prefix = "sdtmct" + pkg = f"{prefix}-{row[ct_version]}" + else: + # Handle external codelists + pkg = f"{target_val}-{row[ct_version]}" + return pkg if pkg in ct_packages else "" - # 4.0 merge the two datasets by CC - # ------------------------------------------------------------------- - cc_key = ct_data[ct_name] - ct_list = ct_cache[(ct_cache[ct_name].isin(cc_key))] - ds_len = self.params.dataframe.len() - result = pd.Series([ct_list[ct_attribute].values[0] for _ in range(ds_len)]) + if isinstance(df, DaskDataset): + row_packages = df.data.apply( + _get_ct_package_dask, + axis=1, + meta=(None, "object"), + args=( + ct_target, + ct_version, + ct_packages, + self.params.standard, + self.params.standard_substandard, + ), + ) + else: + row_packages = df.data.apply(get_ct_package, axis=1) + package_to_codelist = {} + for _, row in ct_cache.iterrows(): + package_to_codelist[row[ct_name]] = row[ct_attribute] + result = row_packages.apply( + lambda pkg: package_to_codelist.get(pkg, set()) if pkg else set() + ) return result def _get_ct_from_library_metadata(self, ct_key: str, ct_val: str): @@ -75,14 +133,16 @@ def _get_ct_from_library_metadata(self, ct_key: str, ct_val: str): retrieved from the cache. """ ct_packages = self.params.ct_packages - ct_term_maps = ( - [] - if ct_packages is None - else [ + ct_term_maps = [] + for package in ct_packages: + parts = package.rsplit("-", 3) + if len(parts) >= 4: + ct_package_type = parts[0] + version = "-".join(parts[1:]) + self.library_metadata._load_ct_package_data(ct_package_type, version) + ct_term_maps.append( self.library_metadata.get_ct_package_metadata(package) or {} - for package in ct_packages - ] - ) + ) # convert codelist to dataframe ct_result = {ct_key: [], ct_val: []} @@ -138,21 +198,81 @@ def _get_ct_from_dataset(self, ct_key: str, ct_val: str): return result def _add_codelist(self, ct_key, ct_val, ct_term_maps, ct_result): - """ - Adds codelist information to the result dictionary. - - Args: - ct_key (str): The key for identifying the codelist. - ct_val (str): The value associated with the codelist. - ct_term_maps (list[dict]): A list of dictionaries containing - codelist information. - ct_result (dict): The dictionary to store the codelist information. - - Returns: - dict: The updated ct_result dictionary. - """ for item in ct_term_maps: ct_result[ct_key].append(item.get("package")) - codes = set(code for code in item.keys() if code != "package") + codes = self._extract_codes_by_attribute(item, ct_val) ct_result[ct_val].append(codes) return ct_result + + def _extract_codes_by_attribute( + self, ct_package_data: dict, ct_attribute: str + ) -> set: + submission_lookup = ct_package_data.get("submission_lookup", {}) + + if ct_attribute == "Term CCODE": + return self._extract_term_codes(submission_lookup) + elif ct_attribute == "Codelist CCODE": + return self._extract_codelist_codes(submission_lookup) + elif ct_attribute in ("Term Value", "Term Submission Value"): + return self._extract_term_values(submission_lookup) + elif ct_attribute == "Codelist Value": + return self._extract_codelist_values(submission_lookup) + elif ct_attribute == "Term Preferred Term": + return self._extract_preferred_terms(submission_lookup, ct_package_data) + else: + raise ValueError(f"Unsupported ct_attribute: {ct_attribute}") + + def _extract_codelist_values(self, submission_lookup: dict) -> set: + codes = set() + for term_name, term_data in submission_lookup.items(): + term_code = term_data.get("term") + if term_code and term_code == "N/A": + codes.add(term_name) + return codes + + def _extract_term_codes(self, submission_lookup: dict) -> set: + codes = set() + for term_data in submission_lookup.values(): + term_code = term_data.get("term") + if term_code and term_code != "N/A": + codes.add(term_code) + return codes + + def _extract_codelist_codes(self, submission_lookup: dict) -> set: + codes = set() + for term_data in submission_lookup.values(): + codelist_code = term_data.get("codelist") + if codelist_code: + codes.add(codelist_code) + return codes + + def _extract_term_values(self, submission_lookup: dict) -> set: + codes = set() + for term_name, term_data in submission_lookup.items(): + term_code = term_data.get("term") + if term_code and term_code != "N/A": + codes.add(term_name) + return codes + + def _extract_preferred_terms( + self, submission_lookup: dict, ct_package_data: dict + ) -> set: + codes = set() + for term_name, term_data in submission_lookup.items(): + if not isinstance(term_data, dict): + continue + term_code = term_data.get("term") + if not term_code or term_code == "N/A": + continue + codelist_id = term_data.get("codelist") + if not codelist_id or codelist_id not in ct_package_data: + continue + codelist_info = ct_package_data[codelist_id] + terms = codelist_info.get("terms", []) + for term in terms: + if term.get("conceptId") == term_code: + pref_term = term.get("preferredTerm") + if pref_term: + codes.add(pref_term) + break + return codes diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index bcb3b37ab..795f8d185 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -92,7 +92,6 @@ def __init__( ) self.data_processor = DataProcessor(self.data_service, self.cache) self.ct_packages = kwargs.get("ct_packages", []) - self.ct_package = kwargs.get("ct_package") self.external_dictionaries = external_dictionaries self.define_xml_path: str = kwargs.get("define_xml_path") self.validate_xml: bool = kwargs.get("validate_xml") diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index c3fdba712..09432c22e 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -389,17 +389,16 @@ def perform_rule_operations( standard_version=standard_version, standard_substandard=standard_substandard, external_dictionaries=external_dictionaries, - ct_version=operation.get("version"), + ct_version=operation.get("ct_version"), ct_package_type=RuleProcessor._ct_package_type_api_name( operation.get("ct_package_type") ), - ct_attribute=operation.get("attribute"), + ct_attribute=operation.get("ct_attribute"), ct_package_types=[ RuleProcessor._ct_package_type_api_name(ct_package_type) for ct_package_type in operation.get("ct_package_types", []) ], - ct_packages=kwargs.get("ct_packages"), - ct_package=kwargs.get("codelist_term_maps"), + ct_packages=operation.get("ct_packages", kwargs.get("ct_packages", [])), attribute_name=operation.get("attribute_name", ""), key_name=operation.get("key_name", ""), key_value=operation.get("key_value", ""), diff --git a/resources/schema/Operations.md b/resources/schema/Operations.md index dd6c12739..a3b8f3a59 100644 --- a/resources/schema/Operations.md +++ b/resources/schema/Operations.md @@ -152,16 +152,25 @@ Returns a list of valid extensible codelist term's submission values. Used for e ### get_codelist_attributes -Fetches attribute values for a codelist specified in a dataset (like TS) +Fetches controlled terminology attribute values from CT packages based on row-specific CT package and version references. + +**Required Parameters:** + +- `ct_attribute`: Attribute to extract - `"Term CCODE"`, `"Codelist CCODE"`, `"Term Value"`, `"Codelist Value"`, or `"Term Preferred Term"` +- `target`: Column containing CT reference (e.g., "TSVCDREF") +- `ct_version`: Column containing CT version (e.g., "TSVCDVER") +- `ct_packages`: List of CT packages to search (e.g., `["sdtmct-2020-03-27"]`) ```yaml -- id: $TERM_CCODES +- id: $VALID_TERM_CODES name: TSVCDREF operator: get_codelist_attributes ct_attribute: Term CCODE ct_version: TSVCDVER + target: TSVCDREF ct_packages: - sdtmct-2020-03-27 + - sdtmct-2022-12-16 ``` ### valid_codelist_dates diff --git a/tests/unit/test_operations/test_get_codelist_attributes.py b/tests/unit/test_operations/test_get_codelist_attributes.py index bc46faafe..97eaf67e6 100644 --- a/tests/unit/test_operations/test_get_codelist_attributes.py +++ b/tests/unit/test_operations/test_get_codelist_attributes.py @@ -6,7 +6,6 @@ ) import pandas as pd import pytest -from typing import List from cdisc_rules_engine.models.operation_params import OperationParams @@ -30,15 +29,91 @@ [ { "package": "sdtmct-2020-03-27", - "C49487": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C25473": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "N": {"codelist": "C49487", "term": "C49487"}, + "Y": {"codelist": "C25473", "term": "C25473"}, + "MAYBE": {"codelist": "C141663", "term": "C141663"}, + }, + "C49487": { + "extensible": False, + "preferredTerm": "No", + "submissionValue": "N", + "terms": [ + { + "conceptId": "C49487", + "submissionValue": "N", + "preferredTerm": "No", + } + ], + }, + "C25473": { + "extensible": False, + "preferredTerm": "Yes", + "submissionValue": "Y", + "terms": [ + { + "conceptId": "C25473", + "submissionValue": "Y", + "preferredTerm": "Yes", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Maybe", + "submissionValue": "MAYBE", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "MAYBE", + "preferredTerm": "Maybe", + } + ], + }, }, { "package": "sdtmct-2022-12-16", - "C141657": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C141656": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "A": {"codelist": "C141657", "term": "C141657"}, + "B": {"codelist": "C141656", "term": "C141656"}, + "C": {"codelist": "C141663", "term": "C141663"}, + }, + "C141657": { + "extensible": False, + "preferredTerm": "Option A", + "submissionValue": "A", + "terms": [ + { + "conceptId": "C141657", + "submissionValue": "A", + "preferredTerm": "Option A", + } + ], + }, + "C141656": { + "extensible": False, + "preferredTerm": "Option B", + "submissionValue": "B", + "terms": [ + { + "conceptId": "C141656", + "submissionValue": "B", + "preferredTerm": "Option B", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Option C", + "submissionValue": "C", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "C", + "preferredTerm": "Option C", + } + ], + }, }, ], PandasDataset, @@ -64,20 +139,97 @@ [ { "package": "sdtmct-2020-03-27", - "C49487": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C25473": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "N": {"codelist": "C49487", "term": "C49487"}, + "Y": {"codelist": "C25473", "term": "C25473"}, + "MAYBE": {"codelist": "C141663", "term": "C141663"}, + }, + "C49487": { + "extensible": False, + "preferredTerm": "No", + "submissionValue": "N", + "terms": [ + { + "conceptId": "C49487", + "submissionValue": "N", + "preferredTerm": "No", + } + ], + }, + "C25473": { + "extensible": False, + "preferredTerm": "Yes", + "submissionValue": "Y", + "terms": [ + { + "conceptId": "C25473", + "submissionValue": "Y", + "preferredTerm": "Yes", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Maybe", + "submissionValue": "MAYBE", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "MAYBE", + "preferredTerm": "Maybe", + } + ], + }, }, { "package": "sdtmct-2022-12-16", - "C141657": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C141656": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "A": {"codelist": "C141657", "term": "C141657"}, + "B": {"codelist": "C141656", "term": "C141656"}, + "C": {"codelist": "C141663", "term": "C141663"}, + }, + "C141657": { + "extensible": False, + "preferredTerm": "Option A", + "submissionValue": "A", + "terms": [ + { + "conceptId": "C141657", + "submissionValue": "A", + "preferredTerm": "Option A", + } + ], + }, + "C141656": { + "extensible": False, + "preferredTerm": "Option B", + "submissionValue": "B", + "terms": [ + { + "conceptId": "C141656", + "submissionValue": "B", + "preferredTerm": "Option B", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Option C", + "submissionValue": "C", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "C", + "preferredTerm": "Option C", + } + ], + }, }, ], PandasDataset, {"C141656", "C141663", "C141657"}, ) + test_set3 = ( ["sdtmct-2020-03-27"], { @@ -91,15 +243,91 @@ [ { "package": "sdtmct-2020-03-27", - "C49487": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C25473": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "N": {"codelist": "C49487", "term": "C49487"}, + "Y": {"codelist": "C25473", "term": "C25473"}, + "MAYBE": {"codelist": "C141663", "term": "C141663"}, + }, + "C49487": { + "extensible": False, + "preferredTerm": "No", + "submissionValue": "N", + "terms": [ + { + "conceptId": "C49487", + "submissionValue": "N", + "preferredTerm": "No", + } + ], + }, + "C25473": { + "extensible": False, + "preferredTerm": "Yes", + "submissionValue": "Y", + "terms": [ + { + "conceptId": "C25473", + "submissionValue": "Y", + "preferredTerm": "Yes", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Maybe", + "submissionValue": "MAYBE", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "MAYBE", + "preferredTerm": "Maybe", + } + ], + }, }, { "package": "sdtmct-2022-12-16", - "C141657": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C141656": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "A": {"codelist": "C141657", "term": "C141657"}, + "B": {"codelist": "C141656", "term": "C141656"}, + "C": {"codelist": "C141663", "term": "C141663"}, + }, + "C141657": { + "extensible": False, + "preferredTerm": "Option A", + "submissionValue": "A", + "terms": [ + { + "conceptId": "C141657", + "submissionValue": "A", + "preferredTerm": "Option A", + } + ], + }, + "C141656": { + "extensible": False, + "preferredTerm": "Option B", + "submissionValue": "B", + "terms": [ + { + "conceptId": "C141656", + "submissionValue": "B", + "preferredTerm": "Option B", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Option C", + "submissionValue": "C", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "C", + "preferredTerm": "Option C", + } + ], + }, }, ], DaskDataset, @@ -125,15 +353,91 @@ [ { "package": "sdtmct-2020-03-27", - "C49487": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C25473": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "N": {"codelist": "C49487", "term": "C49487"}, + "Y": {"codelist": "C25473", "term": "C25473"}, + "MAYBE": {"codelist": "C141663", "term": "C141663"}, + }, + "C49487": { + "extensible": False, + "preferredTerm": "No", + "submissionValue": "N", + "terms": [ + { + "conceptId": "C49487", + "submissionValue": "N", + "preferredTerm": "No", + } + ], + }, + "C25473": { + "extensible": False, + "preferredTerm": "Yes", + "submissionValue": "Y", + "terms": [ + { + "conceptId": "C25473", + "submissionValue": "Y", + "preferredTerm": "Yes", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Maybe", + "submissionValue": "MAYBE", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "MAYBE", + "preferredTerm": "Maybe", + } + ], + }, }, { "package": "sdtmct-2022-12-16", - "C141657": {"extensible": False, "allowed_terms": ["A", "B", "C"]}, - "C141656": {"extensible": False, "allowed_terms": ["X", "Y", "Z"]}, - "C141663": {"extensible": False, "allowed_terms": []}, + "submission_lookup": { + "A": {"codelist": "C141657", "term": "C141657"}, + "B": {"codelist": "C141656", "term": "C141656"}, + "C": {"codelist": "C141663", "term": "C141663"}, + }, + "C141657": { + "extensible": False, + "preferredTerm": "Option A", + "submissionValue": "A", + "terms": [ + { + "conceptId": "C141657", + "submissionValue": "A", + "preferredTerm": "Option A", + } + ], + }, + "C141656": { + "extensible": False, + "preferredTerm": "Option B", + "submissionValue": "B", + "terms": [ + { + "conceptId": "C141656", + "submissionValue": "B", + "preferredTerm": "Option B", + } + ], + }, + "C141663": { + "extensible": False, + "preferredTerm": "Option C", + "submissionValue": "C", + "terms": [ + { + "conceptId": "C141663", + "submissionValue": "C", + "preferredTerm": "Option C", + } + ], + }, }, ], DaskDataset, @@ -154,18 +458,18 @@ def test_get_codelist_attributes( ct_list, ): """ - Unit test for DataProcessor.get_column_order_from_library. - Mocks cache call to return metadata. + Unit test for CodeListAttributes operation. + Tests that the operation returns the correct term codes based on CT version. """ # 1.0 set parameters operation_params.dataframe = dataset_type.from_dict(ts_data) operation_params.domain = "TS" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.ct_attribute: str = "TSVALCD" - operation_params.ct_version: str = "TSVCDVER" + operation_params.ct_attribute = "Term CCODE" # Changed from TSVALCD + operation_params.ct_version = "TSVCDVER" operation_params.target = "TSVCDREF" - operation_params.ct_packages: list = ct_packages + operation_params.ct_packages = ct_packages # 2.0 add CT data to cache cache = InMemoryCacheService.get_instance() @@ -187,16 +491,28 @@ def test_get_codelist_attributes( library_metadata, ) - result: pd.DataFrame = operation.execute() - - variables: List[str] = ct_list - expected: pd.Series = pd.Series( - [ - variables, - variables, - variables, - variables, - variables, - ] - ) - assert result[operation_params.operation_id].equals(expected) + result = operation.execute() + + # Extract the operation_id column which contains the sets + result_series = result[operation_params.operation_id] + + # Expected: Each row gets the ct_list only if its version matches ct_packages + # For test_set1 and test_set3: All rows with version 2020-03-27 should get ct_list + # For test_set2 and test_set4: Only rows 3 and 4 with version 2022-12-16 should get ct_list + + if ct_packages == ["sdtmct-2020-03-27"]: + # Rows 0, 1, 2 have version 2020-03-27 (match) + # Rows 3, 4 have empty version (no match) + expected = pd.Series([ct_list, ct_list, ct_list, set(), set()]) + else: # ct_packages == ["sdtmct-2022-12-16"] + # Rows 0, 1, 2 have version 2020-03-27 (no match) + # Rows 3, 4 have version 2022-12-16 (match) + expected = pd.Series([set(), set(), set(), ct_list, ct_list]) + + # Compare the series - each element should already be a set + assert len(result_series) == len(expected) + for i in range(len(result_series)): + # Both result_series.iloc[i] and expected.iloc[i] should be sets already + assert ( + result_series.iloc[i] == expected.iloc[i] + ), f"Row {i}: {result_series.iloc[i]} != {expected.iloc[i]}" From f46237d1beecdbb47eed3926208a9d95845984b4 Mon Sep 17 00:00:00 2001 From: Samuel Johnson <96841389+SFJohnson24@users.noreply.github.com> Date: Tue, 11 Nov 2025 13:32:11 -0500 Subject: [PATCH 6/7] version (#1427) * version * fix --- cdisc_rules_engine/utilities/rule_processor.py | 2 +- resources/schema/Operations.json | 2 +- resources/schema/Operations.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 09432c22e..0dc9b15ec 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -389,7 +389,7 @@ def perform_rule_operations( standard_version=standard_version, standard_substandard=standard_substandard, external_dictionaries=external_dictionaries, - ct_version=operation.get("ct_version"), + ct_version=operation.get("version"), ct_package_type=RuleProcessor._ct_package_type_api_name( operation.get("ct_package_type") ), diff --git a/resources/schema/Operations.json b/resources/schema/Operations.json index e89fc7757..2cf1ce254 100644 --- a/resources/schema/Operations.json +++ b/resources/schema/Operations.json @@ -93,7 +93,7 @@ "name", "ct_attribute", "ct_packages", - "ct_version" + "version" ], "type": "object" }, diff --git a/resources/schema/Operations.md b/resources/schema/Operations.md index a3b8f3a59..a73a63614 100644 --- a/resources/schema/Operations.md +++ b/resources/schema/Operations.md @@ -158,7 +158,7 @@ Fetches controlled terminology attribute values from CT packages based on row-sp - `ct_attribute`: Attribute to extract - `"Term CCODE"`, `"Codelist CCODE"`, `"Term Value"`, `"Codelist Value"`, or `"Term Preferred Term"` - `target`: Column containing CT reference (e.g., "TSVCDREF") -- `ct_version`: Column containing CT version (e.g., "TSVCDVER") +- `version`: Column containing CT version (e.g., "TSVCDVER") - `ct_packages`: List of CT packages to search (e.g., `["sdtmct-2020-03-27"]`) ```yaml @@ -166,7 +166,7 @@ Fetches controlled terminology attribute values from CT packages based on row-sp name: TSVCDREF operator: get_codelist_attributes ct_attribute: Term CCODE - ct_version: TSVCDVER + version: TSVCDVER target: TSVCDREF ct_packages: - sdtmct-2020-03-27 From ea7c51c3ee091b326d508bd4b84c9d5f76b580d1 Mon Sep 17 00:00:00 2001 From: Richard Marshall <113628824+ASL-rmarshall@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:37:49 +0000 Subject: [PATCH 7/7] Add JSON Schema Check to Rule_Type.json (#1431) --- resources/schema/Rule_Type.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/schema/Rule_Type.json b/resources/schema/Rule_Type.json index ca1270b90..332345be2 100644 --- a/resources/schema/Rule_Type.json +++ b/resources/schema/Rule_Type.json @@ -34,6 +34,10 @@ "const": "Domain Presence Check", "title": "Content domain presence at study level" }, + { + "const": "JSON Schema Check", + "title": "Apply JSON schema validation to a JSON file" + }, { "const": "JSONata", "title": "Apply a JSONata query to a JSON file"