Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
24183b1
715: USDM schema validation
alexfurmenkov Oct 6, 2025
3ae47c0
add dataset builder
alexfurmenkov Oct 8, 2025
57e5096
715: add USDM 3.0.0 schema
alexfurmenkov Oct 8, 2025
a175ebe
add dataset
alexfurmenkov Oct 9, 2025
adefb06
add minor changes
alexfurmenkov Oct 9, 2025
d29a8fd
add test for 715 issue
alexfurmenkov Oct 14, 2025
7bc9137
add unit test
alexfurmenkov Oct 14, 2025
be7eaa8
add correct usdm 3.0 schema
alexfurmenkov Oct 15, 2025
221ab84
builder is modified
alexfurmenkov Oct 15, 2025
ad30568
reverted operations
alexfurmenkov Oct 15, 2025
05ba92c
modified rule
alexfurmenkov Oct 16, 2025
b2a444a
Merge remote-tracking branch 'origin/715-usdm-schema-validation' into…
alexfurmenkov Oct 16, 2025
4ef41de
fix test for CoreIssue715
alexfurmenkov Oct 19, 2025
9d4e159
fix unit test
alexfurmenkov Oct 21, 2025
c98bbe6
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engin…
alexfurmenkov Oct 21, 2025
de0a869
Merge branch 'main' into 715-usdm-schema-validation
RamilCDISC Oct 24, 2025
5830d67
update JsonSchemaCheckDatasetBuilder to report more details
alexfurmenkov Oct 28, 2025
2938baa
add json and pkl file conversion utility
alexfurmenkov Oct 28, 2025
e65c1e1
Merge branch '715-usdm-schema-validation' of https://github.com/cdisc…
alexfurmenkov Oct 28, 2025
5275f87
final updates to JsonSchemaCheckDatasetBuilder
alexfurmenkov Oct 29, 2025
0f0969f
interface updates in JsonSchemaCheckDatasetBuilder
alexfurmenkov Oct 29, 2025
a8eea73
update regression tests
alexfurmenkov Oct 29, 2025
310826e
fix unit test for JsonSchemaCheckDatasetBuilder
alexfurmenkov Oct 29, 2025
9e53c1d
update regression tests
alexfurmenkov Oct 30, 2025
da7dfbf
address type errors in report generation
alexfurmenkov Oct 30, 2025
badf8dd
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engin…
alexfurmenkov Oct 30, 2025
f598c17
validate each dataset from the json file
alexfurmenkov Oct 31, 2025
3885943
update dataset builder
alexfurmenkov Oct 31, 2025
a385844
Added process_error for smart filtering of errors
ASL-rmarshall Oct 31, 2025
3a96787
Fix entity resolution
ASL-rmarshall Oct 31, 2025
576fbd0
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engin…
alexfurmenkov Nov 2, 2025
9c9904a
Merge remote-tracking branch 'origin/715-usdm-schema-validation' into…
alexfurmenkov Nov 2, 2025
e21c395
integrate dataset caching
alexfurmenkov Nov 2, 2025
25b661d
Adjust context for required/additional property errors
ASL-rmarshall Nov 2, 2025
a05887a
update regression tests
alexfurmenkov Nov 3, 2025
eab29f3
fix unit tests for JsonSchemaCheckDatasetBuilder
alexfurmenkov Nov 3, 2025
3d4ec2f
Merge branch '715-usdm-schema-validation' of https://github.com/cdisc…
alexfurmenkov Nov 3, 2025
ec7722f
verify cache service call
alexfurmenkov Nov 3, 2025
281040f
Merge branch 'main' into 715-usdm-schema-validation
gerrycampion Nov 3, 2025
ac09454
Merge branch 'main' into 715-usdm-schema-validation
gerrycampion Nov 4, 2025
3ccb5a1
fix typo in Rule_Type.md
alexfurmenkov Nov 4, 2025
7192612
add instructions for updating the schema
alexfurmenkov Nov 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -534,3 +534,17 @@ Then run normally: `core.exe validate -rest -of -config -commands
---

**Note:** Setting `DATASET_SIZE_THRESHOLD=0` tells the engine to use Dask processing for all datasets regardless of size, size threshold defaults to 1/4 of available RAM so datasets larger than this will use Dask. See env.example to see what the CLI .env file should look like

## Updating USDM JSON Schema

Currently, the engine supports USDM JSON Schema validation against versions 3.0 and 4.0. The schema definition files are located at:

- `resources/cache/usdm-3-0-schema.pkl`
- `resources/cache/usdm-4-0-schema.pkl`

These schema definitions were derived from the OpenAPI specs located in the `https://github.com/cdisc-org/DDF-RA` repo, so in order to update the existing schemas or create a new one, run:

1. `git --no-pager --git-dir DDF-RA.git show --format=format:"%B" {required tag (example: v3.0.0)}:Deliverables/API/USDM_API.json > USDM_API_{required version}.json`
2. Use `scripts/openapi-to-json.py` script to convert the OpenAPI spec to JSON schema definition
3. Use `scripts/json_pkl_converter.py` script to convert the JSON file to `.pkl`
4. Place the `.pkl` file in `resources/cache`
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# flake8: noqa
from typing import Type

from cdisc_rules_engine.dataset_builders.json_schema_check_dataset_builder import (
JsonSchemaCheckDatasetBuilder,
)
from cdisc_rules_engine.dataset_builders.jsonata_dataset_builder import (
JSONataDatasetBuilder,
)
Expand Down Expand Up @@ -77,6 +80,7 @@ class DatasetBuilderFactory(FactoryInterface):
RuleTypes.VALUE_CHECK_WITH_DATASET_METADATA.value: ValueCheckDatasetMetadataDatasetBuilder,
RuleTypes.VALUE_CHECK_WITH_VARIABLE_METADATA.value: ValueCheckVariableMetadataDatasetBuilder,
RuleTypes.JSONATA.value: JSONataDatasetBuilder,
RuleTypes.JSON_SCHEMA_CHECK.value: JsonSchemaCheckDatasetBuilder,
}

@classmethod
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import copy
import json
from copy import deepcopy
import re

from jsonschema import validators, exceptions
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
from cdisc_rules_engine.models.dataset import DatasetInterface
from cdisc_rules_engine.utilities.utils import tag_source


class JsonSchemaCheckDatasetBuilder(BaseDatasetBuilder):
    """Builds a dataset of JSON Schema validation errors for a JSON document.

    The whole JSON document served by ``data_service`` is validated once
    against the schema stored in the library metadata
    (``standard_schema_definition``).  The flattened error list is cached per
    dataset path, then filtered down to the rows whose ``instanceType``
    matches the requested dataset.
    """

    # Column-oriented template: one parallel list per output column.
    dataset_template = {
        "json_path": [],
        "error_attribute": [],
        "error_value": [],
        "validator": [],
        "validator_value": [],
        "message": [],
        "dataset": [],
        "id": [],
        "_path": [],
    }

    def build(self, **kwargs) -> DatasetInterface:
        """Entry point used by the dataset builder factory."""
        return self.get_dataset()

    def _get_cached_dataset(self) -> dict[str, list[str]]:
        """Validate the JSON document once and cache the flattened error list.

        Returns:
            A column-oriented dict shaped like ``dataset_template``.
        """
        cache_key: str = (
            f"json_schema_validation_result_{self.data_service.dataset_path}"
        )
        if cached := self.cache.get(cache_key):
            return cached

        schema = self.library_metadata.standard_schema_definition
        cls = validators.validator_for(schema)
        cls.check_schema(schema)
        validator = cls(schema)

        errtree = exceptions.ErrorTree(validator.iter_errors(self.data_service.json))

        errlist = copy.deepcopy(self.dataset_template)
        self.list_errors(errtree, errlist)

        self.cache.add(cache_key, errlist)

        return errlist

    def get_dataset(self, **kwargs) -> DatasetInterface:
        """Return the validation errors belonging to the current dataset."""
        dataset = self._get_cached_dataset()
        # Transpose the column-oriented dict into row records.
        records = [
            {key: dataset[key][i] for key in dataset}
            for i in range(len(next(iter(dataset.values()))))
        ]
        filtered = [
            row for row in records if row["dataset"] == self.dataset_metadata.name
        ]
        return tag_source(
            (
                self.dataset_implementation.from_records(filtered, **kwargs)
                if filtered
                # No errors for this dataset: emit an empty, correctly-typed frame.
                else self.dataset_implementation.from_dict(
                    self.dataset_template, **kwargs
                )
            ),
            self.dataset_metadata,
        )

    def list_errors(self, tree: exceptions.ErrorTree, errlist: dict[str, list]):
        """Depth-first walk of an ErrorTree, appending every error to errlist."""
        if tree.errors:
            for ve in tree.errors.values():
                self.process_error(error=ve, errlist=errlist)

        # Recurse into child trees via the public ErrorTree mapping API
        # (iteration yields child keys; tree[key] is the subtree) instead of
        # reaching into the private _contents attribute.
        for key in tree:
            self.list_errors(tree=tree[key], errlist=errlist)

    def get_instance_by_path(self, instance: dict, path_list: list) -> dict:
        """Navigate *instance* along *path_list* and return the nested value.

        Returns a reference into *instance* (no copy is made, unlike before —
        copying the whole document per error was wasteful); callers must
        treat the result as read-only.
        """
        _inst = instance
        for p in path_list:
            _inst = _inst[p]
        return _inst

    def get_parent_path(self, path_list: list):
        """Return path_list without its last attribute (and array index, if any)."""
        path_list = list(path_list)
        if not path_list:
            # Root-level error: there is no parent segment to strip.
            return []
        return path_list[0 : (-1 - int(isinstance(path_list[-1], int)))]

    def parse_error(
        self,
        error: exceptions.ValidationError,
        errlist: dict[str, list],
        errpath: list,
    ):
        """Append one validation error to errlist, one value per column.

        *errpath* points at the enclosing entity, whose ``instanceType`` and
        ``id`` attribute the error to a dataset.
        """
        errctx = self.get_instance_by_path(self.data_service.json, errpath)
        # For required/additionalProperties errors the offending attribute
        # names only appear in the message text; otherwise use the last path
        # segment, rendered as "name[idx]" for array items.
        if error.validator in ["required", "additionalProperties"]:
            errattr = self.get_attributes_from_message(error.message)
        elif not error.absolute_path:
            # Root-level error: there is no attribute name in the path.
            errattr = ""
        elif isinstance(error.absolute_path[-1], int):
            parent = error.absolute_path[-2] if len(error.absolute_path) > 1 else ""
            errattr = "{}[{}]".format(parent, error.absolute_path[-1])
        else:
            errattr = error.absolute_path[-1]
        errlist["json_path"].append(error.json_path)
        errlist["error_attribute"].append(errattr)
        errlist["error_value"].append(json.dumps(error.instance))
        errlist["validator"].append(error.validator)
        errlist["validator_value"].append(str(error.validator_value))
        # Replace long instance dumps inside the message with a placeholder;
        # 11 == len("[Value of ]"), so short values stay verbatim.
        errlist["message"].append(
            error.message.replace(str(error.instance), f"[Value of {errattr}]")
            if len(str(error.instance)) > len(errattr) + 11
            and str(error.instance) in error.message
            else error.message
        )
        errlist["dataset"].append(errctx.get("instanceType", "") if errctx else "")
        errlist["id"].append(errctx.get("id", "") if errctx else "")
        errlist["_path"].append("/" + "/".join(map(str, errpath)))

    def list_context_errors(
        self,
        error: exceptions.ValidationError,
        errlist: dict[str, list],
        skip_subschemas=None,
    ):
        """Process sub-errors from ``error.context``.

        *skip_subschemas* holds the leading schema_path segments (anyOf
        branch indexes) of branches that should be ignored.  Defaults to
        "skip nothing"; a None default replaces the previous mutable-list
        default argument.
        """
        skip = skip_subschemas or []
        if error.context:
            for vec in error.context:
                if not skip or list(vec.schema_path)[0] not in skip:
                    self.process_error(error=vec, errlist=errlist)

    def process_error(
        self, error: exceptions.ValidationError, errlist: dict[str, list]
    ):
        """Record a single error, unfolding anyOf branches selectively.

        For anyOf errors, branches that fail only because the instance's
        ``instanceType`` (a known referenced type name) does not match that
        branch's const, or because a "type: null" alternative failed, are
        noise — the instance simply belongs to a different branch — so those
        branch indexes are collected and skipped.
        """
        if error.validator == "anyOf":
            skip_ssi = []
            refs = [
                ss["$ref"].split("/")[-1]
                for ss in error.schema["anyOf"]
                if "$ref" in ss
            ]
            for vec in error.context:
                if (
                    list(vec.relative_path) == ["instanceType"]
                    and vec.validator == "const"
                    and vec.instance in refs
                ) or (
                    list(vec.relative_path) == []
                    and vec.validator == "type"
                    and vec.validator_value == "null"
                ):
                    skip_ssi.append(list(vec.schema_path)[0])
            self.list_context_errors(
                error=error, errlist=errlist, skip_subschemas=skip_ssi
            )
        else:
            self.parse_error(
                error=error,
                errlist=errlist,
                errpath=(
                    # required/additionalProperties already point at the
                    # enclosing object; everything else needs its parent.
                    error.absolute_path
                    if error.validator in ["required", "additionalProperties"]
                    else self.get_parent_path(error.absolute_path)
                ),
            )
            self.list_context_errors(error=error, errlist=errlist)

    def get_attributes_from_message(self, message: str) -> list[str]:
        """Extract single-quoted attribute names from a jsonschema message."""
        return re.findall(r"'([^, ]+)'", message)
1 change: 1 addition & 0 deletions cdisc_rules_engine/enums/rule_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ class RuleTypes(BaseEnum):
)
VALUE_CHECK_WITH_DATASET_METADATA = "Value Check with Dataset Metadata"
VALUE_CHECK_WITH_VARIABLE_METADATA = "Value Check with Variable Metadata"
JSON_SCHEMA_CHECK = "JSON Schema Check"
10 changes: 10 additions & 0 deletions cdisc_rules_engine/models/library_metadata_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class LibraryMetadataContainer:
def __init__(
self,
standard_metadata={},
standard_schema_definition={},
model_metadata={},
ct_package_metadata={},
variable_codelist_map={},
Expand All @@ -15,6 +16,7 @@ def __init__(
cache_path: str = "",
):
self._standard_metadata = standard_metadata
self._standard_schema_definition = standard_schema_definition
self._model_metadata = model_metadata
self._ct_package_metadata = ct_package_metadata
self._variable_codelist_map = variable_codelist_map
Expand All @@ -30,6 +32,14 @@ def standard_metadata(self):
def standard_metadata(self, value):
self._standard_metadata = value

@property
def standard_schema_definition(self):
    """JSON Schema definition for the configured standard (set at init);
    read by JsonSchemaCheckDatasetBuilder when validating JSON documents."""
    return self._standard_schema_definition

@standard_schema_definition.setter
def standard_schema_definition(self, value):
    self._standard_schema_definition = value

@property
def variable_codelist_map(self):
return self._variable_codelist_map
Expand Down
Binary file added resources/cache/usdm-3-0-schema.pkl
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the source for creating these files? If possible, the logic for creating these should be added to update-cache call.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A combination of https://github.com/cdisc-org/DDF-RA/blob/main/Deliverables/API/USDM_API.json and this script:

import os
import argparse
import json

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", help="USDM OpenAPI JSON file")
    args = parser.parse_args()
    return args

args = parse_arguments()

filename = os.path.split(args.input_file)[-1]

outfname = "".join(filename.split(".")[0])+"_schemas"

with open(args.input_file) as f:
    openapi = json.load(f)

jschema = {"$defs": {}}

def replace_deep(data, a, b):
    if isinstance(data, str):
        return data.replace(a, b)
    elif isinstance(data, dict):
        return {k: replace_deep(v, a, b) for k, v in data.items()}
    elif isinstance(data, list):
        return [replace_deep(v, a, b) for v in data]
    else:
        # nothing to do?
        return data
    
for sn,sd in openapi["components"]["schemas"].items():
    if sn == "Wrapper-Input":
        for k, v in sd.items():
            jschema[k] = replace_deep(replace_deep(v,"components/schemas","$defs"),"-Input","")
    elif not sn.endswith("-Output"):
        #jschema["$defs"][sn] = to_json_schema(replace_deep(sd,"components/schemas","$defs"))
        jschema["$defs"][sn.replace("-Input","")] = replace_deep(replace_deep(sd,"components/schemas","$defs"),"-Input","")

for v in jschema["$defs"].values():
    v.update({"additionalProperties": False})
    for pn, pd in v.get("properties", {}).items():
        if pn in v.get("required", []) and pd.get("type","") == "array":
            pd.update({"minItems": 1})

with open(os.path.join(''.join(os.path.split(args.input_file)[0:-1]),outfname+'.json'), "w", encoding="utf-8") as f:
    json.dump(jschema, f, ensure_ascii=False, indent=4)

Btw, this command was used to retrieve the 3rd version of the schema:
git --no-pager --git-dir DDF-RA.git show --format=format:"%B" v3.0.0:Deliverables/API/USDM_API.json > USDM_API_v3-0-0.json

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gerrycampion I am not sure if it should be placed into the update cache call since the schema definition is being loaded into the LibraryMetadataContainer inside of the get_library_metadata_from_cache function:

library_metadata: LibraryMetadataContainer = get_library_metadata_from_cache(args)

https://github.com/cdisc-org/cdisc-rules-engine/pull/1375/files#diff-645421107f064022b54581bdaf972ee1baa090acef121e83d4b27be3f50ed802R146

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For consistency, you could - and probably should - also get the v4 spec (and any subsequent versions) in the same way as you get the v3 spec:

git --no-pager --git-dir DDF-RA.git show --format=format:"%B" v4.0.0:Deliverables/API/USDM_API.json > USDM_API_v4-0-0.json

(It's worth noting that git ... show --format=format:"%B" is used because this does a binary transfer that avoids any platform-specific encoding).

Though ideally the specs would be in the Library already...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ASL-rmarshall, Yes, I was using the git command to also retrieve the 4th version

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexfurmenkov If you can't put it in the update-cache, I think you should at least add a github action for it, like "prerelease-update-usdm-schema.yml"

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated readme with USDM Schema update instructions. We've agreed that this is going to be updated rarely, so just manual instructions should be good for now

Binary file not shown.
Binary file added resources/cache/usdm-4-0-schema.pkl
Binary file not shown.
14 changes: 14 additions & 0 deletions resources/schema/Rule_Type.md
Original file line number Diff line number Diff line change
Expand Up @@ -556,3 +556,17 @@ Attach define xml metadata at variable level
- `library_variable_data_type`
- `library_variable_ccode`
- `variable_has_empty_values`

## JSON Schema Check

#### Columns:

- `json_path`
- `error_attribute`
- `error_value`
- `validator`
- `validator_value`
- `message`
- `dataset`
- `id`
- `_path`
47 changes: 47 additions & 0 deletions scripts/json_pkl_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import argparse
import os
import json
import pickle


def parse_arguments():
    """Parse command-line arguments: -i/--input_file is required."""
    arg_parser = argparse.ArgumentParser(
        description="Convert between JSON and PKL files."
    )
    arg_parser.add_argument(
        "-i", "--input_file", required=True, help="Input file (.json or .pkl)"
    )
    return arg_parser.parse_args()


def json_to_pkl(json_path, pkl_path):
    """Load a JSON file and serialize its contents to a pickle file."""
    with open(json_path, "r", encoding="utf-8") as source:
        payload = json.load(source)
    with open(pkl_path, "wb") as target:
        pickle.dump(payload, target)
    print(f"Converted {json_path} to {pkl_path}")


def pkl_to_json(pkl_path, json_path):
    """Load a pickle file and write its contents out as pretty-printed JSON."""
    with open(pkl_path, "rb") as source:
        payload = pickle.load(source)
    with open(json_path, "w", encoding="utf-8") as target:
        json.dump(payload, target, ensure_ascii=False, indent=2)
    print(f"Converted {pkl_path} to {json_path}")


def main():
    """CLI entry point: dispatch on the input file's extension."""
    input_path = parse_arguments().input_file
    stem, extension = os.path.splitext(input_path)
    extension = extension.lower()
    if extension == ".json":
        json_to_pkl(input_path, stem + ".pkl")
    elif extension == ".pkl":
        pkl_to_json(input_path, stem + ".json")
    else:
        print("Unsupported file extension. Please provide a .json or .pkl file.")


if __name__ == "__main__":
main()
59 changes: 59 additions & 0 deletions scripts/openapi-to-json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import argparse
import json


def parse_arguments():
    """Parse CLI arguments: -i/--input_file, the USDM OpenAPI JSON file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", help="USDM OpenAPI JSON file")
    args = parser.parse_args()
    return args


def replace_deep(data, a, b):
    """Recursively replace substring *a* with *b* in every string in *data*.

    Dicts and lists are rebuilt; any other type is returned unchanged.
    """
    if isinstance(data, str):
        return data.replace(a, b)
    if isinstance(data, dict):
        return {k: replace_deep(v, a, b) for k, v in data.items()}
    if isinstance(data, list):
        return [replace_deep(v, a, b) for v in data]
    return data


def openapi_to_json_schema(openapi):
    """Convert USDM OpenAPI component schemas into one JSON Schema document.

    - The "Wrapper-Input" schema's top-level keys become the root schema.
    - Every other schema not ending in "-Output" goes under "$defs" with the
      "-Input" suffix stripped and "components/schemas" refs rewritten to
      "$defs".
    - Each $defs entry forbids additional properties, and required array
      properties gain "minItems": 1.
    """
    jschema = {"$defs": {}}

    for sn, sd in openapi["components"]["schemas"].items():
        if sn == "Wrapper-Input":
            for k, v in sd.items():
                jschema[k] = replace_deep(
                    replace_deep(v, "components/schemas", "$defs"), "-Input", ""
                )
        elif not sn.endswith("-Output"):
            jschema["$defs"][sn.replace("-Input", "")] = replace_deep(
                replace_deep(sd, "components/schemas", "$defs"), "-Input", ""
            )

    for v in jschema["$defs"].values():
        v.update({"additionalProperties": False})
        for pn, pd in v.get("properties", {}).items():
            if pn in v.get("required", []) and pd.get("type", "") == "array":
                pd.update({"minItems": 1})

    return jschema


def main():
    """Read the OpenAPI spec, convert it, and write *_schemas.json beside it.

    Previously all of this ran at module import time; wrapping it in main()
    with a __main__ guard keeps the module importable without side effects.
    """
    args = parse_arguments()

    # encoding added: the previous open() relied on the platform default.
    with open(args.input_file, encoding="utf-8") as f:
        openapi = json.load(f)

    jschema = openapi_to_json_schema(openapi)

    directory, filename = os.path.split(args.input_file)
    # "".join(filename.split(".")[0]) was a no-op join on a string; simplified.
    outfname = filename.split(".")[0] + "_schemas"

    with open(
        os.path.join(directory, outfname + ".json"), "w", encoding="utf-8"
    ) as f:
        json.dump(jschema, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
Loading
Loading