Merged
36 changes: 36 additions & 0 deletions .github/workflows/check-schema-markdown.yml
@@ -0,0 +1,36 @@
name: Check Schema has been merged with Markdown Descriptions
on:
workflow_dispatch:
pull_request:
paths:
- "resources/schema/rule/**"
push:
paths:
- "resources/schema/rule/**"
branches:
- main
permissions:
contents: read

jobs:
check-schema-markdown:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Run schema markdown merge script
run: |
python scripts/merge_schema_markdown.py
- uses: actions/setup-node@v6
- name: Run Prettier to format merged schema files
run: |
npm i prettier
npx prettier resources/schema/rule-merged/*.json --write
- uses: CatChen/check-git-status-action@7b45cb4ce3e00a8bce4910dc2d5f2785235a6d7e # v2.1.2
with:
fail-if-not-clean: true
request-changes-if-not-clean: ${{ github.event_name == 'pull_request' }}
request-changes-comment: Updated schema has not been merged with markdown descriptions. Please run the "Merge Schema with Markdown Descriptions" workflow to update the merged schema files.
targets: resources/schema/rule-merged
7 changes: 0 additions & 7 deletions .github/workflows/merge-schema-markdown.yml
@@ -1,13 +1,6 @@
name: Merge Schema with Markdown Descriptions

on:
workflow_dispatch:
# Only on push to main to avoid overwriting checks in PRs
push:
paths:
- "resources/schema/rule/**"
branches:
- main

jobs:
merge-schema-markdown:
12 changes: 12 additions & 0 deletions resources/schema/rule-merged/MetaVariables.json
@@ -46,6 +46,14 @@
"const": "define_dataset_structure",
"markdownDescription": "\nItemGroupDef.Structure\n"
},
{
"const": "define_dataset_variables",
"markdownDescription": "\nList of ItemGroupDef.ItemRef.ItemDef.Name in XML document order (as they appear in ItemRef, no sorting applied)\n"
},
{
"const": "define_dataset_variable_order",
"markdownDescription": "\nList of ItemGroupDef.ItemRef.ItemDef.Name, sorted by ItemRef OrderNumber when present, otherwise by XML document order\n"
},
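The distinction between `define_dataset_variables` (raw ItemRef document order) and `define_dataset_variable_order` (sorted by OrderNumber when present) can be sketched in Python. This is illustrative only: the dict shape and function name are hypothetical, not the engine's implementation.

```python
def dataset_variable_order(item_refs):
    """Return variable names sorted by ItemRef OrderNumber when present,
    falling back to XML document order otherwise (hypothetical sketch)."""
    if any("OrderNumber" in ref for ref in item_refs):
        # Stable sort on (OrderNumber, document position): refs without an
        # OrderNumber sink to the end while keeping their document order.
        indexed = list(enumerate(item_refs))
        indexed.sort(key=lambda pair: (pair[1].get("OrderNumber", float("inf")), pair[0]))
        return [ref["Name"] for _, ref in indexed]
    return [ref["Name"] for ref in item_refs]  # pure XML document order
```

With no OrderNumber anywhere, the result is identical to `define_dataset_variables`.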
{
"const": "define_variable_allowed_terms",
"markdownDescription": "\nItemGroupDef.ItemDef.CodeList.CodeListItem.Decode.TranslatedText\n"
@@ -226,6 +234,10 @@
"const": "variable_has_empty_values",
"markdownDescription": "\nTrue/False value indicating whether a variable has any empty values\n"
},
{
"const": "variable_is_empty",
"markdownDescription": "\nTrue/False value indicating whether a variable is completely empty\n"
},
{
"const": "variable_label",
"markdownDescription": "\nVariable long label\n"
22 changes: 19 additions & 3 deletions resources/schema/rule-merged/Operations.json
@@ -253,6 +253,16 @@
"required": ["id", "operator", "name"],
"type": "object"
},
{
"properties": {
"operator": {
"const": "minus",
"markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. Uses [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A \u2216 B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n```\n"
}
},
"required": ["id", "operator", "name", "subtract"],
"type": "object"
},
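The `minus` operator's semantics (order-preserving set difference over two operation results) amount to a one-liner; this sketch is illustrative, not the engine's code.

```python
def minus(name_values, subtract_values=None):
    """Order-preserving set difference: keep elements of `name_values`
    that do not appear in `subtract_values` (A \ B)."""
    remove = set(subtract_values or [])  # empty/missing subtract removes nothing
    return [v for v in name_values if v not in remove]
```

Order comes from the first list, matching the description above.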
{
"properties": {
"operator": {
@@ -277,7 +287,7 @@
"properties": {
"operator": {
"const": "record_count",
"markdownDescription": "\nIf no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.\n\nIf both filter and group are provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter that also match each unique set of the grouping variables.\n\n**Wildcard Filtering:** Filter values ending with % will match any records where the column value starts with the specified prefix. For example, RACE% will match RACE1, RACE2, RACE3, etc. This is useful for matching related variables with numeric or alphabetic suffixes.\n\n**Regex Transformation:** If regex is provided along with group, the regex pattern will be applied to transform grouping column values before grouping. The regex is only applied to columns where the pattern matches the data type. For example, using regex `^\\d{4}-\\d{2}-\\d{2}` on a column containing `2022-01-14T08:00` will extract `2022-01-14` for grouping purposes.\n\nIf group is provided, group_aliases may also be provided to assign new grouping variable names so that results grouped by the values in one set of grouping variables can be merged onto a dataset according to the same grouping value(s) stored in different set of grouping variables. When both group and group_aliases are provided, columns are renamed according to corresponding list position (i.e., the 1st column in group is renamed to the 1st column in group_aliases, etc.). If there are more columns listed in group than in group_aliases, only the group columns with corresponding group_aliases columns will be renamed. If there are more columns listed in group_aliases than in group, the extra column names in group_aliases will be ignored.\n\nExample: return the number of records in a dataset.\n\n```yaml\n- operator: record_count\n id: $records_in_dataset\n```\n\nExample: return the number of records where STUDYID = \"CDISC01\" and FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_cdisc01_records_in_dataset\n filter:\n STUDYID: \"CDISC01\"\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and timing variables, extracting only the date portion from datetime values.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid_date\n group:\n - USUBJID\n - --TESTCD\n - $TIMING_VARIABLES\n regex: \"^\\d{4}-\\d{2}-\\d{2}\"\n```\n\nExample: return the number of records where QNAM starts with \"RACE\" (matches RACE1, RACE2, RACE3, etc.) per USUBJID.\n\n```yaml\n- operation: record_count\n id: $race_records_in_dataset\n filter:\n QNAM: \"RACE%\"\n group:\n - \"USUBJID\"\n```\n\nExample: return the number of records grouped by USUBJID.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid\n group:\n - USUBJID\n```\n\nExample: return the number of records grouped by USUBJID where FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_records_per_usubjid\n group:\n - USUBJID\n filter:\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and IDVARVAL where QNAM = \"TEST1\" and IDVAR = \"GROUPID\", renaming the IDVARVAL column to GROUPID for subsequent merging.\n\n```yaml\n- operator: record_count\n id: $test1_records_per_usubjid_groupid\n group:\n - USUBJID\n - IDVARVAL\n filter:\n QNAM: \"TEST1\"\n IDVAR: \"GROUPID\"\n group_aliases:\n - USUBJID\n - GROUPID\n```\n\nExample: Group the StudyIdentifier dataset by parent_id and merge the result back to the context dataset StudyVersion using StudyVersion.id == StudyIdentifier.parent_id\n\n```yaml\nScope:\n Entities:\n Include:\n - StudyVersion\nOperations:\n - domain: StudyIdentifier\n filter:\n parent_entity: \"StudyVersion\"\n parent_rel: \"studyIdentifiers\"\n rel_type: \"definition\"\n studyIdentifierScope.organizationType.code: \"C70793\"\n studyIdentifierScope.organizationType.codeSystem: \"http://www.cdisc.org\"\n group:\n - parent_id\n group_aliases:\n - id\n id: $num_sponsor_ids\n operator: record_count\n```\n"
"markdownDescription": "\nIf no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. Filter values can end with the wildcard `&`, which matches all records whose value begins with the given prefix (see the 4th example below). If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.\n\nIf both filter and group are provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter that also match each unique set of the grouping variables.\n\n**Wildcard Filtering:** Filter values ending with & will match any records where the column value starts with the specified prefix. For example, RACE& will match RACE1, RACE2, RACE3, etc. This is useful for matching related variables with numeric or alphabetic suffixes.\n\n**Regex Transformation:** If regex is provided along with group, the regex pattern will be applied to transform grouping column values before grouping. The regex is only applied to columns where the pattern matches the data type. For example, using regex `^\\d{4}-\\d{2}-\\d{2}` on a column containing `2022-01-14T08:00` will extract `2022-01-14` for grouping purposes.\n\nIf group is provided, group_aliases may also be provided to assign new grouping variable names so that results grouped by the values in one set of grouping variables can be merged onto a dataset according to the same grouping value(s) stored in a different set of grouping variables. When both group and group_aliases are provided, columns are renamed according to corresponding list position (i.e., the 1st column in group is renamed to the 1st column in group_aliases, etc.). If there are more columns listed in group than in group_aliases, only the group columns with corresponding group_aliases columns will be renamed. If there are more columns listed in group_aliases than in group, the extra column names in group_aliases will be ignored.\n\nExample: return the number of records in a dataset.\n\n```yaml\n- operator: record_count\n id: $records_in_dataset\n```\n\nExample: return the number of records where STUDYID = \"CDISC01\" and FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_cdisc01_records_in_dataset\n filter:\n STUDYID: \"CDISC01\"\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and timing variables, extracting only the date portion from datetime values.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid_date\n group:\n - USUBJID\n - --TESTCD\n - $TIMING_VARIABLES\n regex: \"^\\d{4}-\\d{2}-\\d{2}\"\n```\n\nExample: return the number of records where QNAM starts with \"RACE\" (matches RACE1, RACE2, RACE3, etc.) per USUBJID.\n\n```yaml\n- operator: record_count\n id: $race_records_in_dataset\n filter:\n QNAM: \"RACE&\"\n group:\n - \"USUBJID\"\n```\n\nExample: return the number of records grouped by USUBJID.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid\n group:\n - USUBJID\n```\n\nExample: return the number of records grouped by USUBJID where FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_records_per_usubjid\n group:\n - USUBJID\n filter:\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and IDVARVAL where QNAM = \"TEST1\" and IDVAR = \"GROUPID\", renaming the IDVARVAL column to GROUPID for subsequent merging.\n\n```yaml\n- operator: record_count\n id: $test1_records_per_usubjid_groupid\n group:\n - USUBJID\n - IDVARVAL\n filter:\n QNAM: \"TEST1\"\n IDVAR: \"GROUPID\"\n group_aliases:\n - USUBJID\n - GROUPID\n```\n\nExample: Group the StudyIdentifier dataset by parent_id and merge the result back to the context dataset StudyVersion using StudyVersion.id == StudyIdentifier.parent_id\n\n```yaml\nScope:\n Entities:\n Include:\n - StudyVersion\nOperations:\n - domain: StudyIdentifier\n filter:\n parent_entity: \"StudyVersion\"\n parent_rel: \"studyIdentifiers\"\n rel_type: \"definition\"\n studyIdentifierScope.organizationType.code: \"C70793\"\n studyIdentifierScope.organizationType.codeSystem: \"http://www.cdisc.org\"\n group:\n - parent_id\n group_aliases:\n - id\n id: $num_sponsor_ids\n operator: record_count\n```\n"
}
},
"required": ["id", "operator"],
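The filter/group/wildcard behavior of `record_count` can be sketched with plain dicts. This is illustrative only, not the engine's implementation; since the description shows both `%` and `&` as the prefix wildcard in different places, the sketch takes the wildcard character as a parameter.

```python
from collections import Counter

def record_count(records, filters=None, group=None, wildcard="%"):
    """Count records matching `filters`; with `group`, return counts per unique
    combination of grouping values. Filter values ending in `wildcard` match
    as prefixes (e.g. "RACE%" matches RACE1, RACE2, ...)."""
    def matches(rec):
        for col, want in (filters or {}).items():
            value = str(rec.get(col, ""))
            if want.endswith(wildcard):
                if not value.startswith(want[:-1]):
                    return False
            elif value != want:
                return False
        return True

    matched = [rec for rec in records if matches(rec)]
    if group is None:
        return len(matched)
    # One count per unique tuple of grouping-column values.
    return Counter(tuple(rec.get(col) for col in group) for rec in matched)
```

group_aliases would then simply rename the key columns of the grouped result before merging.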
@@ -455,7 +465,7 @@
"properties": {
"operator": {
"const": "variable_exists",
"markdownDescription": "\nFlag an error if MIDS is in the dataset currently being evaluated and the TM domain is not present in the study\n\nRule Type: Domain Presence Check\n\n```yaml\nCheck:\n all:\n - name: $MIDS_EXISTS\n operator: equal_to\n value: true\n - name: TM\n operator: not_exists\nOperations:\n - id: $MIDS_EXISTS\n name: MIDS\n operator: variable_exists\n```\n"
"markdownDescription": "\nThis operation always evaluates against the original submission dataset, regardless of rule type. Returns true if the named column exists in the submission dataset currently being evaluated.\n\nRule Type: Domain Presence Check\n\n```yaml\nCheck:\n all:\n - name: $MIDS_EXISTS\n operator: equal_to\n value: true\n - name: TM\n operator: not_exists\nOperations:\n - id: $MIDS_EXISTS\n name: MIDS\n operator: variable_exists\n```\n"
}
},
"required": ["id", "operator"],
@@ -465,7 +475,7 @@
"properties": {
"operator": {
"const": "variable_is_null",
"markdownDescription": "\nReturns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its values are null or empty.\nThe operation can work with both direct variable names and define metadata references (variables starting with \"define_variable\").\n\n```yaml\nOperations:\n - operator: variable_is_null\n name: USUBJID\n id: $aeterm_is_null\n```\n"
"markdownDescription": "\nReturns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its values are null or empty.\nThe operation supports two sources via the `source` parameter:\n\n- **`submission`**: checks against the raw submission dataset\n- **`evaluation`** (default): checks against the evaluation dataset built based on the rule type\n\n```yaml\nOperations:\n - operator: variable_is_null\n name: USUBJID\n id: $usubjid_is_null\n source: submission\n```\n"
}
},
"required": ["id", "operator"],
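The two-step check described for `variable_is_null` (existence first, then all-null/empty) is compact enough to sketch. Illustrative only; the column-oriented dict shape is an assumption, not the engine's data model.

```python
def variable_is_null(dataset, name):
    """True when `name` is absent from the dataset, or when every value
    in that column is None or an empty string."""
    if name not in dataset:
        return True  # missing variable counts as null
    return all(value is None or value == "" for value in dataset[name])
```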
@@ -659,6 +669,12 @@
"type": "string",
"enum": ["code", "value", "pref_term"]
},
"source": {
"type": "string"
},
"subtract": {
"type": "string"
},
"term_value": {
"type": "string"
},
6 changes: 5 additions & 1 deletion resources/schema/rule-merged/Operator.json
@@ -910,7 +910,7 @@
"properties": {
"operator": {
"const": "target_is_sorted_by",
"markdownDescription": "\nTrue if the values in `name` are ordered according to the values specified by `value` in ascending/descending order, grouped by the values in `within`. Each `value` requires a variable `name` and an ordering of 'asc' or 'desc' specified by `order`. `within` accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 'YYYY-MM-DD' format\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n```\n"
"markdownDescription": "\nTrue if the values in `name` are ordered according to the values specified by `value` in ascending/descending order, grouped by the values in `within`. Each `value` entry requires a variable `name`, a `sort_order` of `asc` or `desc`, and an optional `null_position` of `first` or `last` (default `last`), which controls where null/empty comparator values are placed in the expected ordering. `within` accepts either a single column or an ordered list of columns. Columns can be either numbers or character dates in ISO 8601 `YYYY-MM-DD` format. Date values with different precisions that overlap (e.g. 2005-10, 2005-10-03, and 2005-10-08) are all flagged as not sorted, since their order cannot be inferred.\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n null_position: last\n```\n"
}
},
"required": ["operator", "value", "within"],
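The grouping and null-placement logic of `target_is_sorted_by` can be sketched for the simple case of one ascending comparator column (the descending and multi-comparator cases are omitted). Illustrative only; row dicts and the function signature are assumptions, not the engine's implementation.

```python
from collections import defaultdict

def target_is_sorted_by(rows, within, comparator, null_position="last"):
    """True when, inside each `within` group, the rows already appear in the
    order implied by sorting on `comparator` ascending, with null/empty
    comparator values placed per `null_position` ("first" or "last")."""
    groups = defaultdict(list)
    for row in rows:
        groups[tuple(row.get(col) for col in within)].append(row)

    def sort_key(row):
        value = row.get(comparator)
        is_null = value is None or value == ""
        # Rank decides whether nulls sort before or after non-null values.
        if null_position == "first":
            rank = 0 if is_null else 1
        else:
            rank = 1 if is_null else 0
        return (rank, "" if is_null else str(value))

    # A group passes only if a stable sort leaves it unchanged.
    return all(sorted(members, key=sort_key) == members
               for members in groups.values())
```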
@@ -1076,6 +1076,10 @@
"name": {
"$ref": "Operator.json#/properties/name"
},
"null_position": {
"enum": ["first", "last"],
"type": "string"
},
"order": {
"$ref": "Operator.json#/properties/order"
}