From 8dc76a472b520b3265acb214fd6a4e4f7a1c7519 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 5 Jun 2025 16:22:14 -0500 Subject: [PATCH 1/2] Schema update for umm-c (1.18.2 to 1.18.4) #328 --- pyQuARC/code/checker.py | 3 ++ pyQuARC/code/schema_validator.py | 85 +++++++++++++++++++++++++------- pyQuARC/code/utils.py | 8 +++ tests/test_downloader.py | 4 +- 4 files changed, 81 insertions(+), 19 deletions(-) diff --git a/pyQuARC/code/checker.py b/pyQuARC/code/checker.py index 4bb401c7..93993024 100644 --- a/pyQuARC/code/checker.py +++ b/pyQuARC/code/checker.py @@ -14,6 +14,9 @@ from .string_validator import StringValidator from .url_validator import UrlValidator +from .schema_validator import SchemaValidator +from .constants import UMM_C # or however you define metadata format + from .constants import ECHO10_C, SCHEMA_PATHS diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..26f2d315 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,11 +3,22 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url +from .utils import read_json_schema_from_url +from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C, UMM_G + + +SUPPORTED_UMM_C_VERSIONS = ["v1.18.4", "v1.18.3", "v1.18.2"] +DEFAULT_UMM_C_VERSION = "v1.18.4" # Or any other version you prefer as default + +# Define UMM-G versions if you want to make it flexible as well +SUPPORTED_UMM_G_VERSIONS = ["v1.6.6"] +DEFAULT_UMM_G_VERSION = "v1.6.6" + +SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm" -from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C class SchemaValidator: @@ -21,6 +32,10 @@ def __init__( self, check_messages, metadata_format=ECHO10_C, + # Add a new parameter for UMM-C version + umm_c_version=DEFAULT_UMM_C_VERSION, + # Add a new parameter for UMM-G version (if you want to make it flexible too) + umm_g_version=DEFAULT_UMM_G_VERSION ): """ Args: @@ -29,8 +44,27 @@ def __init__( validation_paths (list of str): The path of the fields in the metadata that need to be validated. In the form ['Collection/StartDate', ...]. + umm_c_version (str): The specific UMM-C version to use for validation (e.g., "v1.18.4"). + umm_g_version (str): The specific UMM-G version to use for validation (e.g., "v1.6.6"). + check_messages (dict): A dictionary of check messages for errors. """ self.metadata_format = metadata_format + # Validate and store the UMM-C version + if umm_c_version not in SUPPORTED_UMM_C_VERSIONS: + raise ValueError( + f"Unsupported UMM-C version: {umm_c_version}. " + f"Supported versions are: {', '.join(SUPPORTED_UMM_C_VERSIONS)}" + ) + self.umm_c_version = umm_c_version + + # Validate and store the UMM-G version + if umm_g_version not in SUPPORTED_UMM_G_VERSIONS: + raise ValueError( + f"Unsupported UMM-G version: {umm_g_version}. " + f"Supported versions are: {', '.join(SUPPORTED_UMM_G_VERSIONS)}" + ) + self.umm_g_version = umm_g_version + if metadata_format.startswith("umm-"): self.validator_func = self.run_json_validator else: @@ -61,9 +95,16 @@ def read_json_schema(self): """ Reads the json schema file """ + if self.metadata_format == UMM_C: + schema_url = (f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json") + return read_json_schema_from_url(schema_url) + + if self.metadata_format == UMM_G: + schema_url = (f"{SCHEMA_CDN_BASE}/granule/{self.umm_g_version}/umm-g-json-schema.json") + return read_json_schema_from_url(schema_url) + with open(SCHEMA_PATHS[f"{self.metadata_format}-json-schema"]) as schema_file: - schema = json.load(schema_file) - return schema + return json.load(schema_file) def run_json_validator(self, content_to_validate): """ @@ -77,21 +118,30 @@ def run_json_validator(self, content_to_validate): schema_store = {} if self.metadata_format == UMM_C: - with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as schema_file: - schema_base = json.load(schema_file) - # workaround to read local referenced schema file (only supports uri) - schema_store = { - schema_base.get("$id", "/umm-cmn-json-schema.json"): schema_base, - schema_base.get("$id", "umm-cmn-json-schema.json"): schema_base, - } - errors = {} + #umm_cmn_schema_url = f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json" + # If it's *not* versioned and always the latest or a specific fixed version, adjust this URL + # e.g., f"{SCHEMA_CDN_BASE}/common/umm-cmn-json-schema.json" or from SCHEMA_PATHS + + try: + with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as common_schema_file: + schema_base = json.load(common_schema_file) + # 1. Add the schema using its $id (most common canonical reference) + if "$id" in schema_base: + schema_store[schema_base["$id"]] = schema_base + + # 2. Add the schema using the full URL you fetched it from (if different from $id or for robustness) + schema_store["/umm-cmn-json-schema.json"] = schema_base + schema_store["umm-cmn-json-schema.json"] = schema_base + except Exception as e: + print(f"Error loading UMM Common schema from {SCHEMA_PATHS['umm-cmn-json-schema']}: {e}") + print("Schema validation for UMM-C might proceed without common schema, leading to incomplete validation.") + errors = {} resolver = RefResolver.from_schema(schema, store=schema_store) - validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +186,14 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/code/utils.py b/pyQuARC/code/utils.py index 1fe82270..f0544d45 100644 --- a/pyQuARC/code/utils.py +++ b/pyQuARC/code/utils.py @@ -82,3 +82,11 @@ def get_date_time(dt_str): except ValueError: continue return None + +def read_json_schema_from_url(url): + """ + Downloads and returns a JSON schema from a given URL. + """ + response = requests.get(url) + response.raise_for_status() + return response.json() diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..5a6ca777 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000010-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1001434969-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 9469487dd1674edf90736797163e58edd23ec46d Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Wed, 8 Oct 2025 15:17:14 -0500 Subject: [PATCH 2/2] Schema update for echo-c and echo-g #328 --- pyQuARC/code/schema_validator.py | 55 +++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 26f2d315..48ab62f0 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -19,7 +19,10 @@ SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm" - +REMOTE_XML_SCHEMAS = { + "echo10_collection": "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0/Collection.xsd", + "echo10_granule": "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0/Granule.xsd" +} class SchemaValidator: """ @@ -71,26 +74,54 @@ def __init__( self.validator_func = self.run_xml_validator self.check_messages = check_messages + + def read_xml_schema(self): """ - Reads the xml schema file + Reads the XML schema file (either from a remote URL or local path). """ - # The XML schema file (echo10_xml.xsd) imports another schema file (MetadataCommon.xsd) - # Python cannot figure out the import if they are in a different location than the calling script - # Thus we need to set an environment variable to let it know where the files are located - # Path to catalog must be a url + from urllib.request import urlopen + + # Maintain XML catalog handling catalog_path = f"file:{pathname2url(str(SCHEMA_PATHS['catalog']))}" - # Temporarily set the environment variable os.environ["XML_CATALOG_FILES"] = os.environ.get( "XML_CATALOG_FILES", catalog_path ) - with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: - file_content = schema_file.read().encode() - xmlschema_doc = etree.parse(BytesIO(file_content)) - schema = etree.XMLSchema(xmlschema_doc) - return schema + def get_raw_schema_url(browse_url: str) -> str: + """Convert /browse/ URL into /raw/ for direct XML download.""" + if "/browse/" in browse_url: + return browse_url.replace("/browse/", "/raw/") + "?at=refs%2Fheads%2Fmaster" + return browse_url + # Select remote schema if metadata_format matches + schema_url = REMOTE_XML_SCHEMAS.get(self.metadata_format) + try: + if schema_url: + raw_url = get_raw_schema_url(schema_url) + print(f"Fetching schema remotely from: {raw_url}") + import ssl + ssl_context = ssl._create_unverified_context() # Disable certificate check safely for this fetch + with urlopen(raw_url, context=ssl_context) as response: + file_content = response.read() + else: + # Fallback to local schema file + with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: + file_content = schema_file.read().encode() + + xmlschema_doc = etree.parse(BytesIO(file_content)) + schema = etree.XMLSchema(xmlschema_doc) + return schema + + except Exception as e: + print(f"⚠️ Remote fetch failed or unavailable for {self.metadata_format}: {e}") + print("Falling back to local schema file...") + with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: + file_content = schema_file.read().encode() + xmlschema_doc = etree.parse(BytesIO(file_content)) + schema = etree.XMLSchema(xmlschema_doc) + return schema + def read_json_schema(self): """ Reads the json schema file