From 5fee2f4e89fc60604cdfb233967141f9e602adfe Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 3 Jun 2025 12:23:13 -0500 Subject: [PATCH 1/2] Entrytitle - Revised #325 --- pyQuARC/code/schema_validator.py | 11 ++++++----- pyQuARC/code/string_validator.py | 14 ++++++++++---- pyQuARC/schemas/check_messages_override.json | 12 +++++++++++- tests/test_downloader.py | 4 ++-- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..e88a3e1e 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -91,7 +91,8 @@ def run_json_validator(self, content_to_validate): resolver = RefResolver.from_schema(schema, store=schema_store) validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, + format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +137,13 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/code/string_validator.py b/pyQuARC/code/string_validator.py index 1bd27715..8ba756c1 100644 --- a/pyQuARC/code/string_validator.py +++ b/pyQuARC/code/string_validator.py @@ -1,7 +1,7 @@ from .base_validator import BaseValidator from .gcmd_validator import GcmdValidator from .utils import cmr_request, collection_in_cmr, if_arg, set_cmr_prms - +import re class StringValidator(BaseValidator): """ @@ -38,15 +38,21 @@ def length_check(string, extent, relation): def compare(first, second, relation): """ Compares two strings based on the relationship - Returns: - (dict) An object with the validity of the check and the instance + (dict) An object with the validity of the check and the instance """ + + # Check if 'first' and 'second' contain any special characters + first_clean = re.sub(r'[^a-zA-Z0-9]', '', first).upper() + second_clean = re.sub(r'[^a-zA-Z0-9]', '', second).upper() + + # If either string contains special characters, return a warning or handle as needed return { - "valid": BaseValidator.compare(first.upper(), second.upper(), relation), + "valid": BaseValidator.compare(first_clean, second_clean, relation), "value": (first, second), } + @staticmethod @if_arg def controlled_keywords_check(value, keywords_list): diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 0967ef42..2fee2195 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1 +1,11 @@ -{} +{ + "shortname_uniqueness": { + "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" + }, + "remediation": "Recommend providing a more descriptive title for the dataset. " + } +} + diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..ca1762c8 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000042-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1018577631-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 884d9bb52edd6d9ba1c9d1cddcee1c6742bd604a Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 13 Oct 2025 13:45:14 -0500 Subject: [PATCH 2/2] code change before dev merge - LA --- pyQuARC/schemas/check_messages.json | 2 +- pyQuARC/schemas/check_messages_override.json | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..03562612 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -53,7 +53,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" }, - "remediation": "The EntryTitle/DataSetId should not be identical to the ShortName. Recommend providing a descriptive, formal title for the dataset. " + "remediation": "Recommend providing a more descriptive title for the dataset. " }, "abstract_length_check": { "failure": "The abstract provided may be inadequate based on length.", diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 2fee2195..311847da 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,11 +1,2 @@ -{ - "shortname_uniqueness": { - "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" - }, - "remediation": "Recommend providing a more descriptive title for the dataset. " - } -} +{}