diff --git a/docs/json_schemas/contract/components/field_error_detail.schema.json b/docs/json_schemas/contract/components/field_error_detail.schema.json
index a0cb547..5f00ca9 100644
--- a/docs/json_schemas/contract/components/field_error_detail.schema.json
+++ b/docs/json_schemas/contract/components/field_error_detail.schema.json
@@ -11,12 +11,11 @@
     },
     "error_message": {
      "description": "The message to be used for the field and error type specified. This can include templating (specified using jinja2 conventions). During templating, the full record will be available with an additional __error_value to easily obtain nested offending values.",
-      "type": "string",
-      "enum": [
-        "record_rejection",
-        "file_rejection",
-        "warning"
-      ]
+      "type": "string"
+    },
+    "reporting_entity": {
+      "description": "The entity name to be used for grouping in the error report. If left blank, this will default to the contract entity name.",
+      "type": "string"
     }
   },
   "required": [
diff --git a/src/dve/core_engine/backends/implementations/duckdb/rules.py b/src/dve/core_engine/backends/implementations/duckdb/rules.py
index dbc308e..b14700d 100644
--- a/src/dve/core_engine/backends/implementations/duckdb/rules.py
+++ b/src/dve/core_engine/backends/implementations/duckdb/rules.py
@@ -517,6 +517,7 @@ def notify(self, entities: DuckDBEntities, *, config: Notification) -> Messages:
         messages.append(
             FeedbackMessage(
                 entity=config.reporting.reporting_entity_override or config.entity_name,
+                original_entity=config.entity_name,
                 record=record,  # type: ignore
                 error_location=config.reporting.legacy_location,
                 error_message=template_object(config.reporting.message, record),  # type: ignore
diff --git a/src/dve/core_engine/backends/implementations/spark/rules.py b/src/dve/core_engine/backends/implementations/spark/rules.py
index 93baae9..15afa09 100644
--- a/src/dve/core_engine/backends/implementations/spark/rules.py
+++ b/src/dve/core_engine/backends/implementations/spark/rules.py
@@ -412,6 +412,7 @@ def notify(self, entities: SparkEntities, *, config: Notification) -> Messages:
                 # more complex extraction done in reporting module
                 FeedbackMessage(
                     entity=config.reporting.reporting_entity_override or config.entity_name,
+                    original_entity=config.entity_name,
                     record=record.asDict(recursive=True),
                     error_location=config.reporting.legacy_location,
                     error_message=template_object(
diff --git a/src/dve/core_engine/message.py b/src/dve/core_engine/message.py
index 7dd4f02..dd580c6 100644
--- a/src/dve/core_engine/message.py
+++ b/src/dve/core_engine/message.py
@@ -30,6 +30,7 @@ class DataContractErrorDetail(BaseModel):

     error_code: str
     error_message: Optional[str] = None
+    reporting_entity: Optional[str] = None

     def template_message(
         self,
@@ -105,6 +106,8 @@ class FeedbackMessage:  # pylint: disable=too-many-instance-attributes
     still be completed (i.e. filters and joins can still be applied).
""" + original_entity: Optional[EntityName] = None + """The original entity before any modifications to the name (if applicable).""" is_informational: bool = False """Whether the message is simply for information or has affected the outputs.""" error_type: Optional[str] = None @@ -230,7 +233,8 @@ def from_pydantic_error( messages.append( cls( - entity=entity, + entity=error_detail.reporting_entity or entity, + original_entity=entity, record=record, failure_type=failure_type, is_informational=is_informational, diff --git a/src/dve/reporting/excel_report.py b/src/dve/reporting/excel_report.py index 727b244..4dc3fee 100644 --- a/src/dve/reporting/excel_report.py +++ b/src/dve/reporting/excel_report.py @@ -443,6 +443,13 @@ def _text_length(value): @staticmethod def _format_headings(headings: list[str]) -> list[str]: + # TODO - ideally this would be config driven to allow customisation. + _renames = { + "Table": "Group", + "Data Item": "Data Item Submission Name", + "Error": "Errors and Warnings", + } headings = [heading.title() if heading[0].islower() else heading for heading in headings] headings = [heading.replace("_", " ") for heading in headings] + headings = [_renames.get(heading, heading) for heading in headings] return headings diff --git a/src/dve/reporting/utils.py b/src/dve/reporting/utils.py index 3dac919..8832b6a 100644 --- a/src/dve/reporting/utils.py +++ b/src/dve/reporting/utils.py @@ -26,7 +26,13 @@ def dump_feedback_errors( processed = [] for message in messages: - primary_keys: list[str] = key_fields.get(message.entity if message.entity else "", []) + if message.original_entity is not None: + primary_keys = key_fields.get(message.original_entity, []) + elif message.entity is not None: + primary_keys = key_fields.get(message.entity, []) + else: + primary_keys = [] + error = message.to_dict( key_field=primary_keys, value_separator=" -- ", diff --git a/tests/features/movies.feature b/tests/features/movies.feature index b148547..d1dbca4 100644 --- a/tests/features/movies.feature +++ b/tests/features/movies.feature @@ -1,47 +1,47 @@ Feature: Pipeline tests using the movies dataset - Tests for the processing framework which use the movies dataset. + Tests for the processing framework which use the movies dataset. - This tests submissions in JSON format, with configuration in JSON config files. - Complex types are tested (arrays, nested structs) + This tests submissions in JSON format, with configuration in JSON config files. + Complex types are tested (arrays, nested structs) - Some validation of entity attributes is performed: SQL expressions and Python filter - functions are used, and templatable business rules feature in the transformations. + Some validation of entity attributes is performed: SQL expressions and Python filter + functions are used, and templatable business rules feature in the transformations. 

-  Scenario: Validate and filter movies (spark)
-    Given I submit the movies file movies.json for processing
-    And A spark pipeline is configured
-    And I create the following reference data tables in the database movies_refdata
-      | table_name | parquet_path                                          |
-      | sequels    | tests/testdata/movies/refdata/movies_sequels.parquet |
-    And I add initial audit entries for the submission
-    Then the latest audit record for the submission is marked with processing status file_transformation
-    When I run the file transformation phase
-    Then the movies entity is stored as a parquet after the file_transformation phase
-    And the latest audit record for the submission is marked with processing status data_contract
-    When I run the data contract phase
-    Then there are 3 record rejections from the data_contract phase
-    And there are errors with the following details and associated error_count from the data_contract phase
-      | ErrorCode | ErrorMessage                              | error_count |
-      | BLANKYEAR | year not provided                         | 1           |
-      | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
-      | DODGYDATE | date_joined value is not valid: daft_date | 1           |
-    And the movies entity is stored as a parquet after the data_contract phase
-    And the latest audit record for the submission is marked with processing status business_rules
-    When I run the business rules phase
-    Then The rules restrict "movies" to 4 qualifying records
-    And there are errors with the following details and associated error_count from the business_rules phase
-      | ErrorCode       | ErrorMessage                                           | error_count |
-      | LIMITED_RATINGS | Movie has too few ratings ([6.1])                      | 1           |
-      | RUBBISH_SEQUEL  | The movie The Greatest Movie Ever has a rubbish sequel | 1           |
-    And the latest audit record for the submission is marked with processing status error_report
-    When I run the error report phase
-    Then An error report is produced
-    And The statistics entry for the submission shows the following information
-      | parameter                | value |
-      | record_count             | 5     |
-      | number_record_rejections | 4     |
-      | number_warnings          | 1     |
-    And the error aggregates are persisted
+    Scenario: Validate and filter movies (spark)
+      Given I submit the movies file movies.json for processing
+      And A spark pipeline is configured
+      And I create the following reference data tables in the database movies_refdata
+        | table_name | parquet_path                                          |
+        | sequels    | tests/testdata/movies/refdata/movies_sequels.parquet |
+      And I add initial audit entries for the submission
+      Then the latest audit record for the submission is marked with processing status file_transformation
+      When I run the file transformation phase
+      Then the movies entity is stored as a parquet after the file_transformation phase
+      And the latest audit record for the submission is marked with processing status data_contract
+      When I run the data contract phase
+      Then there are 3 record rejections from the data_contract phase
+      And there are errors with the following details and associated error_count from the data_contract phase
+        | Entity             | ErrorCode | ErrorMessage                              | error_count |
+        | movies             | BLANKYEAR | year not provided                         | 1           |
+        | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
+        | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           |
+      And the movies entity is stored as a parquet after the data_contract phase
+      And the latest audit record for the submission is marked with processing status business_rules
+      When I run the business rules phase
+      Then The rules restrict "movies" to 4 qualifying records
+      And there are errors with the following details and associated error_count from the business_rules phase
+        | ErrorCode       | ErrorMessage                                           | error_count |
+        | LIMITED_RATINGS | Movie has too few ratings ([6.1])                      | 1           |
+        | RUBBISH_SEQUEL  | The movie The Greatest Movie Ever has a rubbish sequel | 1           |
+      And the latest audit record for the submission is marked with processing status error_report
+      When I run the error report phase
+      Then An error report is produced
+      And The statistics entry for the submission shows the following information
+        | parameter                | value |
+        | record_count             | 5     |
+        | number_record_rejections | 4     |
+        | number_warnings          | 1     |
+      And the error aggregates are persisted

   Scenario: Validate and filter movies (duckdb)
     Given I submit the movies file movies.json for processing
@@ -57,10 +57,10 @@ Feature: Pipeline tests using the movies dataset
     When I run the data contract phase
     Then there are 3 record rejections from the data_contract phase
     And there are errors with the following details and associated error_count from the data_contract phase
-      | ErrorCode | ErrorMessage                              | error_count |
-      | BLANKYEAR | year not provided                         | 1           |
-      | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
-      | DODGYDATE | date_joined value is not valid: daft_date | 1           |
+      | Entity             | ErrorCode | ErrorMessage                              | error_count |
+      | movies             | BLANKYEAR | year not provided                         | 1           |
+      | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
+      | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           |
     And the movies entity is stored as a parquet after the data_contract phase
     And the latest audit record for the submission is marked with processing status business_rules
     When I run the business rules phase
diff --git a/tests/test_core_engine/test_backends/fixtures.py b/tests/test_core_engine/test_backends/fixtures.py
index 14369b9..31c23d7 100644
--- a/tests/test_core_engine/test_backends/fixtures.py
+++ b/tests/test_core_engine/test_backends/fixtures.py
@@ -567,9 +567,11 @@ def nested_parquet_custom_dc_err_details(temp_dir):
     err_details = {
         "id": {
             "Blank": {"error_code": "TESTIDBLANK",
-                      "error_message": "id cannot be null"},
+                      "error_message": "id cannot be null",
+                      "reporting_entity": "test_rename"},
             "Bad value": {"error_code": "TESTIDBAD",
-                          "error_message": "id is invalid: id - {{id}}"}
+                          "error_message": "id is invalid: id - {{id}}",
+                          "reporting_entity": "test_rename"}
         },
         "datetimefield": {
             "Bad value": {"error_code": "TESTDTFIELDBAD",
diff --git a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py
index e4c08ad..61920c2 100644
--- a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py
+++ b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py
@@ -360,4 +360,5 @@ def test_duckdb_data_contract_custom_error_details(nested_all_string_parquet_w_e
     assert messages[0].error_code == "SUBFIELDTESTIDBAD"
     assert messages[0].error_message == "subfield id is invalid: subfield.id - WRONG"
     assert messages[1].error_code == "TESTIDBAD"
-    assert messages[1].error_message == "id is invalid: id - WRONG"
\ No newline at end of file
+    assert messages[1].error_message == "id is invalid: id - WRONG"
+    assert messages[1].entity == "test_rename"
\ No newline at end of file
diff --git a/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py b/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py
index fac6cdf..789ca1a 100644
--- a/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py
+++ b/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py
@@ -235,5 +235,6 @@ def test_spark_data_contract_custom_error_details(nested_all_string_parquet_w_er
     assert messages[0].error_message == "subfield id is invalid: subfield.id - WRONG"
     assert messages[1].error_code == "TESTIDBAD"
     assert messages[1].error_message == "id is invalid: id - WRONG"
+    assert messages[1].entity == "test_rename"
\ No newline at end of file
diff --git a/tests/test_error_reporting/test_excel_report.py b/tests/test_error_reporting/test_excel_report.py
index 834a5e4..17c0dfd 100644
--- a/tests/test_error_reporting/test_excel_report.py
+++ b/tests/test_error_reporting/test_excel_report.py
@@ -116,8 +116,8 @@ def test_excel_report(report_dfs):
     column_headings = [cell.value for cell in aggs["1"]]
     assert column_headings == [
         "Type",
-        "Table",
-        "Data Item",
+        "Group",
+        "Data Item Submission Name",
         "Category",
         "Error Code",
         "Count",
@@ -126,11 +126,11 @@
     details = workbook["Error Data"]
     column_headings = [cell.value for cell in details["1"]]
     assert column_headings == [
-        "Table",
+        "Group",
         "Type",
         "Error Code",
-        "Data Item",
-        "Error",
+        "Data Item Submission Name",
+        "Errors and Warnings",
         "Value",
         "ID",
         "Category",
diff --git a/tests/test_pipeline/test_spark_pipeline.py b/tests/test_pipeline/test_spark_pipeline.py
index 4e33a28..7f4738f 100644
--- a/tests/test_pipeline/test_spark_pipeline.py
+++ b/tests/test_pipeline/test_spark_pipeline.py
@@ -469,8 +469,8 @@ def test_error_report_where_report_is_expected(  # pylint: disable=redefined-out
         OrderedDict(
             **{
                 "Type": "Submission Failure",
-                "Table": "planets",
-                "Data Item": "orbitalPeriod",
+                "Group": "planets",
+                "Data Item Submission Name": "orbitalPeriod",
                 "Category": "Bad value",
                 "Error Code": "LONG_ORBIT",
                 "Count": 1,
@@ -479,8 +479,8 @@
         OrderedDict(
             **{
                 "Type": "Submission Failure",
-                "Table": "planets",
-                "Data Item": "gravity",
+                "Group": "planets",
+                "Data Item Submission Name": "gravity",
                 "Category": "Bad value",
                 "Error Code": "STRONG_GRAVITY",
                 "Count": 1,
@@ -497,11 +497,11 @@
     assert error_data_records == [
         OrderedDict(
             **{
-                "Table": "planets",
+                "Group": "planets",
                 "Type": "Submission Failure",
                 "Error Code": "LONG_ORBIT",
-                "Data Item": "orbitalPeriod",
-                "Error": "Planet has long orbital period",
+                "Data Item Submission Name": "orbitalPeriod",
+                "Errors and Warnings": "Planet has long orbital period",
                 "Value": 365.20001220703125,
                 "ID": None,
                 "Category": "Bad value",
@@ -509,11 +509,11 @@
         ),
         OrderedDict(
             **{
-                "Table": "planets",
+                "Group": "planets",
                 "Type": "Submission Failure",
                 "Error Code": "STRONG_GRAVITY",
-                "Data Item": "gravity",
-                "Error": "Planet has too strong gravity",
+                "Data Item Submission Name": "gravity",
+                "Errors and Warnings": "Planet has too strong gravity",
                 "Value": 9.800000190734863,
                 "ID": None,
                 "Category": "Bad value",
diff --git a/tests/testdata/movies/movies_contract_error_details.json b/tests/testdata/movies/movies_contract_error_details.json
index 8c94c92..f8cd934 100644
--- a/tests/testdata/movies/movies_contract_error_details.json
+++ b/tests/testdata/movies/movies_contract_error_details.json
@@ -12,7 +12,8 @@
     },
     "Bad value": {
       "error_code": "DODGYYEAR",
-      "error_message": "year value ({{year}}) is invalid"
+      "error_message": "year value ({{year}}) is invalid",
+      "reporting_entity": "movies_rename_test"
     }
   },
   "cast.date_joined": {
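
Taken together, these changes let a contract's error details override the entity name used for grouping in the error report (`reporting_entity`), while the new `FeedbackMessage.original_entity` preserves the contract entity so that key-field lookup keeps working after a rename. The sketch below mirrors that fallback logic using only stdlib types; it does not import `dve`, and the helper `resolve_reporting_entity` and the `key_fields` mapping are hypothetical stand-ins for the behaviour added to `FeedbackMessage.from_pydantic_error` and `dump_feedback_errors`.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Message:
    """Minimal stand-in for FeedbackMessage (reporting fields only)."""
    entity: Optional[str] = None           # name used for grouping in the report
    original_entity: Optional[str] = None  # contract entity, before any override

def resolve_reporting_entity(contract_entity: str, error_detail: dict) -> Message:
    # Mirrors from_pydantic_error: prefer the configured reporting_entity,
    # fall back to the contract entity, and always keep the original name.
    return Message(
        entity=error_detail.get("reporting_entity") or contract_entity,
        original_entity=contract_entity,
    )

key_fields = {"movies": ["movie_id"]}  # hypothetical key-field config

msg = resolve_reporting_entity(
    "movies",
    {"error_code": "DODGYYEAR", "reporting_entity": "movies_rename_test"},
)
assert msg.entity == "movies_rename_test"

# Mirrors dump_feedback_errors: primary keys are looked up via the original
# entity first, so renaming the reporting entity does not break key resolution.
if msg.original_entity is not None:
    primary_keys = key_fields.get(msg.original_entity, [])
elif msg.entity is not None:
    primary_keys = key_fields.get(msg.entity, [])
else:
    primary_keys = []
assert primary_keys == ["movie_id"]
```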