diff --git a/CHANGELOG.md b/CHANGELOG.md index d35d9fe24..e5d036545 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,80 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- Fix to handle logicalType format wrt avro mentioned in issue #687 +## [0.10.28] - 2025-06-05 + +### Added +- Much better ODCS support + - Import anything to ODCS via the `import --spec odcs` flag + - Export to HTML with an ODCS native template via `export --format html` + - Export to Mermaid with an ODCS native mapping via `export --format mermaid` +- The databricks `unity` importer now supports more than a single table. You can use `--unity-table-full-name` multiple times to import multiple tables. And it will automatically add a server with the catalog and schema name. + +### Changed +- `datacontract catalog [OPTIONS]`: Added version to contract cards in `index.html` of the catalog (enabled search by version) +- The type mapping of the `unity` importer no uses the native databricks types instead of relying on spark types. This allows for better type mapping and more accurate data contracts. + +### Fixed + +## [0.10.27] - 2025-05-22 + +### Added + +- `datacontract export --format mermaid` Export + to [Mermaid](https://mermaid-js.github.io/mermaid/#/) (#767, #725) + +### Changed + +- `datacontract export --format html`: Adding the mermaid figure to the html export +- `datacontract export --format odcs`: Export physical type to ODCS if the physical type is + configured in config object +- `datacontract import --format spark`: Added support for spark importer table level comments (#761) +- `datacontract import` respects `--owner` and `--id` flags (#753) + +### Fixed + +- `datacontract export --format sodacl`: Fix resolving server when using `--server` flag (#768) +- `datacontract export --format dbt`: Fixed DBT export behaviour of constraints to default to data tests when no model type is specified in the datacontract model + + +## [0.10.26] - 2025-05-16 + +### Changed +- Databricks: Add support for Variant type (#758) +- `datacontract export --format odcs`: Export physical type if the physical type is configured in + config object (#757) +- `datacontract export --format sql` Include datacontract descriptions in the Snowflake sql export ( + #756) + +## [0.10.25] - 2025-05-07 + +### Added +- Extracted the DataContractSpecification and the OpenDataContractSpecification in separate pip modules and use them in the CLI. +- `datacontract import --format excel`: Import from Excel + template https://github.com/datacontract/open-data-contract-standard-excel-template (#742) + +## [0.10.24] - 2025-04-19 + +### Added + +- `datacontract test` with DuckDB: Deep nesting of json objects in duckdb (#681) + +### Changed + +- `datacontract import --format csv` produces more descriptive output. Replaced + using clevercsv with duckdb for loading and sniffing csv file. +- Updated dependencies + +### Fixed + +- Fix to handle logicalType format wrt avro mentioned in issue (#687) +- Fix field type from TIME to DATETIME in BigQuery converter and schema (#728) +- Fix encoding issues. (#712) +- ODCS: Fix required in export and added item and fields format (#724) + +### Removed + +- Deprecated QualityLinter is now removed ## [0.10.23] - 2025-03-03 diff --git a/README.md b/README.md index f2e24fc9d..122cf7e98 100644 --- a/README.md +++ b/README.md @@ -252,110 +252,94 @@ Commands ### init ``` - - Usage: datacontract init [OPTIONS] [LOCATION] - - Create an empty data contract. 
- -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ location [LOCATION] The location of the data contract file to │ -│ create. │ -│ [default: datacontract.yaml] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --template TEXT URL of a template or data contract │ -│ [default: None] │ -│ --overwrite --no-overwrite Replace the existing │ -│ datacontract.yaml │ -│ [default: no-overwrite] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract init [OPTIONS] [LOCATION] + + Create an empty data contract. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ location [LOCATION] The location of the data contract file to create. │ +│ [default: datacontract.yaml] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --template TEXT URL of a template or data contract [default: None] │ +│ --overwrite --no-overwrite Replace the existing datacontract.yaml │ +│ [default: no-overwrite] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` ### lint ``` - - Usage: datacontract lint [OPTIONS] [LOCATION] - - Validate that the datacontract.yaml is correctly formatted. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ location [LOCATION] The location (url or path) of the data contract │ -│ yaml. │ -│ [default: datacontract.yaml] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --schema TEXT The location (url or path) of the Data │ -│ Contract Specification JSON Schema │ -│ [default: None] │ -│ --output PATH Specify the file path where the test results │ -│ should be written to (e.g., │ -│ './test-results/TEST-datacontract.xml'). If │ -│ no path is provided, the output will be │ -│ printed to stdout. │ -│ [default: None] │ -│ --output-format [junit] The target format for the test results. │ -│ [default: None] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract lint [OPTIONS] [LOCATION] + + Validate that the datacontract.yaml is correctly formatted. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ location [LOCATION] The location (url or path) of the data contract yaml. │ +│ [default: datacontract.yaml] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --schema TEXT The location (url or path) of the Data Contract Specification │ +│ JSON Schema │ +│ [default: None] │ +│ --output PATH Specify the file path where the test results should be written │ +│ to (e.g., './test-results/TEST-datacontract.xml'). If no path is │ +│ provided, the output will be printed to stdout. │ +│ [default: None] │ +│ --output-format [junit] The target format for the test results. [default: None] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` ### test ``` - - Usage: datacontract test [OPTIONS] [LOCATION] - - Run schema and quality tests on configured servers. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ location [LOCATION] The location (url or path) of the data contract │ -│ yaml. │ -│ [default: datacontract.yaml] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --schema TEXT The location (url or │ -│ path) of the Data │ -│ Contract │ -│ Specification JSON │ -│ Schema │ -│ [default: None] │ -│ --server TEXT The server │ -│ configuration to run │ -│ the schema and │ -│ quality tests. Use │ -│ the key of the server │ -│ object in the data │ -│ contract yaml file to │ -│ refer to a server, │ -│ e.g., `production`, │ -│ or `all` for all │ -│ servers (default). │ -│ [default: all] │ -│ --publish TEXT The url to publish │ -│ the results after the │ -│ test │ -│ [default: None] │ -│ --output PATH Specify the file path │ -│ where the test │ -│ results should be │ -│ written to (e.g., │ -│ './test-results/TEST… │ -│ [default: None] │ -│ --output-format [junit] The target format for │ -│ the test results. │ -│ [default: None] │ -│ --logs --no-logs Print logs │ -│ [default: no-logs] │ -│ --ssl-verification --no-ssl-verificati… SSL verification when │ -│ publishing the data │ -│ contract. │ -│ [default: │ -│ ssl-verification] │ -│ --help Show this message and │ -│ exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract test [OPTIONS] [LOCATION] + + Run schema and quality tests on configured servers. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ location [LOCATION] The location (url or path) of the data contract yaml. │ +│ [default: datacontract.yaml] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --schema TEXT The location (url or path) of │ +│ the Data Contract Specification │ +│ JSON Schema │ +│ [default: None] │ +│ --server TEXT The server configuration to run │ +│ the schema and quality tests. │ +│ Use the key of the server object │ +│ in the data contract yaml file │ +│ to refer to a server, e.g., │ +│ `production`, or `all` for all │ +│ servers (default). │ +│ [default: all] │ +│ --publish-test-results --no-publish-test-results Publish the results after the │ +│ test │ +│ [default: │ +│ no-publish-test-results] │ +│ --publish TEXT DEPRECATED. The url to publish │ +│ the results after the test. │ +│ [default: None] │ +│ --output PATH Specify the file path where the │ +│ test results should be written │ +│ to (e.g., │ +│ './test-results/TEST-datacontra… │ +│ [default: None] │ +│ --output-format [junit] The target format for the test │ +│ results. │ +│ [default: None] │ +│ --logs --no-logs Print logs [default: no-logs] │ +│ --ssl-verification --no-ssl-verification SSL verification when publishing │ +│ the data contract. │ +│ [default: ssl-verification] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -812,67 +796,83 @@ models: | `DATACONTRACT_TRINO_PASSWORD` | `mysecretpassword` | Password | +#### Local + +Data Contract CLI can test local files in parquet, json, csv, or delta format. + +##### Example + +datacontract.yaml +```yaml +servers: + local: + type: local + path: ./*.parquet + format: parquet +models: + my_table_1: # corresponds to a table + type: table + fields: + my_column_1: # corresponds to a column + type: varchar + my_column_2: # corresponds to a column + type: string +``` + ### export ``` - - Usage: datacontract export [OPTIONS] [LOCATION] - - Convert data contract to a specific format. Saves to file specified by - `output` option if present, otherwise prints to stdout. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ location [LOCATION] The location (url or path) of the data contract │ -│ yaml. │ -│ [default: datacontract.yaml] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ * --format [jsonschema|pydantic-model The export format. │ -│ |sodacl|dbt|dbt-sources|db [default: None] │ -│ t-staging-sql|odcs|rdf|avr [required] │ -│ o|protobuf|great-expectati │ -│ ons|terraform|avro-idl|sql │ -│ |sql-query|html|go|bigquer │ -│ y|dbml|spark|sqlalchemy|da │ -│ ta-caterer|dcs|markdown|ic │ -│ eberg|custom] │ -│ --output PATH Specify the file path where │ -│ the exported data will be │ -│ saved. If no path is │ -│ provided, the output will │ -│ be printed to stdout. │ -│ [default: None] │ -│ --server TEXT The server name to export. │ -│ [default: None] │ -│ --model TEXT Use the key of the model in │ -│ the data contract yaml file │ -│ to refer to a model, e.g., │ -│ `orders`, or `all` for all │ -│ models (default). │ -│ [default: all] │ -│ --schema TEXT The location (url or path) │ -│ of the Data Contract │ -│ Specification JSON Schema │ -│ [default: None] │ -│ --engine TEXT [engine] The engine used │ -│ for great expection run. │ -│ [default: None] │ -│ --template PATH [custom] The file path of │ -│ Jinja template. │ -│ [default: None] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ RDF Options ────────────────────────────────────────────────────────────────╮ -│ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ SQL Options ────────────────────────────────────────────────────────────────╮ -│ --sql-server-type TEXT [sql] The server type to determine the sql │ -│ dialect. By default, it uses 'auto' to │ -│ automatically detect the sql dialect via the │ -│ specified servers in the data contract. │ -│ [default: auto] │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract export [OPTIONS] [LOCATION] + + Convert data contract to a specific format. Saves to file specified by `output` option if present, + otherwise prints to stdout. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ location [LOCATION] The location (url or path) of the data contract yaml. 
│ +│ [default: datacontract.yaml] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --format [jsonschema|pydantic-model|sodacl|db The export format. [default: None] │ +│ t|dbt-sources|dbt-staging-sql|odcs|r [required] │ +│ df|avro|protobuf|great-expectations| │ +│ terraform|avro-idl|sql|sql-query|mer │ +│ maid|html|go|bigquery|dbml|spark|sql │ +│ alchemy|data-caterer|dcs|markdown|ic │ +│ eberg|custom] │ +│ --output PATH Specify the file path where the │ +│ exported data will be saved. If no │ +│ path is provided, the output will be │ +│ printed to stdout. │ +│ [default: None] │ +│ --server TEXT The server name to export. │ +│ [default: None] │ +│ --model TEXT Use the key of the model in the data │ +│ contract yaml file to refer to a │ +│ model, e.g., `orders`, or `all` for │ +│ all models (default). │ +│ [default: all] │ +│ --schema TEXT The location (url or path) of the │ +│ Data Contract Specification JSON │ +│ Schema │ +│ [default: None] │ +│ --engine TEXT [engine] The engine used for great │ +│ expection run. │ +│ [default: None] │ +│ --template PATH [custom] The file path of Jinja │ +│ template. │ +│ [default: None] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ RDF Options ────────────────────────────────────────────────────────────────────────────────────╮ +│ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. [default: None] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ SQL Options ────────────────────────────────────────────────────────────────────────────────────╮ +│ --sql-server-type TEXT [sql] The server type to determine the sql dialect. By default, │ +│ it uses 'auto' to automatically detect the sql dialect via the │ +│ specified servers in the data contract. │ +│ [default: auto] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -912,6 +912,21 @@ Available export options: | `custom` | Export to Custom format with Jinja | ✅ | | Missing something? | Please create an issue on GitHub | TBD | +#### SQL + +The `export` function converts a given data contract into a SQL data definition language (DDL). + +```shell +datacontract export datacontract.yaml --format sql --output output.sql +``` + +If using Databricks, and an error is thrown when trying to deploy the SQL DDLs with `variant` columns set the following properties. + +```shell +spark.conf.set(“spark.databricks.delta.schema.typeCheck.enabled”, “false”) +from datacontract.model import data_contract_specification +data_contract_specification.DATACONTRACT_TYPES.append(“variant”) +``` #### Great Expectations @@ -919,7 +934,7 @@ The `export` function transforms a specified data contract into a comprehensive If the contract includes multiple models, you need to specify the names of the model you wish to export. ```shell -datacontract export datacontract.yaml --format great-expectations --model orders +datacontract export datacontract.yaml --format great-expectations --model orders ``` The export creates a list of expectations by utilizing: @@ -944,7 +959,7 @@ To further customize the export, the following optional arguments are available: #### RDF -The export function converts a given data contract into a RDF representation. 
You have the option to +The `export` function converts a given data contract into a RDF representation. You have the option to add a base_url which will be used as the default prefix to resolve relative IRIs inside the document. ```shell @@ -1169,104 +1184,92 @@ FROM ### import ``` - - Usage: datacontract import [OPTIONS] - - Create a data contract from the given source location. Saves to file specified - by `output` option if present, otherwise prints to stdout. - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ * --format [sql|avro|dbt|dbml|gl The format of the │ -│ ue|jsonschema|bigquer source file. │ -│ y|odcs|unity|spark|ic [default: None] │ -│ eberg|parquet|csv|pro [required] │ -│ tobuf] │ -│ --output PATH Specify the file path │ -│ where the Data │ -│ Contract will be │ -│ saved. If no path is │ -│ provided, the output │ -│ will be printed to │ -│ stdout. │ -│ [default: None] │ -│ --source TEXT The path to the file │ -│ or Glue Database that │ -│ should be imported. │ -│ [default: None] │ -│ --dialect TEXT The SQL dialect to │ -│ use when importing │ -│ SQL files, e.g., │ -│ postgres, tsql, │ -│ bigquery. │ -│ [default: None] │ -│ --glue-table TEXT List of table ids to │ -│ import from the Glue │ -│ Database (repeat for │ -│ multiple table ids, │ -│ leave empty for all │ -│ tables in the │ -│ dataset). │ -│ [default: None] │ -│ --bigquery-project TEXT The bigquery project │ -│ id. │ -│ [default: None] │ -│ --bigquery-dataset TEXT The bigquery dataset │ -│ id. │ -│ [default: None] │ -│ --bigquery-table TEXT List of table ids to │ -│ import from the │ -│ bigquery API (repeat │ -│ for multiple table │ -│ ids, leave empty for │ -│ all tables in the │ -│ dataset). │ -│ [default: None] │ -│ --unity-table-full-n… TEXT Full name of a table │ -│ in the unity catalog │ -│ [default: None] │ -│ --dbt-model TEXT List of models names │ -│ to import from the │ -│ dbt manifest file │ -│ (repeat for multiple │ -│ models names, leave │ -│ empty for all models │ -│ in the dataset). │ -│ [default: None] │ -│ --dbml-schema TEXT List of schema names │ -│ to import from the │ -│ DBML file (repeat for │ -│ multiple schema │ -│ names, leave empty │ -│ for all tables in the │ -│ file). │ -│ [default: None] │ -│ --dbml-table TEXT List of table names │ -│ to import from the │ -│ DBML file (repeat for │ -│ multiple table names, │ -│ leave empty for all │ -│ tables in the file). │ -│ [default: None] │ -│ --iceberg-table TEXT Table name to assign │ -│ to the model created │ -│ from the Iceberg │ -│ schema. │ -│ [default: None] │ -│ --template TEXT The location (url or │ -│ path) of the Data │ -│ Contract │ -│ Specification │ -│ Template │ -│ [default: None] │ -│ --schema TEXT The location (url or │ -│ path) of the Data │ -│ Contract │ -│ Specification JSON │ -│ Schema │ -│ [default: None] │ -│ --help Show this message and │ -│ exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract import [OPTIONS] + + Create a data contract from the given source location. Saves to file specified by `output` option + if present, otherwise prints to stdout. + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --format [sql|avro|dbt|dbml|glue|jsonsc The format of the source file. │ +│ hema|bigquery|odcs|unity|spark [default: None] │ +│ |iceberg|parquet|csv|protobuf| [required] │ +│ excel] │ +│ --output PATH Specify the file path where │ +│ the Data Contract will be │ +│ saved. 
If no path is provided, │ +│ the output will be printed to │ +│ stdout. │ +│ [default: None] │ +│ --source TEXT The path to the file that │ +│ should be imported. │ +│ [default: None] │ +│ --spec [datacontract_specification|od The format of the data │ +│ cs] contract to import. │ +│ [default: │ +│ datacontract_specification] │ +│ --dialect TEXT The SQL dialect to use when │ +│ importing SQL files, e.g., │ +│ postgres, tsql, bigquery. │ +│ [default: None] │ +│ --glue-table TEXT List of table ids to import │ +│ from the Glue Database (repeat │ +│ for multiple table ids, leave │ +│ empty for all tables in the │ +│ dataset). │ +│ [default: None] │ +│ --bigquery-project TEXT The bigquery project id. │ +│ [default: None] │ +│ --bigquery-dataset TEXT The bigquery dataset id. │ +│ [default: None] │ +│ --bigquery-table TEXT List of table ids to import │ +│ from the bigquery API (repeat │ +│ for multiple table ids, leave │ +│ empty for all tables in the │ +│ dataset). │ +│ [default: None] │ +│ --unity-table-full-name TEXT Full name of a table in the │ +│ unity catalog │ +│ [default: None] │ +│ --dbt-model TEXT List of models names to import │ +│ from the dbt manifest file │ +│ (repeat for multiple models │ +│ names, leave empty for all │ +│ models in the dataset). │ +│ [default: None] │ +│ --dbml-schema TEXT List of schema names to import │ +│ from the DBML file (repeat for │ +│ multiple schema names, leave │ +│ empty for all tables in the │ +│ file). │ +│ [default: None] │ +│ --dbml-table TEXT List of table names to import │ +│ from the DBML file (repeat for │ +│ multiple table names, leave │ +│ empty for all tables in the │ +│ file). │ +│ [default: None] │ +│ --iceberg-table TEXT Table name to assign to the │ +│ model created from the Iceberg │ +│ schema. │ +│ [default: None] │ +│ --template TEXT The location (url or path) of │ +│ the Data Contract │ +│ Specification Template │ +│ [default: None] │ +│ --schema TEXT The location (url or path) of │ +│ the Data Contract │ +│ Specification JSON Schema │ +│ [default: None] │ +│ --owner TEXT The owner or team responsible │ +│ for managing the data │ +│ contract. │ +│ [default: None] │ +│ --id TEXT The identifier for the the │ +│ data contract. │ +│ [default: None] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -1282,21 +1285,22 @@ Available import options: | Type | Description | Status | |--------------------|------------------------------------------------|--------| -| `sql` | Import from SQL DDL | ✅ | | `avro` | Import from AVRO schemas | ✅ | -| `glue` | Import from AWS Glue DataCatalog | ✅ | -| `jsonschema` | Import from JSON Schemas | ✅ | | `bigquery` | Import from BigQuery Schemas | ✅ | -| `unity` | Import from Databricks Unity Catalog | partial | -| `dbt` | Import from dbt models | ✅ | -| `odcs` | Import from Open Data Contract Standard (ODCS) | ✅ | -| `spark` | Import from Spark StructTypes | ✅ | -| `dbml` | Import from DBML models | ✅ | | `csv` | Import from CSV File | ✅ | -| `protobuf` | Import from Protobuf schemas | ✅ | +| `dbml` | Import from DBML models | ✅ | +| `dbt` | Import from dbt models | ✅ | +| `excel` | Import from ODCS Excel Template | ✅ | +| `glue` | Import from AWS Glue DataCatalog | ✅ | | `iceberg` | Import from an Iceberg JSON Schema Definition | partial | -| `parquet` | Import from Parquet File Metadta | ✅ | -| Missing something? 
| Please create an issue on GitHub | TBD | +| `jsonschema` | Import from JSON Schemas | ✅ | +| `odcs` | Import from Open Data Contract Standard (ODCS) | ✅ | +| `parquet` | Import from Parquet File Metadata | ✅ | +| `protobuf` | Import from Protobuf schemas | ✅ | +| `spark` | Import from Spark StructTypes, Variant | ✅ | +| `sql` | Import from SQL DDL | ✅ | +| `unity` | Import from Databricks Unity Catalog | partial | +| Missing something? | Please create an issue on GitHub | TBD | #### ODCS @@ -1345,8 +1349,9 @@ datacontract import --format unity --source my_unity_table.json ```bash # Example import single table from Unity Catalog via HTTP endpoint -export DATABRICKS_IMPORT_INSTANCE="https://xyz.cloud.databricks.com" -export DATABRICKS_IMPORT_ACCESS_TOKEN= +export DATACONTRACT_DATABRICKS_SERVER_HOSTNAME="https://xyz.cloud.databricks.com" +export DATACONTRACT_DATABRICKS_HTTP_PATH="/sql/1.0/warehouses/b053a331fa014fb4" +export DATACONTRACT_DATABRICKS_TOKEN= datacontract import --format unity --unity-table-full-name ``` @@ -1367,6 +1372,17 @@ datacontract import --format dbt --source --dbt-model ``` +### Excel + +Importing from [ODCS Excel Template](https://github.com/datacontract/open-data-contract-standard-excel-template). + +Examples: + +```bash +# Example import from ODCS Excel Template +datacontract import --format excel --source odcs.xlsx +``` + #### Glue Importing from Glue reads the necessary Data directly off of the AWS API. @@ -1386,14 +1402,31 @@ datacontract import --format glue --source #### Spark -Importing from Spark table or view these must be created or accessible in the Spark context. Specify tables list in `source` parameter. - -Example: +Importing from Spark table or view these must be created or accessible in the Spark context. Specify tables list in `source` parameter. If the `source` tables are registered as tables in Databricks, and they have a table-level descriptions they will also be added to the Data Contract Specification. ```bash +# Example: Import Spark table(s) from Spark context datacontract import --format spark --source "users,orders" ``` +```bash +# Example: Import Spark table +DataContract().import_from_source("spark", "users") +DataContract().import_from_source(format = "spark", source = "users") + +# Example: Import Spark dataframe +DataContract().import_from_source("spark", "users", dataframe = df_user) +DataContract().import_from_source(format = "spark", source = "users", dataframe = df_user) + +# Example: Import Spark table + table description +DataContract().import_from_source("spark", "users", description = "description") +DataContract().import_from_source(format = "spark", source = "users", description = "description") + +# Example: Import Spark dataframe + table description +DataContract().import_from_source("spark", "users", dataframe = df_user, description = "description") +DataContract().import_from_source(format = "spark", source = "users", dataframe = df_user, description = "description") +``` + #### DBML Importing from DBML Documents. @@ -1458,91 +1491,83 @@ datacontract import --format protobuf --source "test.proto" ### breaking ``` - - Usage: datacontract breaking [OPTIONS] LOCATION_OLD LOCATION_NEW - - Identifies breaking changes between data contracts. Prints to stdout. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ * location_old TEXT The location (url or path) of the old data │ -│ contract yaml. 
│ -│ [default: None] │ -│ [required] │ -│ * location_new TEXT The location (url or path) of the new data │ -│ contract yaml. │ -│ [default: None] │ -│ [required] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract breaking [OPTIONS] LOCATION_OLD LOCATION_NEW + + Identifies breaking changes between data contracts. Prints to stdout. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ * location_old TEXT The location (url or path) of the old data contract yaml. │ +│ [default: None] │ +│ [required] │ +│ * location_new TEXT The location (url or path) of the new data contract yaml. │ +│ [default: None] │ +│ [required] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` ### changelog ``` - - Usage: datacontract changelog [OPTIONS] LOCATION_OLD LOCATION_NEW - - Generate a changelog between data contracts. Prints to stdout. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ * location_old TEXT The location (url or path) of the old data │ -│ contract yaml. │ -│ [default: None] │ -│ [required] │ -│ * location_new TEXT The location (url or path) of the new data │ -│ contract yaml. │ -│ [default: None] │ -│ [required] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract changelog [OPTIONS] LOCATION_OLD LOCATION_NEW + + Generate a changelog between data contracts. Prints to stdout. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ * location_old TEXT The location (url or path) of the old data contract yaml. │ +│ [default: None] │ +│ [required] │ +│ * location_new TEXT The location (url or path) of the new data contract yaml. │ +│ [default: None] │ +│ [required] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` ### diff ``` - - Usage: datacontract diff [OPTIONS] LOCATION_OLD LOCATION_NEW - - PLACEHOLDER. Currently works as 'changelog' does. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ * location_old TEXT The location (url or path) of the old data │ -│ contract yaml. │ -│ [default: None] │ -│ [required] │ -│ * location_new TEXT The location (url or path) of the new data │ -│ contract yaml. 
│ -│ [default: None] │ -│ [required] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract diff [OPTIONS] LOCATION_OLD LOCATION_NEW + + PLACEHOLDER. Currently works as 'changelog' does. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ * location_old TEXT The location (url or path) of the old data contract yaml. │ +│ [default: None] │ +│ [required] │ +│ * location_new TEXT The location (url or path) of the new data contract yaml. │ +│ [default: None] │ +│ [required] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` ### catalog ``` - - Usage: datacontract catalog [OPTIONS] - - Create a html catalog of data contracts. - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --files TEXT Glob pattern for the data contract files to include in │ -│ the catalog. Applies recursively to any subfolders. │ -│ [default: *.yaml] │ -│ --output TEXT Output directory for the catalog html files. │ -│ [default: catalog/] │ -│ --schema TEXT The location (url or path) of the Data Contract │ -│ Specification JSON Schema │ -│ [default: None] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract catalog [OPTIONS] + + Create a html catalog of data contracts. + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --files TEXT Glob pattern for the data contract files to include in the catalog. │ +│ Applies recursively to any subfolders. │ +│ [default: *.yaml] │ +│ --output TEXT Output directory for the catalog html files. [default: catalog/] │ +│ --schema TEXT The location (url or path) of the Data Contract Specification JSON Schema │ +│ [default: None] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -1558,56 +1583,48 @@ datacontract catalog --files "*.odcs.yaml" ### publish ``` - - Usage: datacontract publish [OPTIONS] [LOCATION] - - Publish the data contract to the Data Mesh Manager. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ location [LOCATION] The location (url or path) of the data contract │ -│ yaml. │ -│ [default: datacontract.yaml] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --schema TEXT The location (url or │ -│ path) of the Data │ -│ Contract Specification │ -│ JSON Schema │ -│ [default: None] │ -│ --ssl-verification --no-ssl-verification SSL verification when │ -│ publishing the data │ -│ contract. │ -│ [default: │ -│ ssl-verification] │ -│ --help Show this message and │ -│ exit. 
│ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract publish [OPTIONS] [LOCATION] + + Publish the data contract to the Data Mesh Manager. + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ +│ location [LOCATION] The location (url or path) of the data contract yaml. │ +│ [default: datacontract.yaml] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --schema TEXT The location (url or path) of the Data │ +│ Contract Specification JSON Schema │ +│ [default: None] │ +│ --ssl-verification --no-ssl-verification SSL verification when publishing the data │ +│ contract. │ +│ [default: ssl-verification] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` ### api ``` - - Usage: datacontract api [OPTIONS] - - Start the datacontract CLI as server application with REST API. - The OpenAPI documentation as Swagger UI is available on http://localhost:4242. - You can execute the commands directly from the Swagger UI. - To protect the API, you can set the environment variable - DATACONTRACT_CLI_API_KEY to a secret API key. To authenticate, requests must - include the header 'x-api-key' with the correct API key. This is highly - recommended, as data contract tests may be subject to SQL injections or leak - sensitive information. - To connect to servers (such as a Snowflake data source), set the credentials - as environment variables as documented in https://cli.datacontract.com/#test - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --port INTEGER Bind socket to this port. [default: 4242] │ -│ --host TEXT Bind socket to this host. Hint: For running in │ -│ docker, set it to 0.0.0.0 │ -│ [default: 127.0.0.1] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + + Usage: datacontract api [OPTIONS] + + Start the datacontract CLI as server application with REST API. + The OpenAPI documentation as Swagger UI is available on http://localhost:4242. You can execute the + commands directly from the Swagger UI. + To protect the API, you can set the environment variable DATACONTRACT_CLI_API_KEY to a secret API + key. To authenticate, requests must include the header 'x-api-key' with the correct API key. This + is highly recommended, as data contract tests may be subject to SQL injections or leak sensitive + information. + To connect to servers (such as a Snowflake data source), set the credentials as environment + variables as documented in https://cli.datacontract.com/#test + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --port INTEGER Bind socket to this port. [default: 4242] │ +│ --host TEXT Bind socket to this host. Hint: For running in docker, set it to 0.0.0.0 │ +│ [default: 127.0.0.1] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -1906,7 +1923,8 @@ pytest ```bash # make sure uv is installed uv python pin 3.11 -uv sync --all-extras +uv pip install -e '.[dev]' +uv run ruff check uv run pytest ``` diff --git a/datacontract/api.py b/datacontract/api.py index 422016083..89fe711ab 100644 --- a/datacontract/api.py +++ b/datacontract/api.py @@ -162,7 +162,7 @@ async def test( server: Annotated[ str | None, Query( - example="production", + examples=["production"], description="The server name to test. Optional, if there is only one server.", ), ] = None, @@ -191,7 +191,7 @@ async def lint( schema: Annotated[ str | None, Query( - example="https://datacontract.com/datacontract.schema.json", + examples=["https://datacontract.com/datacontract.schema.json"], description="The schema to use for validation. This must be a URL.", ), ] = None, @@ -220,7 +220,7 @@ def export( server: Annotated[ str | None, Query( - example="production", + examples=["production"], description="The server name to export. Optional, if there is only one server.", ), ] = None, diff --git a/datacontract/catalog/catalog.py b/datacontract/catalog/catalog.py index 03284defb..50a92aecc 100644 --- a/datacontract/catalog/catalog.py +++ b/datacontract/catalog/catalog.py @@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, select_autoescape from datacontract.data_contract import DataContract -from datacontract.export.html_export import get_version +from datacontract.export.html_exporter import get_version from datacontract.model.data_contract_specification import DataContractSpecification diff --git a/datacontract/cli.py b/datacontract/cli.py index 7af97463e..5309ad119 100644 --- a/datacontract/cli.py +++ b/datacontract/cli.py @@ -11,7 +11,7 @@ from datacontract.catalog.catalog import create_data_contract_html, create_index_html from datacontract.data_contract import DataContract, ExportFormat -from datacontract.imports.importer import ImportFormat +from datacontract.imports.importer import ImportFormat, Spec from datacontract.init.init_template import get_init_template from datacontract.integration.datamesh_manager import ( publish_data_contract_to_datamesh_manager, @@ -126,7 +126,8 @@ def test( "servers (default)." ), ] = "all", - publish: Annotated[str, typer.Option(help="The url to publish the results after the test")] = None, + publish_test_results: Annotated[bool, typer.Option(help="Publish the results after the test")] = False, + publish: Annotated[str, typer.Option(help="DEPRECATED. The url to publish the results after the test.")] = None, output: Annotated[ Path, typer.Option( @@ -149,6 +150,7 @@ def test( run = DataContract( data_contract_file=location, schema_location=schema, + publish_test_results=publish_test_results, publish_url=publish, server=server, ssl_verification=ssl_verification, @@ -244,8 +246,12 @@ def import_( ] = None, source: Annotated[ Optional[str], - typer.Option(help="The path to the file or Glue Database that should be imported."), + typer.Option(help="The path to the file that should be imported."), ] = None, + spec: Annotated[ + Spec, + typer.Option(help="The format of the data contract to import. 
"), + ] = Spec.datacontract_specification, dialect: Annotated[ Optional[str], typer.Option(help="The SQL dialect to use when importing SQL files, e.g., postgres, tsql, bigquery."), @@ -265,7 +271,7 @@ def import_( ), ] = None, unity_table_full_name: Annotated[ - Optional[str], typer.Option(help="Full name of a table in the unity catalog") + Optional[List[str]], typer.Option(help="Full name of a table in the unity catalog") ] = None, dbt_model: Annotated[ Optional[List[str]], @@ -297,6 +303,14 @@ def import_( str, typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"), ] = None, + owner: Annotated[ + Optional[str], + typer.Option(help="The owner or team responsible for managing the data contract."), + ] = None, + id: Annotated[ + Optional[str], + typer.Option(help="The identifier for the the data contract."), + ] = None, ): """ Create a data contract from the given source location. Saves to file specified by `output` option if present, otherwise prints to stdout. @@ -304,6 +318,7 @@ def import_( result = DataContract().import_from_source( format=format, source=source, + spec=spec, template=template, schema=schema, dialect=dialect, @@ -316,6 +331,8 @@ def import_( dbml_schema=dbml_schema, dbml_table=dbml_table, iceberg_table=iceberg_table, + owner=owner, + id=id, ) if output is None: console.print(result.to_yaml(), markup=False, soft_wrap=True) diff --git a/datacontract/data_contract.py b/datacontract/data_contract.py index 1ebe81268..7e8e4eda7 100644 --- a/datacontract/data_contract.py +++ b/datacontract/data_contract.py @@ -1,6 +1,12 @@ import logging import typing +from open_data_contract_standard.model import CustomProperty, OpenDataContractStandard + +from datacontract.export.odcs_v3_exporter import to_odcs_v3 +from datacontract.imports.importer import Spec +from datacontract.imports.odcs_v3_importer import import_from_odcs + if typing.TYPE_CHECKING: from pyspark.sql import SparkSession @@ -24,9 +30,8 @@ from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter -from datacontract.lint.linters.quality_schema_linter import QualityUsesSchemaLinter from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter -from datacontract.model.data_contract_specification import DataContractSpecification +from datacontract.model.data_contract_specification import DataContractSpecification, Info from datacontract.model.exceptions import DataContractException from datacontract.model.run import Check, ResultEnum, Run @@ -45,6 +50,7 @@ def __init__( inline_definitions: bool = True, inline_quality: bool = True, ssl_verification: bool = True, + publish_test_results: bool = False, ): self._data_contract_file = data_contract_file self._data_contract_str = data_contract_str @@ -52,13 +58,13 @@ def __init__( self._schema_location = schema_location self._server = server self._publish_url = publish_url + self._publish_test_results = publish_test_results self._spark = spark self._duckdb_connection = duckdb_connection self._inline_definitions = inline_definitions self._inline_quality = inline_quality self._ssl_verification = ssl_verification self.all_linters = { - QualityUsesSchemaLinter(), FieldPatternLinter(), FieldReferenceLinter(), NoticePeriodLinter(), @@ -180,7 +186,7 @@ def test(self) -> Run: run.finish() - if self._publish_url is not None: + if self._publish_url 
is not None or self._publish_test_results: publish_test_results_to_datamesh_manager(run, self._publish_url, self._ssl_verification) return run @@ -245,33 +251,128 @@ def get_data_contract_specification(self) -> DataContractSpecification: ) def export(self, export_format: ExportFormat, model: str = "all", sql_server_type: str = "auto", **kwargs) -> str: - data_contract = resolve.resolve_data_contract( - self._data_contract_file, - self._data_contract_str, - self._data_contract, - schema_location=self._schema_location, - inline_definitions=self._inline_definitions, - inline_quality=self._inline_quality, - ) + if export_format == ExportFormat.html or export_format == ExportFormat.mermaid: + data_contract = resolve.resolve_data_contract_v2( + self._data_contract_file, + self._data_contract_str, + self._data_contract, + schema_location=self._schema_location, + inline_definitions=self._inline_definitions, + inline_quality=self._inline_quality, + ) - return exporter_factory.create(export_format).export( - data_contract=data_contract, - model=model, - server=self._server, - sql_server_type=sql_server_type, - export_args=kwargs, - ) + return exporter_factory.create(export_format).export( + data_contract=data_contract, + model=model, + server=self._server, + sql_server_type=sql_server_type, + export_args=kwargs, + ) + else: + data_contract = resolve.resolve_data_contract( + self._data_contract_file, + self._data_contract_str, + self._data_contract, + schema_location=self._schema_location, + inline_definitions=self._inline_definitions, + inline_quality=self._inline_quality, + ) + + return exporter_factory.create(export_format).export( + data_contract=data_contract, + model=model, + server=self._server, + sql_server_type=sql_server_type, + export_args=kwargs, + ) + # REFACTOR THIS + # could be a class method, not using anything from the instance def import_from_source( self, format: str, source: typing.Optional[str] = None, template: typing.Optional[str] = None, schema: typing.Optional[str] = None, + spec: Spec = Spec.datacontract_specification, **kwargs, - ) -> DataContractSpecification: - data_contract_specification_initial = DataContract.init(template=template, schema=schema) + ) -> DataContractSpecification | OpenDataContractStandard: + id = kwargs.get("id") + owner = kwargs.get("owner") - return importer_factory.create(format).import_source( - data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs - ) + if spec == Spec.odcs: + data_contract_specification_initial = DataContract.init(template=template, schema=schema) + + odcs_imported = importer_factory.create(format).import_source( + data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs + ) + + if isinstance(odcs_imported, DataContractSpecification): + # convert automatically + odcs_imported = to_odcs_v3(odcs_imported) + + self._overwrite_id_in_odcs(odcs_imported, id) + self._overwrite_owner_in_odcs(odcs_imported, owner) + + return odcs_imported + elif spec == Spec.datacontract_specification: + data_contract_specification_initial = DataContract.init(template=template, schema=schema) + + data_contract_specification_imported = importer_factory.create(format).import_source( + data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs + ) + + if isinstance(data_contract_specification_imported, OpenDataContractStandard): + # convert automatically + data_contract_specification_imported = import_from_odcs( + 
data_contract_specification_initial, data_contract_specification_imported + ) + + self._overwrite_id_in_data_contract_specification(data_contract_specification_imported, id) + self._overwrite_owner_in_data_contract_specification(data_contract_specification_imported, owner) + + return data_contract_specification_imported + else: + raise DataContractException( + type="general", + result=ResultEnum.error, + name="Import Data Contract", + reason=f"Unsupported data contract format: {spec}", + engine="datacontract", + ) + + def _overwrite_id_in_data_contract_specification( + self, data_contract_specification: DataContractSpecification, id: str | None + ): + if not id: + return + + data_contract_specification.id = id + + def _overwrite_owner_in_data_contract_specification( + self, data_contract_specification: DataContractSpecification, owner: str | None + ): + if not owner: + return + + if data_contract_specification.info is None: + data_contract_specification.info = Info() + data_contract_specification.info.owner = owner + + def _overwrite_owner_in_odcs(self, odcs: OpenDataContractStandard, owner: str | None): + if not owner: + return + + if odcs.customProperties is None: + odcs.customProperties = [] + for customProperty in odcs.customProperties: + if customProperty.name == "owner": + customProperty.value = owner + return + odcs.customProperties.append(CustomProperty(property="owner", value=owner)) + + def _overwrite_id_in_odcs(self, odcs: OpenDataContractStandard, id: str | None): + if not id: + return + + odcs.id = id diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index 2be6c649e..4def63173 100644 --- a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -502,11 +502,13 @@ def prepare_query(quality: Quality, model_name: str, field_name: str = None) -> query = quality.query query = query.replace("{model}", model_name) + query = query.replace("{schema}", model_name) query = query.replace("{table}", model_name) if field_name is not None: query = query.replace("{field}", field_name) query = query.replace("{column}", field_name) + query = query.replace("{property}", field_name) return query diff --git a/datacontract/engines/soda/connections/duckdb_connection.py b/datacontract/engines/soda/connections/duckdb_connection.py index f05fce2f6..f05f1c762 100644 --- a/datacontract/engines/soda/connections/duckdb_connection.py +++ b/datacontract/engines/soda/connections/duckdb_connection.py @@ -1,10 +1,10 @@ import os -from typing import Any +from typing import Any, Dict import duckdb -from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type -from datacontract.model.data_contract_specification import DataContractSpecification, Server +from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type +from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server from datacontract.model.run import Run @@ -12,8 +12,8 @@ def get_duckdb_connection( data_contract: DataContractSpecification, server: Server, run: Run, - duckdb_connection: duckdb.DuckDBPyConnection = None, -): + duckdb_connection: duckdb.DuckDBPyConnection | None = None, +) -> duckdb.DuckDBPyConnection: if duckdb_connection is None: con = duckdb.connect(database=":memory:") else: @@ -43,9 +43,16 @@ def get_duckdb_connection( json_format = "newline_delimited" elif server.delimiter == "array": json_format = "array" - con.sql(f""" + columns 
= to_json_types(model) + if columns is None: + con.sql(f""" CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1); """) + else: + con.sql( + f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);""" + ) + add_nested_views(con, model_name, model.fields) elif server.format == "parquet": con.sql(f""" CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1); @@ -77,6 +84,45 @@ def to_csv_types(model) -> dict[Any, str | None] | None: return columns +def to_json_types(model: Model) -> dict[Any, str | None] | None: + if model is None: + return None + columns = {} + for field_name, field in model.fields.items(): + columns[field_name] = convert_to_duckdb_json_type(field) + return columns + + +def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None): + model_name = model_name.strip('"') + if fields is None: + return + for field_name, field in fields.items(): + if field.type is None or field.type.lower() not in ["array", "object"]: + continue + field_type = field.type.lower() + if field_type == "array" and field.items is None: + continue + elif field_type == "object" and field.fields is None: + continue + + nested_model_name = f"{model_name}__{field_name}" + max_depth = 2 if field_type == "array" else 1 + + ## if parent field is not required, the nested objects may respolve + ## to a row of NULLs -- but if the objects themselves have required + ## fields, this will fail the check. + where = "" if field.required else f" WHERE {field_name} IS NOT NULL" + con.sql(f""" + CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS + SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where} + """) + if field_type == "array": + add_nested_views(con, nested_model_name, field.items.fields) + elif field_type == "object": + add_nested_views(con, nested_model_name, field.fields) + + def setup_s3_connection(con, server): s3_region = os.getenv("DATACONTRACT_S3_REGION") s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID") diff --git a/datacontract/engines/soda/connections/kafka.py b/datacontract/engines/soda/connections/kafka.py index 2a2445bf4..94c1ce24d 100644 --- a/datacontract/engines/soda/connections/kafka.py +++ b/datacontract/engines/soda/connections/kafka.py @@ -27,6 +27,7 @@ def create_spark_session(): tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark") atexit.register(tmp_dir.cleanup) + pyspark_version = "3.5.5" # MUST be the same as in the pyproject.toml spark = ( SparkSession.builder.appName("datacontract") .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse") @@ -34,7 +35,7 @@ def create_spark_session(): .config("spark.ui.enabled", "false") .config( "spark.jars.packages", - "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5,org.apache.spark:spark-avro_2.12:3.5.5", + f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}", ) .getOrCreate() ) diff --git a/datacontract/export/bigquery_converter.py b/datacontract/export/bigquery_converter.py index e41a20429..71222d46c 100644 --- a/datacontract/export/bigquery_converter.py +++ b/datacontract/export/bigquery_converter.py @@ -103,7 +103,7 @@ def map_type_to_bigquery(field: Field) -> str: elif field_type.lower() == "date": return "DATE" elif field_type.lower() == "timestamp_ntz": - return "TIME" + 
return "DATETIME" elif field_type.lower() in ["number", "decimal", "numeric"]: return "NUMERIC" elif field_type.lower() == "double": diff --git a/datacontract/export/csv_type_converter.py b/datacontract/export/csv_type_converter.py deleted file mode 100644 index 79dfe1668..000000000 --- a/datacontract/export/csv_type_converter.py +++ /dev/null @@ -1,36 +0,0 @@ -# https://duckdb.org/docs/data/csv/overview.html -# ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR'] -def convert_to_duckdb_csv_type(field) -> None | str: - type = field.type - if type is None: - return "VARCHAR" - if type.lower() in ["string", "varchar", "text"]: - return "VARCHAR" - if type.lower() in ["timestamp", "timestamp_tz"]: - return "TIMESTAMP" - if type.lower() in ["timestamp_ntz"]: - return "TIMESTAMP" - if type.lower() in ["date"]: - return "DATE" - if type.lower() in ["time"]: - return "TIME" - if type.lower() in ["number", "decimal", "numeric"]: - # precision and scale not supported by data contract - return "VARCHAR" - if type.lower() in ["float", "double"]: - return "DOUBLE" - if type.lower() in ["integer", "int", "long", "bigint"]: - return "BIGINT" - if type.lower() in ["boolean"]: - return "BOOLEAN" - if type.lower() in ["object", "record", "struct"]: - # not supported in CSV - return "VARCHAR" - if type.lower() in ["bytes"]: - # not supported in CSV - return "VARCHAR" - if type.lower() in ["array"]: - return "VARCHAR" - if type.lower() in ["null"]: - return "SQLNULL" - return "VARCHAR" diff --git a/datacontract/export/dbt_converter.py b/datacontract/export/dbt_converter.py index a9cb1fcaf..3cf07afe3 100644 --- a/datacontract/export/dbt_converter.py +++ b/datacontract/export/dbt_converter.py @@ -27,7 +27,7 @@ def export(self, data_contract, model, server, sql_server_type, export_args) -> ) -def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None): +def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None) -> str: dbt = { "version": 2, "models": [], @@ -102,8 +102,11 @@ def _to_dbt_model( "name": model_key, } model_type = _to_dbt_model_type(model_value.type) + dbt_model["config"] = {"meta": {"data_contract": data_contract_spec.id}} - dbt_model["config"]["materialized"] = model_type + + if model_type: + dbt_model["config"]["materialized"] = model_type if data_contract_spec.info.owner is not None: dbt_model["config"]["meta"]["owner"] = data_contract_spec.info.owner @@ -123,7 +126,7 @@ def _to_dbt_model_type(model_type): # Allowed values: table, view, incremental, ephemeral, materialized view # Custom values also possible if model_type is None: - return "table" + return None if model_type.lower() == "table": return "table" if model_type.lower() == "view": diff --git a/datacontract/export/duckdb_type_converter.py b/datacontract/export/duckdb_type_converter.py new file mode 100644 index 000000000..cf57398e1 --- /dev/null +++ b/datacontract/export/duckdb_type_converter.py @@ -0,0 +1,57 @@ +from typing import Dict + +from datacontract.model.data_contract_specification import Field + + +# https://duckdb.org/docs/data/csv/overview.html +# ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR'] +def convert_to_duckdb_csv_type(field) -> None | str: + datacontract_type = field.type + if datacontract_type is None: + return "VARCHAR" + if datacontract_type.lower() in ["string", "varchar", "text"]: + return "VARCHAR" + if datacontract_type.lower() in ["timestamp", "timestamp_tz"]: + return "TIMESTAMP" + 
if datacontract_type.lower() in ["timestamp_ntz"]: + return "TIMESTAMP" + if datacontract_type.lower() in ["date"]: + return "DATE" + if datacontract_type.lower() in ["time"]: + return "TIME" + if datacontract_type.lower() in ["number", "decimal", "numeric"]: + # precision and scale not supported by data contract + return "VARCHAR" + if datacontract_type.lower() in ["float", "double"]: + return "DOUBLE" + if datacontract_type.lower() in ["integer", "int", "long", "bigint"]: + return "BIGINT" + if datacontract_type.lower() in ["boolean"]: + return "BOOLEAN" + if datacontract_type.lower() in ["object", "record", "struct"]: + # not supported in CSV + return "VARCHAR" + if datacontract_type.lower() in ["bytes"]: + # not supported in CSV + return "VARCHAR" + if datacontract_type.lower() in ["array"]: + return "VARCHAR" + if datacontract_type.lower() in ["null"]: + return "SQLNULL" + return "VARCHAR" + + +def convert_to_duckdb_json_type(field: Field) -> None | str: + datacontract_type = field.type + if datacontract_type is None: + return "VARCHAR" + if datacontract_type.lower() in ["array"]: + return convert_to_duckdb_json_type(field.items) + "[]" # type: ignore + if datacontract_type.lower() in ["object", "record", "struct"]: + return convert_to_duckdb_object(field.fields) + return convert_to_duckdb_csv_type(field) + + +def convert_to_duckdb_object(fields: Dict[str, Field]): + columns = [f'"{x[0]}" {convert_to_duckdb_json_type(x[1])}' for x in fields.items()] + return f"STRUCT({', '.join(columns)})" diff --git a/datacontract/export/exporter.py b/datacontract/export/exporter.py index f0b9d2f3e..2c3864b38 100644 --- a/datacontract/export/exporter.py +++ b/datacontract/export/exporter.py @@ -33,6 +33,7 @@ class ExportFormat(str, Enum): avro_idl = "avro-idl" sql = "sql" sql_query = "sql-query" + mermaid = "mermaid" html = "html" go = "go" bigquery = "bigquery" diff --git a/datacontract/export/exporter_factory.py b/datacontract/export/exporter_factory.py index 4804f7993..c9d8419ce 100644 --- a/datacontract/export/exporter_factory.py +++ b/datacontract/export/exporter_factory.py @@ -89,6 +89,12 @@ def load_module_class(module_path, class_name): class_name="DbtExporter", ) +exporter_factory.register_lazy_exporter( + name=ExportFormat.mermaid, + module_path="datacontract.export.mermaid_exporter", + class_name="MermaidExporter", +) + exporter_factory.register_lazy_exporter( name=ExportFormat.dbt_sources, module_path="datacontract.export.dbt_converter", @@ -127,7 +133,7 @@ def load_module_class(module_path, class_name): exporter_factory.register_lazy_exporter( name=ExportFormat.html, - module_path="datacontract.export.html_export", + module_path="datacontract.export.html_exporter", class_name="HtmlExporter", ) diff --git a/datacontract/export/great_expectations_converter.py b/datacontract/export/great_expectations_converter.py index a8202ea9b..d49129f7d 100644 --- a/datacontract/export/great_expectations_converter.py +++ b/datacontract/export/great_expectations_converter.py @@ -19,6 +19,7 @@ from datacontract.export.sql_type_converter import convert_to_sql_type from datacontract.model.data_contract_specification import ( DataContractSpecification, + DeprecatedQuality, Field, Quality, ) @@ -91,8 +92,14 @@ def to_great_expectations( model_key=model_key, contract_version=data_contract_spec.info.version ) model_value = data_contract_spec.models.get(model_key) - quality_checks = get_quality_checks(data_contract_spec.quality) + + # Support for Deprecated Quality + quality_checks = 
get_deprecated_quality_checks(data_contract_spec.quality) + + expectations.extend(get_quality_checks(model_value.quality)) + expectations.extend(model_to_expectations(model_value.fields, engine, sql_server_type)) + expectations.extend(checks_to_expectations(quality_checks, model_key)) model_expectation_suite = to_suite(expectations, expectation_suite_name) @@ -135,6 +142,7 @@ def model_to_expectations(fields: Dict[str, Field], engine: str | None, sql_serv add_column_order_exp(fields, expectations) for field_name, field in fields.items(): add_field_expectations(field_name, field, expectations, engine, sql_server_type) + expectations.extend(get_quality_checks(field.quality, field_name)) return expectations @@ -173,6 +181,8 @@ def add_field_expectations( expectations.append(to_column_length_exp(field_name, field.minLength, field.maxLength)) if field.minimum is not None or field.maximum is not None: expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum)) + if field.enum is not None and len(field.enum) != 0: + expectations.append(to_column_enum_exp(field_name, field.enum)) return expectations @@ -266,7 +276,24 @@ def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]: } -def get_quality_checks(quality: Quality) -> Dict[str, Any]: +def to_column_enum_exp(field_name, enum_list: List[str]) -> Dict[str, Any]: + """Creates a expect_column_values_to_be_in_set expectation. + + Args: + field_name (str): The name of the field. + enum_list (Set[str]): enum list of value. + + Returns: + Dict[str, Any]: Column value in set expectation. + """ + return { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": {"column": field_name, "value_set": enum_list}, + "meta": {}, + } + + +def get_deprecated_quality_checks(quality: DeprecatedQuality) -> Dict[str, Any]: """Retrieves quality checks defined in a data contract. Args: @@ -288,6 +315,26 @@ def get_quality_checks(quality: Quality) -> Dict[str, Any]: return quality_specification +def get_quality_checks(qualities: List[Quality], field_name: str | None = None) -> List[Dict[str, Any]]: + """Retrieves quality checks defined in a data contract. + + Args: + qualities (List[Quality]): List of quality object from the model specification. + field_name (str | None): field name if the quality list is attached to a specific field + + Returns: + Dict[str, Any]: Dictionary of quality checks. + """ + quality_specification = [] + for quality in qualities: + if quality is not None and quality.engine is not None and quality.engine.lower() == "great-expectations": + ge_expectation = quality.implementation + if field_name is not None: + ge_expectation["column"] = field_name + quality_specification.append(ge_expectation) + return quality_specification + + def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]: """Converts quality checks to a list of expectations. 
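For orientation, the field-level mapping introduced above can be summarised with a small standalone sketch (not part of the diff; field names and values are hypothetical): a field `enum` becomes an `expect_column_values_to_be_in_set` expectation, and a field-level `quality` entry with `engine: great-expectations` is passed through with the column name injected, mirroring `to_column_enum_exp` and `get_quality_checks` above.

```
# Standalone sketch mirroring the converter logic above (hypothetical field names/values).
from typing import Any, Dict, List


def enum_expectation(field_name: str, enum_list: List[str]) -> Dict[str, Any]:
    # Same output shape as to_column_enum_exp() in the diff above.
    return {
        "expectation_type": "expect_column_values_to_be_in_set",
        "kwargs": {"column": field_name, "value_set": enum_list},
        "meta": {},
    }


def passthrough_quality(implementation: Dict[str, Any], field_name: str) -> Dict[str, Any]:
    # Field-level quality with engine "great-expectations": the implementation dict
    # is taken as-is and the column name is injected (see get_quality_checks above).
    implementation = dict(implementation)
    implementation["column"] = field_name
    return implementation


if __name__ == "__main__":
    print(enum_expectation("order_status", ["open", "shipped", "cancelled"]))
    print(
        passthrough_quality(
            {"expectation_type": "expect_column_values_to_not_be_null", "kwargs": {}},
            "order_id",
        )
    )
```

Because the implementation dict is passed through unchanged (apart from the injected column), contract authors can apparently supply any expectation definition directly in the quality block; the converter itself does not validate it.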
diff --git a/datacontract/export/html_export.py b/datacontract/export/html_exporter.py similarity index 51% rename from datacontract/export/html_export.py rename to datacontract/export/html_exporter.py index bc1b1c101..53b26b28d 100644 --- a/datacontract/export/html_export.py +++ b/datacontract/export/html_exporter.py @@ -6,8 +6,10 @@ import pytz import yaml from jinja2 import Environment, PackageLoader, select_autoescape +from open_data_contract_standard.model import OpenDataContractStandard from datacontract.export.exporter import Exporter +from datacontract.export.mermaid_exporter import to_mermaid from datacontract.model.data_contract_specification import DataContractSpecification @@ -16,7 +18,7 @@ def export(self, data_contract, model, server, sql_server_type, export_args) -> return to_html(data_contract) -def to_html(data_contract_spec: DataContractSpecification) -> str: +def to_html(data_contract_spec: DataContractSpecification | OpenDataContractStandard) -> str: # Load templates from templates folder package_loader = PackageLoader("datacontract", "templates") env = Environment( @@ -31,28 +33,30 @@ def to_html(data_contract_spec: DataContractSpecification) -> str: # Load the required template # needs to be included in /MANIFEST.in - template = env.get_template("datacontract.html") - - if data_contract_spec.quality is not None and isinstance(data_contract_spec.quality.specification, str): - quality_specification = data_contract_spec.quality.specification - elif data_contract_spec.quality is not None and isinstance(data_contract_spec.quality.specification, object): - if data_contract_spec.quality.type == "great-expectations": - quality_specification = yaml.dump( - data_contract_spec.quality.specification, sort_keys=False, default_style="|" - ) - else: - quality_specification = yaml.dump(data_contract_spec.quality.specification, sort_keys=False) - else: - quality_specification = None + template_file = "datacontract.html" + if isinstance(data_contract_spec, OpenDataContractStandard): + template_file = "datacontract_odcs.html" + + template = env.get_template(template_file) style_content, _, _ = package_loader.get_source(env, "style/output.css") + quality_specification = None + if isinstance(data_contract_spec, DataContractSpecification): + if data_contract_spec.quality is not None and isinstance(data_contract_spec.quality.specification, str): + quality_specification = data_contract_spec.quality.specification + elif data_contract_spec.quality is not None and isinstance(data_contract_spec.quality.specification, object): + if data_contract_spec.quality.type == "great-expectations": + quality_specification = yaml.dump( + data_contract_spec.quality.specification, sort_keys=False, default_style="|" + ) + else: + quality_specification = yaml.dump(data_contract_spec.quality.specification, sort_keys=False) + datacontract_yaml = data_contract_spec.to_yaml() - tz = pytz.timezone("UTC") - now = datetime.datetime.now(tz) - formatted_date = now.strftime("%d %b %Y %H:%M:%S UTC") - datacontract_cli_version = get_version() + # Get the mermaid diagram + mermaid_diagram = to_mermaid(data_contract_spec) # Render the template with necessary data html_string = template.render( @@ -60,13 +64,20 @@ def to_html(data_contract_spec: DataContractSpecification) -> str: quality_specification=quality_specification, style=style_content, datacontract_yaml=datacontract_yaml, - formatted_date=formatted_date, - datacontract_cli_version=datacontract_cli_version, + formatted_date=_formatted_date(), + 
datacontract_cli_version=get_version(), + mermaid_diagram=mermaid_diagram, ) return html_string +def _formatted_date() -> str: + tz = pytz.timezone("UTC") + now = datetime.datetime.now(tz) + return now.strftime("%d %b %Y %H:%M:%S UTC") + + def get_version() -> str: try: return version("datacontract_cli") diff --git a/datacontract/export/mermaid_exporter.py b/datacontract/export/mermaid_exporter.py new file mode 100644 index 000000000..6653b7aa0 --- /dev/null +++ b/datacontract/export/mermaid_exporter.py @@ -0,0 +1,97 @@ +from open_data_contract_standard.model import OpenDataContractStandard + +from datacontract.export.exporter import Exporter +from datacontract.model.data_contract_specification import DataContractSpecification + + +class MermaidExporter(Exporter): + def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + return to_mermaid(data_contract) + + +def to_mermaid(data_contract_spec: DataContractSpecification | OpenDataContractStandard) -> str | None: + if isinstance(data_contract_spec, DataContractSpecification): + return dcs_to_mermaid(data_contract_spec) + elif isinstance(data_contract_spec, OpenDataContractStandard): + return odcs_to_mermaid(data_contract_spec) + else: + return None + + +def dcs_to_mermaid(data_contract_spec: DataContractSpecification) -> str | None: + try: + if not data_contract_spec.models: + return None + + mmd_entity = "erDiagram\n" + mmd_references = [] + + for model_name, model in data_contract_spec.models.items(): + entity_block = "" + + for field_name, field in model.fields.items(): + clean_name = _sanitize_name(field_name) + indicators = "" + + if field.primaryKey or (field.unique and field.required): + indicators += "🔑" + if field.references: + indicators += "⌘" + + field_type = field.type or "unknown" + entity_block += f"\t{clean_name}{indicators} {field_type}\n" + + if field.references: + referenced_model = field.references.split(".")[0] if "." 
in field.references else "" + if referenced_model: + mmd_references.append(f'"📑{referenced_model}"' + "}o--{ ||" + f'"📑{model_name}"') + + mmd_entity += f'\t"**{model_name}**"' + "{\n" + entity_block + "}\n" + + if mmd_references: + mmd_entity += "\n" + "\n".join(mmd_references) + + return f"{mmd_entity}\n" + + except Exception as e: + print(f"Error generating DCS mermaid diagram: {e}") + return None + + +def odcs_to_mermaid(data_contract_spec: OpenDataContractStandard) -> str | None: + try: + if not data_contract_spec.schema_: + return None + + mmd_entity = "erDiagram\n" + + for schema in data_contract_spec.schema_: + schema_name = schema.name or schema.physicalName + entity_block = "" + + if schema.properties: + for prop in schema.properties: + clean_name = _sanitize_name(prop.name) + indicators = "" + + if prop.primaryKey: + indicators += "🔑" + if getattr(prop, "partitioned", False): + indicators += "🔀" + if getattr(prop, "criticalDataElement", False): + indicators += "⚠️" + + prop_type = prop.logicalType or prop.physicalType or "unknown" + entity_block += f"\t{clean_name}{indicators} {prop_type}\n" + + mmd_entity += f'\t"**{schema_name}**"' + "{\n" + entity_block + "}\n" + + return f"{mmd_entity}\n" + + except Exception as e: + print(f"Error generating ODCS mermaid diagram: {e}") + return None + + +def _sanitize_name(name: str) -> str: + return name.replace("#", "Nb").replace(" ", "_").replace("/", "by") diff --git a/datacontract/export/odcs_v3_exporter.py b/datacontract/export/odcs_v3_exporter.py index 6a34af946..4dfe9e7f9 100644 --- a/datacontract/export/odcs_v3_exporter.py +++ b/datacontract/export/odcs_v3_exporter.py @@ -1,6 +1,17 @@ -from typing import Dict - -import yaml +from typing import Any, Dict + +from open_data_contract_standard.model import ( + CustomProperty, + DataQuality, + Description, + OpenDataContractStandard, + Role, + SchemaObject, + SchemaProperty, + Server, + ServiceLevelAgreementProperty, + Support, +) from datacontract.export.exporter import Exporter from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model @@ -12,155 +23,146 @@ def export(self, data_contract, model, server, sql_server_type, export_args) -> def to_odcs_v3_yaml(data_contract_spec: DataContractSpecification) -> str: - odcs = { - "apiVersion": "v3.0.1", - "kind": "DataContract", - "id": data_contract_spec.id, - "name": data_contract_spec.info.title, - "version": data_contract_spec.info.version, - "status": to_status(data_contract_spec.info.status), - } + result = to_odcs_v3(data_contract_spec) + + return result.to_yaml() + +def to_odcs_v3(data_contract_spec: DataContractSpecification) -> OpenDataContractStandard: + result = OpenDataContractStandard( + apiVersion="v3.0.1", + kind="DataContract", + id=data_contract_spec.id, + name=data_contract_spec.info.title, + version=data_contract_spec.info.version, + status=to_status(data_contract_spec.info.status), + ) if data_contract_spec.terms is not None: - odcs["description"] = { - "purpose": data_contract_spec.terms.description.strip() + result.description = Description( + purpose=data_contract_spec.terms.description.strip() if data_contract_spec.terms.description is not None else None, - "usage": data_contract_spec.terms.usage.strip() if data_contract_spec.terms.usage is not None else None, - "limitations": data_contract_spec.terms.limitations.strip() + usage=data_contract_spec.terms.usage.strip() if data_contract_spec.terms.usage is not None else None, + limitations=data_contract_spec.terms.limitations.strip() 
if data_contract_spec.terms.limitations is not None else None, - } - - odcs["schema"] = [] + ) + result.schema_ = [] for model_key, model_value in data_contract_spec.models.items(): odcs_schema = to_odcs_schema(model_key, model_value) - odcs["schema"].append(odcs_schema) - + result.schema_.append(odcs_schema) if data_contract_spec.servicelevels is not None: slas = [] if data_contract_spec.servicelevels.availability is not None: slas.append( - { - "property": "generalAvailability", - "value": data_contract_spec.servicelevels.availability.description, - } + ServiceLevelAgreementProperty( + property="generalAvailability", value=data_contract_spec.servicelevels.availability.description + ) ) if data_contract_spec.servicelevels.retention is not None: - slas.append({"property": "retention", "value": data_contract_spec.servicelevels.retention.period}) + slas.append( + ServiceLevelAgreementProperty( + property="retention", value=data_contract_spec.servicelevels.retention.period + ) + ) if len(slas) > 0: - odcs["slaProperties"] = slas - + result.slaProperties = slas if data_contract_spec.info.contact is not None: support = [] if data_contract_spec.info.contact.email is not None: - support.append( - { - "channel": "email", - "url": "mailto:" + data_contract_spec.info.contact.email, - } - ) + support.append(Support(channel="email", url="mailto:" + data_contract_spec.info.contact.email)) if data_contract_spec.info.contact.url is not None: - support.append( - { - "channel": "other", - "url": data_contract_spec.info.contact.url, - } - ) + support.append(Support(channel="other", url=data_contract_spec.info.contact.url)) if len(support) > 0: - odcs["support"] = support - + result.support = support if data_contract_spec.servers is not None and len(data_contract_spec.servers) > 0: servers = [] for server_key, server_value in data_contract_spec.servers.items(): - server_dict = {} - server_dict["server"] = server_key - if server_value.type is not None: - server_dict["type"] = server_value.type + server = Server(server=server_key, type=server_value.type or "") + + # Set all the attributes that are not None if server_value.environment is not None: - server_dict["environment"] = server_value.environment + server.environment = server_value.environment if server_value.account is not None: - server_dict["account"] = server_value.account + server.account = server_value.account if server_value.database is not None: - server_dict["database"] = server_value.database + server.database = server_value.database if server_value.schema_ is not None: - server_dict["schema"] = server_value.schema_ + server.schema_ = server_value.schema_ if server_value.format is not None: - server_dict["format"] = server_value.format + server.format = server_value.format if server_value.project is not None: - server_dict["project"] = server_value.project + server.project = server_value.project if server_value.dataset is not None: - server_dict["dataset"] = server_value.dataset + server.dataset = server_value.dataset if server_value.path is not None: - server_dict["path"] = server_value.path + server.path = server_value.path if server_value.delimiter is not None: - server_dict["delimiter"] = server_value.delimiter + server.delimiter = server_value.delimiter if server_value.endpointUrl is not None: - server_dict["endpointUrl"] = server_value.endpointUrl + server.endpointUrl = server_value.endpointUrl if server_value.location is not None: - server_dict["location"] = server_value.location + server.location = server_value.location if 
server_value.host is not None: - server_dict["host"] = server_value.host + server.host = server_value.host if server_value.port is not None: - server_dict["port"] = server_value.port + server.port = server_value.port if server_value.catalog is not None: - server_dict["catalog"] = server_value.catalog + server.catalog = server_value.catalog if server_value.topic is not None: - server_dict["topic"] = server_value.topic + server.topic = server_value.topic if server_value.http_path is not None: - server_dict["http_path"] = server_value.http_path + server.http_path = server_value.http_path if server_value.token is not None: - server_dict["token"] = server_value.token + server.token = server_value.token if server_value.driver is not None: - server_dict["driver"] = server_value.driver + server.driver = server_value.driver + if server_value.roles is not None: - server_dict["roles"] = [ - {"name": role.name, "description": role.description} for role in server_value.roles - ] - servers.append(server_dict) + server.roles = [Role(role=role.name, description=role.description) for role in server_value.roles] - if len(servers) > 0: - odcs["servers"] = servers + servers.append(server) - odcs["customProperties"] = [] + if len(servers) > 0: + result.servers = servers + custom_properties = [] if data_contract_spec.info.owner is not None: - odcs["customProperties"].append({"property": "owner", "value": data_contract_spec.info.owner}) + custom_properties.append(CustomProperty(property="owner", value=data_contract_spec.info.owner)) if data_contract_spec.info.model_extra is not None: for key, value in data_contract_spec.info.model_extra.items(): - odcs["customProperties"].append({"property": key, "value": value}) - if len(odcs["customProperties"]) == 0: - del odcs["customProperties"] + custom_properties.append(CustomProperty(property=key, value=value)) + if len(custom_properties) > 0: + result.customProperties = custom_properties + return result - return yaml.safe_dump(odcs, indent=2, sort_keys=False, allow_unicode=True) +def to_odcs_schema(model_key, model_value: Model) -> SchemaObject: + schema_obj = SchemaObject( + name=model_key, physicalName=model_key, logicalType="object", physicalType=model_value.type + ) -def to_odcs_schema(model_key, model_value: Model) -> dict: - odcs_table = { - "name": model_key, - "physicalName": model_key, - "logicalType": "object", - "physicalType": model_value.type, - } if model_value.description is not None: - odcs_table["description"] = model_value.description + schema_obj.description = model_value.description + properties = to_properties(model_value.fields) if properties: - odcs_table["properties"] = properties + schema_obj.properties = properties model_quality = to_odcs_quality_list(model_value.quality) if len(model_quality) > 0: - odcs_table["quality"] = model_quality + schema_obj.quality = model_quality - odcs_table["customProperties"] = [] + custom_properties = [] if model_value.model_extra is not None: for key, value in model_value.model_extra.items(): - odcs_table["customProperties"].append({"property": key, "value": value}) - if len(odcs_table["customProperties"]) == 0: - del odcs_table["customProperties"] + custom_properties.append(CustomProperty(property=key, value=value)) + + if len(custom_properties) > 0: + schema_obj.customProperties = custom_properties - return odcs_table + return schema_obj def to_properties(fields: Dict[str, Field]) -> list: @@ -198,82 +200,119 @@ def to_logical_type(type: str) -> str | None: return "array" if type.lower() in ["array"]: return 
"array" + if type.lower() in ["variant"]: + return "variant" if type.lower() in ["null"]: return None return None -def to_physical_type(type: str) -> str | None: - # TODO: to we need to do a server mapping here? - return type +def to_physical_type(config: Dict[str, Any]) -> str | None: + if config is None: + return None + if "postgresType" in config: + return config["postgresType"] + elif "bigqueryType" in config: + return config["bigqueryType"] + elif "snowflakeType" in config: + return config["snowflakeType"] + elif "redshiftType" in config: + return config["redshiftType"] + elif "sqlserverType" in config: + return config["sqlserverType"] + elif "databricksType" in config: + return config["databricksType"] + elif "physicalType" in config: + return config["physicalType"] + return None -def to_property(field_name: str, field: Field) -> dict: - property = {"name": field_name} +def to_property(field_name: str, field: Field) -> SchemaProperty: + property = SchemaProperty(name=field_name) + + if field.fields: + properties = [] + for field_name_, field_ in field.fields.items(): + property_ = to_property(field_name_, field_) + properties.append(property_) + property.properties = properties + + if field.items: + items = to_property(field_name, field.items) + items.name = None # Clear the name for items + property.items = items + if field.title is not None: - property["businessName"] = field.title + property.businessName = field.title + if field.type is not None: - property["logicalType"] = to_logical_type(field.type) - property["physicalType"] = to_physical_type(field.type) + property.logicalType = to_logical_type(field.type) + property.physicalType = to_physical_type(field.config) or field.type + if field.description is not None: - property["description"] = field.description + property.description = field.description + if field.required is not None: - property["nullable"] = not field.required + property.required = field.required + if field.unique is not None: - property["unique"] = field.unique + property.unique = field.unique + if field.classification is not None: - property["classification"] = field.classification + property.classification = field.classification + if field.examples is not None: - property["examples"] = field.examples.copy() + property.examples = field.examples.copy() + if field.example is not None: - property["examples"] = [field.example] + property.examples = [field.example] + if field.primaryKey is not None and field.primaryKey: - property["primaryKey"] = field.primaryKey - property["primaryKeyPosition"] = 1 + property.primaryKey = field.primaryKey + property.primaryKeyPosition = 1 + if field.primary is not None and field.primary: - property["primaryKey"] = field.primary - property["primaryKeyPosition"] = 1 + property.primaryKey = field.primary + property.primaryKeyPosition = 1 - property["customProperties"] = [] + custom_properties = [] if field.model_extra is not None: for key, value in field.model_extra.items(): - property["customProperties"].append({"property": key, "value": value}) + custom_properties.append(CustomProperty(property=key, value=value)) + if field.pii is not None: - property["customProperties"].append({"property": "pii", "value": field.pii}) - if property.get("customProperties") is not None and len(property["customProperties"]) == 0: - del property["customProperties"] + custom_properties.append(CustomProperty(property="pii", value=field.pii)) + + if len(custom_properties) > 0: + property.customProperties = custom_properties - property["tags"] = [] - if 
field.tags is not None: - property["tags"].extend(field.tags) - if not property["tags"]: - del property["tags"] + if field.tags is not None and len(field.tags) > 0: + property.tags = field.tags - property["logicalTypeOptions"] = {} + logical_type_options = {} if field.minLength is not None: - property["logicalTypeOptions"]["minLength"] = field.minLength + logical_type_options["minLength"] = field.minLength if field.maxLength is not None: - property["logicalTypeOptions"]["maxLength"] = field.maxLength + logical_type_options["maxLength"] = field.maxLength if field.pattern is not None: - property["logicalTypeOptions"]["pattern"] = field.pattern + logical_type_options["pattern"] = field.pattern if field.minimum is not None: - property["logicalTypeOptions"]["minimum"] = field.minimum + logical_type_options["minimum"] = field.minimum if field.maximum is not None: - property["logicalTypeOptions"]["maximum"] = field.maximum + logical_type_options["maximum"] = field.maximum if field.exclusiveMinimum is not None: - property["logicalTypeOptions"]["exclusiveMinimum"] = field.exclusiveMinimum + logical_type_options["exclusiveMinimum"] = field.exclusiveMinimum if field.exclusiveMaximum is not None: - property["logicalTypeOptions"]["exclusiveMaximum"] = field.exclusiveMaximum - if property["logicalTypeOptions"] == {}: - del property["logicalTypeOptions"] + logical_type_options["exclusiveMaximum"] = field.exclusiveMaximum + + if logical_type_options: + property.logicalTypeOptions = logical_type_options if field.quality is not None: quality_list = field.quality quality_property = to_odcs_quality_list(quality_list) if len(quality_property) > 0: - property["quality"] = quality_property - - # todo enum + property.quality = quality_property return property @@ -286,33 +325,35 @@ def to_odcs_quality_list(quality_list): def to_odcs_quality(quality): - quality_dict = {"type": quality.type} + quality_obj = DataQuality(type=quality.type) + if quality.description is not None: - quality_dict["description"] = quality.description + quality_obj.description = quality.description if quality.query is not None: - quality_dict["query"] = quality.query + quality_obj.query = quality.query # dialect is not supported in v3.0.0 if quality.mustBe is not None: - quality_dict["mustBe"] = quality.mustBe + quality_obj.mustBe = quality.mustBe if quality.mustNotBe is not None: - quality_dict["mustNotBe"] = quality.mustNotBe + quality_obj.mustNotBe = quality.mustNotBe if quality.mustBeGreaterThan is not None: - quality_dict["mustBeGreaterThan"] = quality.mustBeGreaterThan + quality_obj.mustBeGreaterThan = quality.mustBeGreaterThan if quality.mustBeGreaterThanOrEqualTo is not None: - quality_dict["mustBeGreaterThanOrEqualTo"] = quality.mustBeGreaterThanOrEqualTo + quality_obj.mustBeGreaterOrEqualTo = quality.mustBeGreaterThanOrEqualTo if quality.mustBeLessThan is not None: - quality_dict["mustBeLessThan"] = quality.mustBeLessThan + quality_obj.mustBeLessThan = quality.mustBeLessThan if quality.mustBeLessThanOrEqualTo is not None: - quality_dict["mustBeLessThanOrEqualTo"] = quality.mustBeLessThanOrEqualTo + quality_obj.mustBeLessOrEqualTo = quality.mustBeLessThanOrEqualTo if quality.mustBeBetween is not None: - quality_dict["mustBeBetween"] = quality.mustBeBetween + quality_obj.mustBeBetween = quality.mustBeBetween if quality.mustNotBeBetween is not None: - quality_dict["mustNotBeBetween"] = quality.mustNotBeBetween + quality_obj.mustNotBeBetween = quality.mustNotBeBetween if quality.engine is not None: - quality_dict["engine"] = 
quality.engine + quality_obj.engine = quality.engine if quality.implementation is not None: - quality_dict["implementation"] = quality.implementation - return quality_dict + quality_obj.implementation = quality.implementation + + return quality_obj def to_status(status): diff --git a/datacontract/export/sodacl_converter.py b/datacontract/export/sodacl_converter.py index 8ecc546af..5e1fa9a03 100644 --- a/datacontract/export/sodacl_converter.py +++ b/datacontract/export/sodacl_converter.py @@ -2,12 +2,14 @@ from datacontract.engines.data_contract_checks import create_checks from datacontract.export.exporter import Exporter +from datacontract.model.data_contract_specification import DataContractSpecification, Server from datacontract.model.run import Run class SodaExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, model, server, sql_server_type, export_args) -> str: run = Run.create_run() + server = get_server(data_contract, server) run.checks.extend(create_checks(data_contract, server)) return to_sodacl_yaml(run) @@ -28,3 +30,9 @@ def to_sodacl_yaml(run: Run) -> str: else: sodacl_dict[key] = value return yaml.dump(sodacl_dict) + + +def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None: + if server_name is None: + return None + return data_contract_specification.servers.get(server_name) diff --git a/datacontract/export/spark_converter.py b/datacontract/export/spark_converter.py index be2b6cae5..0c7d86bcb 100644 --- a/datacontract/export/spark_converter.py +++ b/datacontract/export/spark_converter.py @@ -175,7 +175,7 @@ def indent(text: str, level: int) -> str: Returns: str: The indented text. """ - return "\n".join([f'{" " * level}{line}' for line in text.split("\n")]) + return "\n".join([f"{' ' * level}{line}" for line in text.split("\n")]) def repr_column(column: types.StructField) -> str: """ diff --git a/datacontract/export/sql_converter.py b/datacontract/export/sql_converter.py index 2aabe111d..9e42f56da 100644 --- a/datacontract/export/sql_converter.py +++ b/datacontract/export/sql_converter.py @@ -4,7 +4,7 @@ class SqlExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, model, server, sql_server_type, export_args) -> str: server_type = _determine_sql_server_type( data_contract, sql_server_type, @@ -13,7 +13,7 @@ def export(self, data_contract, model, server, sql_server_type, export_args) -> class SqlQueryExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, model, server, sql_server_type, export_args) -> str: model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) server_type = _determine_sql_server_type(data_contract, sql_server_type, export_args.get("server")) return to_sql_query( @@ -117,6 +117,8 @@ def _to_sql_table(model_name, model, server_type="snowflake"): result += " primary key" if server_type == "databricks" and field.description is not None: result += f' COMMENT "{_escape(field.description)}"' + if server_type == "snowflake" and field.description is not None: + result += f" COMMENT '{_escape(field.description)}'" if current_field_index < fields: result += "," result += "\n" @@ -124,6 +126,8 @@ def _to_sql_table(model_name, model, server_type="snowflake"): result += ")" if server_type == "databricks" and 
model.description is not None: result += f' COMMENT "{_escape(model.description)}"' + if server_type == "snowflake" and model.description is not None: + result += f" COMMENT='{_escape(model.description)}'" result += ";\n" return result diff --git a/datacontract/export/sql_type_converter.py b/datacontract/export/sql_type_converter.py index 7e6bb5f3f..75168e5c3 100644 --- a/datacontract/export/sql_type_converter.py +++ b/datacontract/export/sql_type_converter.py @@ -158,9 +158,13 @@ def convert_to_dataframe(field: Field) -> None | str: # databricks data types: # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html def convert_to_databricks(field: Field) -> None | str: - if field.config and "databricksType" in field.config: - return field.config["databricksType"] type = field.type + if ( + field.config + and "databricksType" in field.config + and type.lower() not in ["array", "object", "record", "struct"] + ): + return field.config["databricksType"] if type is None: return None if type.lower() in ["string", "varchar", "text"]: @@ -197,6 +201,8 @@ def convert_to_databricks(field: Field) -> None | str: if type.lower() in ["array"]: item_type = convert_to_databricks(field.items) return f"ARRAY<{item_type}>" + if type.lower() in ["variant"]: + return "VARIANT" return None diff --git a/datacontract/imports/avro_importer.py b/datacontract/imports/avro_importer.py index 5309a6ce2..e7df6048d 100644 --- a/datacontract/imports/avro_importer.py +++ b/datacontract/imports/avro_importer.py @@ -55,7 +55,7 @@ def import_avro(data_contract_specification: DataContractSpecification, source: engine="datacontract", original_exception=e, ) - # type record is being used for both the table and the object types in data contract + # type record is being used for both the table and the object types in data contract # -> CONSTRAINT: one table per .avsc input, all nested records are interpreted as objects fields = import_record_fields(avro_schema.fields) @@ -92,19 +92,19 @@ def handle_config_avro_custom_properties(field: avro.schema.Field, imported_fiel LOGICAL_TYPE_MAPPING = { - "decimal": "decimal", - "date": "date", - "time-millis": "time", - "time-micros": "time", - "timestamp-millis": "timestamp_tz", - "timestamp-micros": "timestamp_tz", - "local-timestamp-micros": "timestamp_ntz", - "local-timestamp-millis": "timestamp_ntz", - "duration": "string", - "uuid": "string", - } - - + "decimal": "decimal", + "date": "date", + "time-millis": "time", + "time-micros": "time", + "timestamp-millis": "timestamp_tz", + "timestamp-micros": "timestamp_tz", + "local-timestamp-micros": "timestamp_ntz", + "local-timestamp-millis": "timestamp_ntz", + "duration": "string", + "uuid": "string", +} + + def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]: """ Import Avro record fields and convert them to data contract fields. 
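The re-indented `LOGICAL_TYPE_MAPPING` above is applied per field in `import_record_fields` (next hunk). A minimal standalone sketch of the intended lookup, assuming plain-dict Avro type declarations rather than `avro.schema` objects, with a deliberately simplified fallback:

```
# Standalone illustration of the logicalType handling (simplified; the real importer
# works on avro.schema.Field objects and maps non-logical types via map_type_from_avro).
LOGICAL_TYPE_MAPPING = {
    "decimal": "decimal",
    "date": "date",
    "time-millis": "time",
    "time-micros": "time",
    "timestamp-millis": "timestamp_tz",
    "timestamp-micros": "timestamp_tz",
    "local-timestamp-micros": "timestamp_ntz",
    "local-timestamp-millis": "timestamp_ntz",
    "duration": "string",
    "uuid": "string",
}


def map_avro_field_type(avro_type: dict) -> str:
    """Map an Avro type declaration (as a dict) to a data contract type."""
    logical_type = avro_type.get("logicalType")
    if logical_type in LOGICAL_TYPE_MAPPING:
        return LOGICAL_TYPE_MAPPING[logical_type]
    # Simplified fallback: return the physical Avro type name as-is.
    return avro_type.get("type", "string")


if __name__ == "__main__":
    # Hypothetical Avro field types:
    print(map_avro_field_type({"type": "long", "logicalType": "timestamp-millis"}))  # timestamp_tz
    print(map_avro_field_type({"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2}))  # decimal
```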
@@ -150,15 +150,15 @@ def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Fi if not imported_field.config: imported_field.config = {} imported_field.config["avroType"] = "enum" - else: - logical_type = field.type.get_prop("logicalType") - if logical_type in LOGICAL_TYPE_MAPPING: - imported_field.type = LOGICAL_TYPE_MAPPING[logical_type] - if logical_type == "decimal": - imported_field.precision = field.type.precision - imported_field.scale = field.type.scale - else: - imported_field.type = map_type_from_avro(field.type.type) + else: + logical_type = field.type.get_prop("logicalType") + if logical_type in LOGICAL_TYPE_MAPPING: + imported_field.type = LOGICAL_TYPE_MAPPING[logical_type] + if logical_type == "decimal": + imported_field.precision = field.type.precision + imported_field.scale = field.type.scale + else: + imported_field.type = map_type_from_avro(field.type.type) imported_fields[field.name] = imported_field return imported_fields diff --git a/datacontract/imports/csv_importer.py b/datacontract/imports/csv_importer.py index 485c1130f..f58d2fc34 100644 --- a/datacontract/imports/csv_importer.py +++ b/datacontract/imports/csv_importer.py @@ -1,89 +1,143 @@ import os +from typing import Any, Dict, List -import clevercsv +import duckdb from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Example, Field, Model, Server +from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server class CsvImporter(Importer): def import_source( self, data_contract_specification: DataContractSpecification, source: str, import_args: dict ) -> DataContractSpecification: - return import_csv(data_contract_specification, self.import_format, source) + return import_csv(data_contract_specification, source) -def import_csv(data_contract_specification: DataContractSpecification, format: str, source: str): - include_example = False - - # detect encoding and dialect - encoding = clevercsv.encoding.get_encoding(source) - with open(source, "r", newline="") as fp: - dialect = clevercsv.Sniffer().sniff(fp.read(10000)) - - # using auto detecting of the format and encoding - df = clevercsv.read_dataframe(source) - - if data_contract_specification.models is None: - data_contract_specification.models = {} - +def import_csv( + data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False +) -> DataContractSpecification: # use the file name as table name table_name = os.path.splitext(os.path.basename(source))[0] + # use duckdb to auto detect format, columns, etc. 
+ con = duckdb.connect(database=":memory:") + con.sql( + f"""CREATE VIEW "{table_name}" AS SELECT * FROM read_csv_auto('{source}', hive_partitioning=1, auto_type_candidates = ['BOOLEAN', 'INTEGER', 'BIGINT', 'DOUBLE', 'VARCHAR']);""" + ) + dialect = con.sql(f"SELECT * FROM sniff_csv('{source}', sample_size = 1000);").fetchnumpy() + tbl = con.table(table_name) + if data_contract_specification.servers is None: data_contract_specification.servers = {} + delimiter = None if dialect is None else dialect["Delimiter"][0] + + if dialect is not None: + dc_types = [map_type_from_duckdb(x["type"]) for x in dialect["Columns"][0]] + else: + dc_types = [map_type_from_duckdb(str(x)) for x in tbl.dtypes] + data_contract_specification.servers["production"] = Server( - type="local", path=source, format="csv", delimiter=dialect.delimiter + type="local", path=source, format="csv", delimiter=delimiter ) + rowcount = tbl.shape[0] + + tallies = dict() + for row in tbl.describe().fetchall(): + if row[0] not in ["count", "max", "min"]: + continue + for i in range(tbl.shape[1]): + tallies[(row[0], tbl.columns[i])] = row[i + 1] if row[0] != "count" else int(row[i + 1]) + + samples: Dict[str, List] = dict() + for i in range(tbl.shape[1]): + field_name = tbl.columns[i] + if tallies[("count", field_name)] > 0 and tbl.dtypes[i] not in ["BOOLEAN", "BLOB"]: + sql = f"""SELECT DISTINCT "{field_name}" FROM "{table_name}" WHERE "{field_name}" IS NOT NULL USING SAMPLE 5 ROWS;""" + samples[field_name] = [x[0] for x in con.sql(sql).fetchall()] + + formats: Dict[str, str] = dict() + for i in range(tbl.shape[1]): + field_name = tbl.columns[i] + if tallies[("count", field_name)] > 0 and tbl.dtypes[i] == "VARCHAR": + sql = f"""SELECT + count_if("{field_name}" IS NOT NULL) as count, + count_if(regexp_matches("{field_name}", '^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{{2,4}}$')) as email, + count_if(regexp_matches("{field_name}", '^[[a-z0-9]{{8}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{12}}]')) as uuid + FROM "{table_name}"; + """ + res = con.sql(sql).fetchone() + if res[1] == res[0]: + formats[field_name] = "email" + elif res[2] == res[0]: + formats[field_name] = "uuid" + fields = {} - for column, dtype in df.dtypes.items(): - field = Field() - field.type = map_type_from_pandas(dtype.name) - fields[column] = field + for i in range(tbl.shape[1]): + field_name = tbl.columns[i] + dc_type = dc_types[i] + + ## specifying "integer" rather than "bigint" looks nicer + if ( + dc_type == "bigint" + and tallies[("max", field_name)] <= 2147483647 + and tallies[("min", field_name)] >= -2147483648 + ): + dc_type = "integer" + + field: Dict[str, Any] = {"type": dc_type, "format": formats.get(field_name, None)} + + if tallies[("count", field_name)] == rowcount: + field["required"] = True + if dc_type not in ["boolean", "bytes"]: + distinct_values = tbl.count(f'DISTINCT "{field_name}"').fetchone()[0] # type: ignore + if distinct_values > 0 and distinct_values == tallies[("count", field_name)]: + field["unique"] = True + s = samples.get(field_name, None) + if s is not None: + field["examples"] = s + if dc_type in ["integer", "bigint", "float", "double"]: + field["minimum"] = tallies[("min", field_name)] + field["maximum"] = tallies[("max", field_name)] + + fields[field_name] = field + + model_examples = None + if include_examples: + model_examples = con.sql(f"""SELECT DISTINCT * FROM "{table_name}" USING SAMPLE 5 ROWS;""").fetchall() data_contract_specification.models[table_name] = Model( - type="table", - description=f"Csv file with encoding 
{encoding}", - fields=fields, + type="table", description="Generated model of " + source, fields=fields, examples=model_examples ) - # multiline data is not correctly handled by yaml dump - if include_example: - if data_contract_specification.examples is None: - data_contract_specification.examples = [] - - # read first 10 lines with the detected encoding - with open(source, "r", encoding=encoding) as csvfile: - lines = csvfile.readlines()[:10] - - data_contract_specification.examples.append(Example(type="csv", model=table_name, data="".join(lines))) - return data_contract_specification -def map_type_from_pandas(sql_type: str): +_duck_db_types = { + "BOOLEAN": "boolean", + "BLOB": "bytes", + "TINYINT": "integer", + "SMALLINT": "integer", + "INTEGER": "integer", + "BIGINT": "bigint", + "UTINYINT": "integer", + "USMALLINT": "integer", + "UINTEGER": "integer", + "UBIGINT": "bigint", + "FLOAT": "float", + "DOUBLE": "double", + "VARCHAR": "string", + "TIMESTAMP": "timestamp", + "DATE": "date", + # TODO: Add support for NULL +} + + +def map_type_from_duckdb(sql_type: None | str): if sql_type is None: return None - sql_type_normed = sql_type.lower().strip() - - if sql_type_normed == "object": - return "string" - elif sql_type_normed.startswith("str"): - return "string" - elif sql_type_normed.startswith("int"): - return "integer" - elif sql_type_normed.startswith("float"): - return "float" - elif sql_type_normed.startswith("bool"): - return "boolean" - elif sql_type_normed.startswith("timestamp"): - return "timestamp" - elif sql_type_normed == "datetime64": - return "date" - elif sql_type_normed == "timedelta[ns]": - return "timestamp_ntz" - else: - return "variant" + sql_type_normed = sql_type.upper().strip() + return _duck_db_types.get(sql_type_normed, "string") diff --git a/datacontract/imports/excel_importer.py b/datacontract/imports/excel_importer.py new file mode 100644 index 000000000..49f7379f1 --- /dev/null +++ b/datacontract/imports/excel_importer.py @@ -0,0 +1,855 @@ +import logging +import os +from typing import Any, Dict, List, Optional + +import openpyxl +from open_data_contract_standard.model import ( + AuthoritativeDefinition, + CustomProperty, + DataQuality, + OpenDataContractStandard, + Role, + SchemaObject, + SchemaProperty, + Server, + ServiceLevelAgreementProperty, + Support, + Team, +) +from openpyxl.cell.cell import Cell +from openpyxl.workbook.workbook import Workbook +from openpyxl.worksheet.worksheet import Worksheet + +from datacontract.imports.importer import Importer +from datacontract.model.data_contract_specification import ( + DataContractSpecification, +) +from datacontract.model.exceptions import DataContractException + +logger = logging.getLogger(__name__) + + +class ExcelImporter(Importer): + def import_source( + self, + data_contract_specification: DataContractSpecification | OpenDataContractStandard, + source: str, + import_args: dict, + ) -> DataContractSpecification | OpenDataContractStandard: + return import_excel_as_odcs(source) + + +def import_excel_as_odcs(excel_file_path: str) -> OpenDataContractStandard: + """ + Import an Excel file and convert it to an OpenDataContractStandard object + + Args: + excel_file_path: Path to the Excel file + + Returns: + OpenDataContractStandard object + """ + if not os.path.exists(excel_file_path): + raise FileNotFoundError(f"Excel file not found: {excel_file_path}") + + try: + workbook = openpyxl.load_workbook(excel_file_path, data_only=True) + except Exception as e: + raise DataContractException( + type="schema", + 
name="Parse excel contract", + reason=f"Failed to open Excel file: {excel_file_path}", + engine="datacontract", + original_exception=e, + ) + + try: + # Get description values + purpose = get_cell_value_by_name(workbook, "description.purpose") + limitations = get_cell_value_by_name(workbook, "description.limitations") + usage = get_cell_value_by_name(workbook, "description.usage") + + # Build description dict + description = None + if purpose or limitations or usage: + description = {"purpose": purpose, "limitations": limitations, "usage": usage} + + # Get tags as a list + tags_str = get_cell_value_by_name(workbook, "tags") + tags = None + if tags_str: + tags = [tag.strip() for tag in tags_str.split(",") if tag.strip()] + + # Import other components + schemas = import_schemas(workbook) + support = import_support(workbook) + team = import_team(workbook) + roles = import_roles(workbook) + sla_properties = import_sla_properties(workbook) + servers = import_servers(workbook) + price = import_price(workbook) + custom_properties = import_custom_properties(workbook) + + # Create the ODCS object with proper object creation + odcs = OpenDataContractStandard( + apiVersion=get_cell_value_by_name(workbook, "apiVersion"), + kind=get_cell_value_by_name(workbook, "kind"), + id=get_cell_value_by_name(workbook, "id"), + name=get_cell_value_by_name(workbook, "name"), + version=get_cell_value_by_name(workbook, "version"), + status=get_cell_value_by_name(workbook, "status"), + domain=get_cell_value_by_name(workbook, "domain"), + dataProduct=get_cell_value_by_name(workbook, "dataProduct"), + tenant=get_cell_value_by_name(workbook, "tenant"), + description=description, + tags=tags, + schema=schemas, + support=support, + price=price, + team=team, + roles=roles, + slaDefaultElement=get_cell_value_by_name(workbook, "slaDefaultElement"), + slaProperties=sla_properties, + servers=servers, + customProperties=custom_properties, + ) + + return odcs + except Exception as e: + logger.error(f"Error importing Excel file: {str(e)}") + raise DataContractException( + type="schema", + name="Parse excel contract", + reason=f"Failed to parse Excel file: {excel_file_path}", + engine="datacontract", + original_exception=e, + ) + finally: + workbook.close() + + +def import_schemas(workbook) -> Optional[List[SchemaObject]]: + """Extract schema information from sheets starting with 'Schema '""" + schemas = [] + + for sheet_name in workbook.sheetnames: + if sheet_name.startswith("Schema ") and sheet_name != "Schema ": + sheet = workbook[sheet_name] + schema_name = get_cell_value_by_name_in_sheet(sheet, "schema.name") + + if not schema_name: + continue + + schema = SchemaObject( + name=schema_name, + logicalType="object", + physicalType=get_cell_value_by_name_in_sheet(sheet, "schema.physicalType"), + physicalName=get_cell_value_by_name_in_sheet(sheet, "schema.physicalName"), + description=get_cell_value_by_name_in_sheet(sheet, "schema.description"), + businessName=get_cell_value_by_name_in_sheet(sheet, "schema.businessName"), + dataGranularityDescription=get_cell_value_by_name_in_sheet(sheet, "schema.dataGranularityDescription"), + authoritativeDefinitions=None, + properties=import_properties(sheet), + quality=None, + customProperties=None, + tags=None, + ) + + # Get tags + tags_str = get_cell_value_by_name_in_sheet(sheet, "schema.tags") + if tags_str: + schema.tags = [tag.strip() for tag in tags_str.split(",") if tag.strip()] + + schemas.append(schema) + + return schemas if schemas else None + + +def import_properties(sheet) -> 
Optional[List[SchemaProperty]]: + """Extract properties from the schema sheet""" + try: + # Find the properties table + properties_range = get_range_by_name_in_sheet(sheet, "schema.properties") + if not properties_range: + return None + + # Get header row to map column names to indices + header_row = list(sheet.rows)[properties_range[0] - 1] # Convert to 0-based indexing + headers = {} + for i, cell in enumerate(header_row): + if cell.value: + headers[cell.value.lower()] = i + + # Process property rows + property_lookup = {} # Dictionary to keep track of properties by name for nesting + + # First, create all properties + for row_idx in range(properties_range[0], properties_range[1]): + if len(list(sheet.rows)) < row_idx + 1: + break + row = list(sheet.rows)[row_idx] + + # Skip empty rows or header row + property_name = get_cell_value(row, headers.get("property")) + if not property_name or row_idx == properties_range[0] - 1: + continue + + # Create property object + property_obj = SchemaProperty( + name=property_name, + logicalType=get_cell_value(row, headers.get("logical type")), + logicalTypeOptions=import_logical_type_options(row, headers), + physicalType=get_cell_value(row, headers.get("physical type")), + physicalName=get_cell_value(row, headers.get("physical name")), + description=get_cell_value(row, headers.get("description")), + businessName=get_cell_value(row, headers.get("business name")), + required=parse_boolean(get_cell_value(row, headers.get("required"))), + unique=parse_boolean(get_cell_value(row, headers.get("unique"))), + primaryKey=parse_boolean(get_cell_value(row, headers.get("primary key"))), + primaryKeyPosition=parse_integer(get_cell_value(row, headers.get("primary key position"))), + partitioned=parse_boolean(get_cell_value(row, headers.get("partitioned"))), + partitionKeyPosition=parse_integer(get_cell_value(row, headers.get("partition key position"))), + criticalDataElement=parse_boolean(get_cell_value(row, headers.get("critical data element status"))), + classification=get_cell_value(row, headers.get("classification")), + transformLogic=get_cell_value(row, headers.get("transform logic")), + transformDescription=get_cell_value(row, headers.get("transform description")), + encryptedName=get_cell_value(row, headers.get("encrypted name")), + properties=None, + items=None, + tags=get_property_tags(headers, row), + ) + + # Authoritative definitions + authoritative_definition_url = get_cell_value(row, headers.get("authoritative definition url")) + authoritative_definition_type = get_cell_value(row, headers.get("authoritative definition type")) + if authoritative_definition_url and authoritative_definition_type: + property_obj.authoritativeDefinitions = [ + AuthoritativeDefinition( + url=authoritative_definition_url, + type=authoritative_definition_type, + ) + ] + + # Quality + quality_type = get_cell_value(row, headers.get("quality type")) + quality_description = get_cell_value(row, headers.get("quality description")) + if quality_type and quality_description: + property_obj.quality = [ + DataQuality( + type=quality_type, + description=quality_description, + ) + ] + + # Transform sources + transform_sources = get_cell_value(row, headers.get("transform sources")) + if transform_sources: + property_obj.transformSourceObjects = [ + src.strip() for src in transform_sources.split(",") if src.strip() + ] + + # Examples + examples = get_cell_value(row, headers.get("example(s)")) + if examples: + property_obj.examples = [ex.strip() for ex in examples.split(",") if ex.strip()] + + 
# Add to lookup dictionary + property_lookup[property_name] = property_obj + + # Now organize nested properties + root_properties = [] + for name, prop in property_lookup.items(): + if "." in name: + # This is a nested property + parent_name = name.rsplit(".", 1)[0] + child_name = name.rsplit(".", 1)[1] + + if parent_name in property_lookup: + parent_prop = property_lookup[parent_name] + # Update the property name to be just the child part + prop.name = child_name + + # If parent is an array, set as items + if parent_prop.logicalType == "array": + parent_prop.items = prop + else: + # Otherwise add to properties list + if parent_prop.properties is None: + parent_prop.properties = [] + parent_prop.properties.append(prop) + else: + # This is a root property + root_properties.append(prop) + + return root_properties if root_properties else None + except Exception as e: + logger.warning(f"Error importing properties: {str(e)}") + return None + + +def import_logical_type_options(row, headers): + """Import logical type options from property row""" + + required_props = get_cell_value(row, headers.get("required properties")) + + required_props_list = None + if required_props: + required_props_list = [prop.strip() for prop in required_props.split(",") if prop.strip()] + + logical_type_options_dict = { + "minLength": parse_integer(get_cell_value(row, headers.get("minimum length"))), + "maxLength": parse_integer(get_cell_value(row, headers.get("maximum length"))), + "pattern": get_cell_value(row, headers.get("pattern")), + "format": get_cell_value(row, headers.get("format")), + "exclusiveMaximum": parse_boolean(get_cell_value(row, headers.get("exclusive maximum"))), + "exclusiveMinimum": parse_boolean(get_cell_value(row, headers.get("exclusive minimum"))), + "minimum": get_cell_value(row, headers.get("minimum")), + "maximum": get_cell_value(row, headers.get("maximum")), + "multipleOf": get_cell_value(row, headers.get("multiple of")), + "minItems": parse_integer(get_cell_value(row, headers.get("minimum items"))), + "maxItems": parse_integer(get_cell_value(row, headers.get("maximum items"))), + "uniqueItems": parse_boolean(get_cell_value(row, headers.get("unique items"))), + "maxProperties": parse_integer(get_cell_value(row, headers.get("maximum properties"))), + "minProperties": parse_integer(get_cell_value(row, headers.get("minimum properties"))), + "required": required_props_list, + } + + for dict_key in list(logical_type_options_dict.keys()): + if logical_type_options_dict[dict_key] is None: + del logical_type_options_dict[dict_key] + + if len(logical_type_options_dict) == 0: + return None + return logical_type_options_dict + + +def get_property_tags(headers, row): + tags_value = get_cell_value(row, headers.get("tags")) + if tags_value: + return [tag.strip() for tag in tags_value.split(",") if tag.strip()] + return None + + +def parse_boolean(value): + """Parse a string value to boolean""" + if value is None: + return None + value = value.lower().strip() + return value == "true" or value == "yes" or value == "1" + + +def parse_integer(value): + """Parse a string value to integer""" + if value is None: + return None + try: + return int(value) + except (ValueError, TypeError): + return None + + +def get_range_by_name_in_workbook(workbook: Workbook, name: str) -> tuple | None: + """Find the range (start_row, end_row) of a named range in a workbook""" + try: + for named_range in workbook.defined_names: + if named_range == name: + destinations = workbook.defined_names[named_range].destinations + for 
sheet_title, range_address in destinations: + if ":" in range_address: + # Convert Excel range to row numbers + start_ref, end_ref = range_address.split(":") + start_row = int("".join(filter(str.isdigit, start_ref))) + end_row = int("".join(filter(str.isdigit, end_ref))) + return start_row, end_row + else: + # Single cell + row = int("".join(filter(str.isdigit, range_address))) + return row, row + except Exception as e: + logger.warning(f"Error finding range by name {name}: {str(e)}") + return None + + +def get_range_by_name_in_sheet(sheet: Worksheet, name: str) -> tuple | None: + """Find the range (start_row, end_row) of a named range in a sheet""" + try: + for named_range in sheet.defined_names: + if named_range == name: + destinations = sheet.defined_names[named_range].destinations + for sheet_title, range_address in destinations: + if sheet_title == sheet.title: + # For named ranges that refer to entire rows or multiple rows + if ":" in range_address: + # Convert Excel range to row numbers + start_ref, end_ref = range_address.split(":") + start_row = int("".join(filter(str.isdigit, start_ref))) + end_row = int("".join(filter(str.isdigit, end_ref))) + return (start_row, end_row) + else: + # Single cell + row = int("".join(filter(str.isdigit, range_address))) + return (row, row) + except Exception as e: + logger.warning(f"Error finding range by name {name}: {str(e)}") + return None + + +def get_cell_by_name_in_workbook(workbook: Workbook, name: str) -> Cell | None: + """Find a cell by name within a workbook""" + try: + for named_range in workbook.defined_names: + if named_range == name: + destinations = workbook.defined_names[named_range].destinations + for sheet_title, coordinate in destinations: + sheet = workbook[sheet_title] + if sheet_title == sheet.title: + return sheet[coordinate] + except Exception as e: + logger.warning(f"Error finding cell by name {name}: {str(e)}") + return None + + +def get_cell_value_by_name(workbook: Workbook, name: str) -> str | None: + """Get the value of a named cell""" + try: + cell = get_cell_by_name_in_workbook(workbook, name) + if cell.value is not None: + return str(cell.value) + except Exception as e: + logger.warning(f"Error getting cell value by name {name}: {str(e)}") + return None + + +def get_cell_value_by_name_in_sheet(sheet: Worksheet, name: str) -> str | None: + """Get the value of a named cell within a specific sheet""" + try: + for named_range in sheet.defined_names: + if named_range == name: + destinations = sheet.defined_names[named_range].destinations + for sheet_title, coordinate in destinations: + if sheet_title == sheet.title: + cell = sheet[coordinate] + if cell.value is not None: + return str(cell.value) + except Exception as e: + logger.warning(f"Error getting cell value by name {name} in sheet {sheet.title}: {str(e)}") + return None + + +def get_cell_value(row, col_idx): + """Safely get cell value from a row by column index""" + if col_idx is None: + return None + try: + cell = row[col_idx] + return str(cell.value) if cell.value is not None else None + except (IndexError, AttributeError): + return None + + +def get_cell_value_by_position(sheet, row_idx, col_idx): + """Get cell value by row and column indices (0-based)""" + try: + cell = sheet.cell(row=row_idx + 1, column=col_idx + 1) # Convert to 1-based indices + return str(cell.value) if cell.value is not None else None + except Exception as e: + logger.warning(f"Error getting cell value by position ({row_idx}, {col_idx}): {str(e)}") + return None + + +def 
import_support(workbook: Workbook) -> Optional[List[Support]]: + """Extract support information from the Support sheet""" + try: + support_sheet = workbook["Support"] + if not support_sheet: + return None + + support_range = get_range_by_name_in_workbook(workbook, "support") + if not support_range: + return None + + header_row = list(support_sheet.rows)[support_range[0] - 1] + headers = {} + for i, cell in enumerate(header_row): + if cell.value: + headers[cell.value.lower()] = i + + support_channels = [] + for row_idx in range(support_range[0], support_range[1]): + if len(list(support_sheet.rows)) < row_idx + 1: + break + row = list(support_sheet.rows)[row_idx] + + channel = get_cell_value(row, headers.get("channel")) + if not channel or row_idx == support_range[0] - 1: + continue + + support_channel = Support( + channel=channel, + url=get_cell_value(row, headers.get("channel url")), + description=get_cell_value(row, headers.get("description")), + tool=get_cell_value(row, headers.get("tool")), + scope=get_cell_value(row, headers.get("scope")), + invitationUrl=get_cell_value(row, headers.get("invitation url")), + ) + + support_channels.append(support_channel) + except Exception as e: + logger.warning(f"Error importing support: {str(e)}") + return None + + return support_channels if support_channels else None + + +def import_team(workbook: Workbook) -> Optional[List[Team]]: + """Extract team information from the Team sheet""" + try: + team_sheet = workbook["Team"] + if not team_sheet: + return None + + team_range = get_range_by_name_in_workbook(workbook, "team") + if not team_range: + return None + + header_row = list(team_sheet.rows)[team_range[0] - 1] + headers = {} + for i, cell in enumerate(header_row): + if cell.value: + headers[cell.value.lower()] = i + + team_members = [] + for row_idx in range(team_range[0], team_range[1]): + if len(list(team_sheet.rows)) < row_idx + 1: + break + row = list(team_sheet.rows)[row_idx] + + username = get_cell_value(row, headers.get("username")) + name = get_cell_value(row, headers.get("name")) + role = get_cell_value(row, headers.get("role")) + + if (not (username or name or role)) or row_idx == team_range[0] - 1: + continue + + team_member = Team( + username=username, + name=name, + description=get_cell_value(row, headers.get("description")), + role=role, + dateIn=get_cell_value(row, headers.get("date in")), + dateOut=get_cell_value(row, headers.get("date out")), + replacedByUsername=get_cell_value(row, headers.get("replaced by username")), + ) + + team_members.append(team_member) + except Exception as e: + logger.warning(f"Error importing team: {str(e)}") + return None + + return team_members if team_members else None + + +def import_roles(workbook: Workbook) -> Optional[List[Role]]: + """Extract roles information from the Roles sheet""" + try: + roles_sheet = workbook["Roles"] + if not roles_sheet: + return None + + roles_range = get_range_by_name_in_sheet(roles_sheet, "roles") + if not roles_range: + return None + + header_row = list(roles_sheet.rows)[roles_range[0] - 1] + headers = {} + for i, cell in enumerate(header_row): + if cell.value: + headers[cell.value.lower()] = i + + roles_list = [] + for row_idx in range(roles_range[0], roles_range[1]): + if len(list(roles_sheet.rows)) < row_idx + 1: + break + row = list(roles_sheet.rows)[row_idx] + + role_name = get_cell_value(row, headers.get("role")) + if not role_name or row_idx == roles_range[0] - 1: + continue + + role = Role( + role=role_name, + description=get_cell_value(row, 
headers.get("description")), + access=get_cell_value(row, headers.get("access")), + firstLevelApprovers=get_cell_value(row, headers.get("1st level approvers")), + secondLevelApprovers=get_cell_value(row, headers.get("2nd level approvers")), + customProperties=None, + ) + + roles_list.append(role) + except Exception as e: + logger.warning(f"Error importing roles: {str(e)}") + return None + + return roles_list if roles_list else None + + +def import_sla_properties(workbook: Workbook) -> Optional[List[ServiceLevelAgreementProperty]]: + """Extract SLA properties from the SLA sheet""" + try: + sla_sheet = workbook["SLA"] + if not sla_sheet: + return None + + sla_range = get_range_by_name_in_sheet(sla_sheet, "slaProperties") + if not sla_range: + return None + + header_row = list(sla_sheet.rows)[sla_range[0] - 1] + headers = {} + for i, cell in enumerate(header_row): + if cell.value: + headers[cell.value.lower()] = i + + sla_properties = [] + for row_idx in range(sla_range[0], sla_range[1]): + if len(list(sla_sheet.rows)) < row_idx + 1: + break + row = list(sla_sheet.rows)[row_idx] + + property_name = get_cell_value(row, headers.get("property")) + if not property_name or row_idx == sla_range[0] - 1: + continue + + sla_property = ServiceLevelAgreementProperty( + property=property_name, + value=get_cell_value(row, headers.get("value")), + valueExt=get_cell_value(row, headers.get("extended value")), + unit=get_cell_value(row, headers.get("unit")), + element=get_cell_value(row, headers.get("element")), + driver=get_cell_value(row, headers.get("driver")), + ) + + sla_properties.append(sla_property) + except Exception as e: + logger.warning(f"Error importing SLA properties: {str(e)}") + return None + + return sla_properties if sla_properties else None + + +def import_servers(workbook) -> Optional[List[Server]]: + """Extract server information from the Servers sheet""" + try: + sheet = workbook["Servers"] + if not sheet: + return None + + # Find the server cells + server_cell = get_cell_by_name_in_workbook(workbook, "servers.server") + if not server_cell: + return None + + # Get servers (horizontally arranged in the sheet) + servers = [] + col_idx = server_cell.column - 1 # 0-based index + row_idx = server_cell.row - 1 # 0-based index + + index = 0 + while True: + server_name = get_cell_value_by_position(sheet, row_idx, col_idx + index) + if not server_name: + break + + server = Server( + server=server_name, + description=get_server_cell_value(workbook, sheet, "servers.description", index), + environment=get_server_cell_value(workbook, sheet, "servers.environment", index), + type=get_server_cell_value(workbook, sheet, "servers.type", index), + ) + + # Get type-specific fields + server_type = server.type + if server_type: + if server_type == "azure": + server.location = get_server_cell_value(workbook, sheet, "servers.azure.location", index) + server.format = get_server_cell_value(workbook, sheet, "servers.azure.format", index) + server.delimiter = get_server_cell_value(workbook, sheet, "servers.azure.delimiter", index) + elif server_type == "bigquery": + server.project = get_server_cell_value(workbook, sheet, "servers.bigquery.project", index) + server.dataset = get_server_cell_value(workbook, sheet, "servers.bigquery.dataset", index) + elif server_type == "databricks": + server.catalog = get_server_cell_value(workbook, sheet, "servers.databricks.catalog", index) + server.host = get_server_cell_value(workbook, sheet, "servers.databricks.host", index) + server.schema = get_server_cell_value(workbook, 
sheet, "servers.databricks.schema", index) + elif server_type == "glue": + server.account = get_server_cell_value(workbook, sheet, "servers.glue.account", index) + server.database = get_server_cell_value(workbook, sheet, "servers.glue.database", index) + server.format = get_server_cell_value(workbook, sheet, "servers.glue.format", index) + server.location = get_server_cell_value(workbook, sheet, "servers.glue.location", index) + elif server_type == "kafka": + server.format = get_server_cell_value(workbook, sheet, "servers.kafka.format", index) + server.host = get_server_cell_value(workbook, sheet, "servers.kafka.host", index) + server.topic = get_server_cell_value(workbook, sheet, "servers.kafka.topic", index) + elif server_type == "postgres": + server.database = get_server_cell_value(workbook, sheet, "servers.postgres.database", index) + server.host = get_server_cell_value(workbook, sheet, "servers.postgres.host", index) + server.port = get_server_cell_value(workbook, sheet, "servers.postgres.port", index) + server.schema = get_server_cell_value(workbook, sheet, "servers.postgres.schema", index) + elif server_type == "s3": + server.delimiter = get_server_cell_value(workbook, sheet, "servers.s3.delimiter", index) + server.endpointUrl = get_server_cell_value(workbook, sheet, "servers.s3.endpointUrl", index) + server.format = get_server_cell_value(workbook, sheet, "servers.s3.format", index) + server.location = get_server_cell_value(workbook, sheet, "servers.s3.location", index) + elif server_type == "snowflake": + server.account = get_server_cell_value(workbook, sheet, "servers.snowflake.account", index) + server.database = get_server_cell_value(workbook, sheet, "servers.snowflake.database", index) + server.host = get_server_cell_value(workbook, sheet, "servers.snowflake.host", index) + server.port = get_server_cell_value(workbook, sheet, "servers.snowflake.port", index) + server.schema = get_server_cell_value(workbook, sheet, "servers.snowflake.schema", index) + server.warehouse = get_server_cell_value(workbook, sheet, "servers.snowflake.warehouse", index) + elif server_type == "sqlserver": + server.database = get_server_cell_value(workbook, sheet, "servers.sqlserver.database", index) + server.host = get_server_cell_value(workbook, sheet, "servers.sqlserver.host", index) + server.port = get_server_cell_value(workbook, sheet, "servers.sqlserver.port", index) + server.schema = get_server_cell_value(workbook, sheet, "servers.sqlserver.schema", index) + else: + # Custom server type - grab all possible fields + server.account = get_server_cell_value(workbook, sheet, "servers.custom.account", index) + server.catalog = get_server_cell_value(workbook, sheet, "servers.custom.catalog", index) + server.database = get_server_cell_value(workbook, sheet, "servers.custom.database", index) + server.dataset = get_server_cell_value(workbook, sheet, "servers.custom.dataset", index) + server.delimiter = get_server_cell_value(workbook, sheet, "servers.custom.delimiter", index) + server.endpointUrl = get_server_cell_value(workbook, sheet, "servers.custom.endpointUrl", index) + server.format = get_server_cell_value(workbook, sheet, "servers.custom.format", index) + server.host = get_server_cell_value(workbook, sheet, "servers.custom.host", index) + server.location = get_server_cell_value(workbook, sheet, "servers.custom.location", index) + server.path = get_server_cell_value(workbook, sheet, "servers.custom.path", index) + server.port = get_server_cell_value(workbook, sheet, "servers.custom.port", index) + 
server.project = get_server_cell_value(workbook, sheet, "servers.custom.project", index) + server.schema = get_server_cell_value(workbook, sheet, "servers.custom.schema", index) + server.stagingDir = get_server_cell_value(workbook, sheet, "servers.custom.stagingDir", index) + server.table = get_server_cell_value(workbook, sheet, "servers.custom.table", index) + server.view = get_server_cell_value(workbook, sheet, "servers.custom.view", index) + server.warehouse = get_server_cell_value(workbook, sheet, "servers.custom.warehouse", index) + server.region = get_server_cell_value(workbook, sheet, "servers.custom.region", index) + server.regionName = get_server_cell_value(workbook, sheet, "servers.custom.regionName", index) + server.serviceName = get_server_cell_value(workbook, sheet, "servers.custom.serviceName", index) + + servers.append(server) + index += 1 + except Exception as e: + logger.warning(f"Error importing servers: {str(e)}") + return None + + return servers if servers else None + + +def get_server_cell_value(workbook: Workbook, sheet: Worksheet, name: str, col_offset: int): + """Get cell value for server properties (arranged horizontally)""" + try: + cell = get_cell_by_name_in_workbook(workbook, name) + if not cell: + return None + + row = cell.row - 1 # 0-based + col = cell.column - 1 + col_offset # 0-based + return get_cell_value_by_position(sheet, row, col) + except Exception as e: + logger.warning(f"Error getting server cell value for {name}: {str(e)}") + return None + + +def import_price(workbook) -> Optional[Dict[str, Any]]: + """Extract price information""" + try: + price_amount = get_cell_value_by_name(workbook, "price.priceAmount") + price_currency = get_cell_value_by_name(workbook, "price.priceCurrency") + price_unit = get_cell_value_by_name(workbook, "price.priceUnit") + + if not (price_amount or price_currency or price_unit): + return None + + # Create a dictionary for price since the class doesn't seem to be directly available + return { + "priceAmount": price_amount, + "priceCurrency": price_currency, + "priceUnit": price_unit, + } + except Exception as e: + logger.warning(f"Error importing price: {str(e)}") + return None + + +def import_custom_properties(workbook: Workbook) -> List[CustomProperty]: + """Extract custom properties""" + custom_properties = [] + + owner = get_cell_value_by_name(workbook, "owner") + + # Add owner as a custom property + if owner: + custom_properties.append( + CustomProperty( + property="owner", + value=owner, + ) + ) + + try: + # Get other custom properties + custom_properties_sheet = workbook["Custom Properties"] + if custom_properties_sheet: + custom_properties_range = get_range_by_name_in_workbook(workbook, "CustomProperties") + if custom_properties_range: + # Skip header row + for row_idx in range(custom_properties_range[0], custom_properties_range[1]): + if row_idx == custom_properties_range[0] - 1: + continue + + property_name = get_cell_value_by_position(custom_properties_sheet, row_idx, 0) + if not property_name or property_name == "owner": + continue + + property_value = get_cell_value_by_position(custom_properties_sheet, row_idx, 1) + parsed_value = parse_property_value(property_value) + + custom_properties.append( + CustomProperty( + property=property_name, + value=parsed_value, + ) + ) + except Exception as e: + logger.warning(f"Error importing custom properties: {str(e)}") + + return custom_properties + + +def parse_property_value(value: str) -> Any: + """Parse a property value into the appropriate type based on Excel 
values""" + if value is None: + return None + + # Try to convert to boolean (simple case) + if isinstance(value, str): + value_lower = value.lower().strip() + if value_lower == "true": + return True + if value_lower == "false": + return False + + # Try numeric conversions + try: + # Check if it's an integer + if isinstance(value, str) and value.isdigit(): + return int(value) + + # Try float conversion + float_val = float(value) + # If it's a whole number, return as int + if float_val.is_integer(): + return int(float_val) + return float_val + except (ValueError, TypeError, AttributeError): + # If conversion fails, return original string + return value diff --git a/datacontract/imports/importer.py b/datacontract/imports/importer.py index c3056a03d..0dd033608 100644 --- a/datacontract/imports/importer.py +++ b/datacontract/imports/importer.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod from enum import Enum -from datacontract.model.data_contract_specification import DataContractSpecification +from datacontract_specification.model import DataContractSpecification +from open_data_contract_standard.model import OpenDataContractStandard class Importer(ABC): @@ -11,10 +12,10 @@ def __init__(self, import_format) -> None: @abstractmethod def import_source( self, - data_contract_specification: DataContractSpecification, + data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str, import_args: dict, - ) -> DataContractSpecification: + ) -> DataContractSpecification | OpenDataContractStandard: pass @@ -33,7 +34,17 @@ class ImportFormat(str, Enum): parquet = "parquet" csv = "csv" protobuf = "protobuf" + excel = "excel" @classmethod def get_supported_formats(cls): return list(map(lambda c: c.value, cls)) + + +class Spec(str, Enum): + datacontract_specification = "datacontract_specification" + odcs = "odcs" + + @classmethod + def get_supported_types(cls): + return list(map(lambda c: c.value, cls)) diff --git a/datacontract/imports/importer_factory.py b/datacontract/imports/importer_factory.py index 8df55721d..c7c107440 100644 --- a/datacontract/imports/importer_factory.py +++ b/datacontract/imports/importer_factory.py @@ -114,3 +114,8 @@ def load_module_class(module_path, class_name): module_path="datacontract.imports.protobuf_importer", class_name="ProtoBufImporter", ) +importer_factory.register_lazy_importer( + name=ImportFormat.excel, + module_path="datacontract.imports.excel_importer", + class_name="ExcelImporter", +) diff --git a/datacontract/imports/odcs_importer.py b/datacontract/imports/odcs_importer.py index f189ef74e..3e40ce090 100644 --- a/datacontract/imports/odcs_importer.py +++ b/datacontract/imports/odcs_importer.py @@ -48,9 +48,9 @@ def import_odcs(data_contract_specification: DataContractSpecification, source: engine="datacontract", ) elif odcs_api_version.startswith("v3."): - from datacontract.imports.odcs_v3_importer import import_odcs_v3 + from datacontract.imports.odcs_v3_importer import import_odcs_v3_as_dcs - return import_odcs_v3(data_contract_specification, source) + return import_odcs_v3_as_dcs(data_contract_specification, source) else: raise DataContractException( type="schema", diff --git a/datacontract/imports/odcs_v3_importer.py b/datacontract/imports/odcs_v3_importer.py index 226d9d0f4..a1420d48a 100644 --- a/datacontract/imports/odcs_v3_importer.py +++ b/datacontract/imports/odcs_v3_importer.py @@ -1,9 +1,11 @@ import datetime import logging +import re from typing import Any, Dict, List from venv import logger -import yaml +from 
datacontract_specification.model import Quality +from open_data_contract_standard.model import CustomProperty, OpenDataContractStandard, SchemaProperty from datacontract.imports.importer import Importer from datacontract.lint.resources import read_resource @@ -14,7 +16,6 @@ Field, Info, Model, - Quality, Retention, Server, ServerRole, @@ -28,19 +29,20 @@ class OdcsImporter(Importer): def import_source( self, data_contract_specification: DataContractSpecification, source: str, import_args: dict ) -> DataContractSpecification: - return import_odcs_v3(data_contract_specification, source) + return import_odcs_v3_as_dcs(data_contract_specification, source) -def import_odcs_v3(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification: +def import_odcs_v3_as_dcs( + data_contract_specification: DataContractSpecification, source: str +) -> DataContractSpecification: source_str = read_resource(source) - return import_odcs_v3_from_str(data_contract_specification, source_str) + odcs = parse_odcs_v3_from_str(source_str) + return import_from_odcs(data_contract_specification, odcs) -def import_odcs_v3_from_str( - data_contract_specification: DataContractSpecification, source_str: str -) -> DataContractSpecification: +def parse_odcs_v3_from_str(source_str): try: - odcs_contract = yaml.safe_load(source_str) + odcs = OpenDataContractStandard.from_string(source_str) except Exception as e: raise DataContractException( type="schema", @@ -49,42 +51,44 @@ def import_odcs_v3_from_str( engine="datacontract", original_exception=e, ) + return odcs - data_contract_specification.id = odcs_contract["id"] - data_contract_specification.info = import_info(odcs_contract) - data_contract_specification.servers = import_servers(odcs_contract) - data_contract_specification.terms = import_terms(odcs_contract) - data_contract_specification.servicelevels = import_servicelevels(odcs_contract) - data_contract_specification.models = import_models(odcs_contract) - data_contract_specification.tags = import_tags(odcs_contract) +def import_from_odcs(data_contract_specification: DataContractSpecification, odcs: OpenDataContractStandard): + data_contract_specification.id = odcs.id + data_contract_specification.info = import_info(odcs) + data_contract_specification.servers = import_servers(odcs) + data_contract_specification.terms = import_terms(odcs) + data_contract_specification.servicelevels = import_servicelevels(odcs) + data_contract_specification.models = import_models(odcs) + data_contract_specification.tags = import_tags(odcs) return data_contract_specification -def import_info(odcs_contract: Dict[str, Any]) -> Info: +def import_info(odcs: Any) -> Info: info = Info() - info.title = odcs_contract.get("name") if odcs_contract.get("name") is not None else "" + info.title = odcs.name if odcs.name is not None else "" - if odcs_contract.get("version") is not None: - info.version = odcs_contract.get("version") + if odcs.version is not None: + info.version = odcs.version # odcs.description.purpose => datacontract.description - if odcs_contract.get("description") is not None and odcs_contract.get("description").get("purpose") is not None: - info.description = odcs_contract.get("description").get("purpose") + if odcs.description is not None and odcs.description.purpose is not None: + info.description = odcs.description.purpose # odcs.domain => datacontract.owner - owner = get_owner(odcs_contract.get("customProperties")) + owner = get_owner(odcs.customProperties) if owner is not None: info.owner = owner 
# add dataProduct as custom property - if odcs_contract.get("dataProduct") is not None: - info.dataProduct = odcs_contract.get("dataProduct") + if odcs.dataProduct is not None: + info.dataProduct = odcs.dataProduct # add tenant as custom property - if odcs_contract.get("tenant") is not None: - info.tenant = odcs_contract.get("tenant") + if odcs.tenant is not None: + info.tenant = odcs.tenant return info @@ -95,96 +99,92 @@ def import_server_roles(roles: List[Dict]) -> List[ServerRole] | None: result = [] for role in roles: server_role = ServerRole() - server_role.name = role.get("role") - server_role.description = role.get("description") + server_role.name = role.role + server_role.description = role.description result.append(server_role) -def import_servers(odcs_contract: Dict[str, Any]) -> Dict[str, Server] | None: - if odcs_contract.get("servers") is None: +def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None: + if odcs.servers is None: return None servers = {} - for odcs_server in odcs_contract.get("servers"): - server_name = odcs_server.get("server") + for odcs_server in odcs.servers: + server_name = odcs_server.server if server_name is None: logger.warning("Server name is missing, skipping server") continue server = Server() - server.type = odcs_server.get("type") - server.description = odcs_server.get("description") - server.environment = odcs_server.get("environment") - server.format = odcs_server.get("format") - server.project = odcs_server.get("project") - server.dataset = odcs_server.get("dataset") - server.path = odcs_server.get("path") - server.delimiter = odcs_server.get("delimiter") - server.endpointUrl = odcs_server.get("endpointUrl") - server.location = odcs_server.get("location") - server.account = odcs_server.get("account") - server.database = odcs_server.get("database") - server.schema_ = odcs_server.get("schema") - server.host = odcs_server.get("host") - server.port = odcs_server.get("port") - server.catalog = odcs_server.get("catalog") - server.topic = odcs_server.get("topic") - server.http_path = odcs_server.get("http_path") - server.token = odcs_server.get("token") - server.dataProductId = odcs_server.get("dataProductId") - server.outputPortId = odcs_server.get("outputPortId") - server.driver = odcs_server.get("driver") - server.roles = import_server_roles(odcs_server.get("roles")) - + server.type = odcs_server.type + server.description = odcs_server.description + server.environment = odcs_server.environment + server.format = odcs_server.format + server.project = odcs_server.project + server.dataset = odcs_server.dataset + server.path = odcs_server.path + server.delimiter = odcs_server.delimiter + server.endpointUrl = odcs_server.endpointUrl + server.location = odcs_server.location + server.account = odcs_server.account + server.database = odcs_server.database + server.schema_ = odcs_server.schema_ + server.host = odcs_server.host + server.port = odcs_server.port + server.catalog = odcs_server.catalog + server.topic = getattr(odcs_server, "topic", None) + server.http_path = getattr(odcs_server, "http_path", None) + server.token = getattr(odcs_server, "token", None) + server.driver = getattr(odcs_server, "driver", None) + server.roles = import_server_roles(odcs_server.roles) + server.storageAccount = ( + re.search(r"(?:@|://)([^.]+)\.", odcs_server.location, re.IGNORECASE) if server.type == "azure" else None + ) servers[server_name] = server return servers -def import_terms(odcs_contract: Dict[str, Any]) -> Terms | None: - if 
odcs_contract.get("description") is None: +def import_terms(odcs: Any) -> Terms | None: + if odcs.description is None: return None - if ( - odcs_contract.get("description").get("usage") is not None - or odcs_contract.get("description").get("limitations") is not None - or odcs_contract.get("price") is not None - ): + if odcs.description.usage is not None or odcs.description.limitations is not None or odcs.price is not None: terms = Terms() - if odcs_contract.get("description").get("usage") is not None: - terms.usage = odcs_contract.get("description").get("usage") - if odcs_contract.get("description").get("limitations") is not None: - terms.limitations = odcs_contract.get("description").get("limitations") - if odcs_contract.get("price") is not None: - terms.billing = f"{odcs_contract.get('price').get('priceAmount')} {odcs_contract.get('price').get('priceCurrency')} / {odcs_contract.get('price').get('priceUnit')}" + if odcs.description.usage is not None: + terms.usage = odcs.description.usage + if odcs.description.limitations is not None: + terms.limitations = odcs.description.limitations + if odcs.price is not None: + terms.billing = f"{odcs.price.priceAmount} {odcs.price.priceCurrency} / {odcs.price.priceUnit}" return terms else: return None -def import_servicelevels(odcs_contract: Dict[str, Any]) -> ServiceLevel: +def import_servicelevels(odcs: Any) -> ServiceLevel: # find the two properties we can map (based on the examples) - sla_properties = odcs_contract.get("slaProperties") if odcs_contract.get("slaProperties") is not None else [] - availability = next((p for p in sla_properties if p["property"] == "generalAvailability"), None) - retention = next((p for p in sla_properties if p["property"] == "retention"), None) + sla_properties = odcs.slaProperties if odcs.slaProperties is not None else [] + availability = next((p for p in sla_properties if p.property == "generalAvailability"), None) + retention = next((p for p in sla_properties if p.property == "retention"), None) if availability is not None or retention is not None: servicelevel = ServiceLevel() if availability is not None: - value = availability.get("value") + value = availability.value if isinstance(value, datetime.datetime): value = value.isoformat() servicelevel.availability = Availability(description=value) if retention is not None: - servicelevel.retention = Retention(period=f"{retention.get('value')}{retention.get('unit')}") + servicelevel.retention = Retention(period=f"{retention.value}{retention.unit}") return servicelevel else: return None -def get_server_type(odcs_contract: Dict[str, Any]) -> str | None: - servers = import_servers(odcs_contract) +def get_server_type(odcs: OpenDataContractStandard) -> str | None: + servers = import_servers(odcs) if servers is None or len(servers) == 0: return None # get first server from map @@ -192,49 +192,106 @@ def get_server_type(odcs_contract: Dict[str, Any]) -> str | None: return server.type -def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]: - custom_type_mappings = get_custom_type_mappings(odcs_contract.get("customProperties")) +def import_models(odcs: Any) -> Dict[str, Model]: + custom_type_mappings = get_custom_type_mappings(odcs.customProperties) - odcs_schemas = odcs_contract.get("schema") if odcs_contract.get("schema") is not None else [] + odcs_schemas = odcs.schema_ if odcs.schema_ is not None else [] result = {} for odcs_schema in odcs_schemas: - schema_name = odcs_schema.get("name") - schema_physical_name = odcs_schema.get("physicalName") - 
schema_description = odcs_schema.get("description") if odcs_schema.get("description") is not None else "" + schema_name = odcs_schema.name + schema_physical_name = odcs_schema.physicalName + schema_description = odcs_schema.description if odcs_schema.description is not None else "" model_name = schema_physical_name if schema_physical_name is not None else schema_name - model = Model(description=" ".join(schema_description.splitlines()), type="table") - model.fields = import_fields( - odcs_schema.get("properties"), custom_type_mappings, server_type=get_server_type(odcs_contract) - ) - if odcs_schema.get("quality") is not None: - # convert dict to pydantic model - - model.quality = [Quality.model_validate(q) for q in odcs_schema.get("quality")] + model = Model(description=" ".join(schema_description.splitlines()) if schema_description else "", type="table") + model.fields = import_fields(odcs_schema.properties, custom_type_mappings, server_type=get_server_type(odcs)) + if odcs_schema.quality is not None: + model.quality = convert_quality_list(odcs_schema.quality) model.title = schema_name - if odcs_schema.get("dataGranularityDescription") is not None: - model.config = {"dataGranularityDescription": odcs_schema.get("dataGranularityDescription")} + if odcs_schema.dataGranularityDescription is not None: + model.config = {"dataGranularityDescription": odcs_schema.dataGranularityDescription} result[model_name] = model return result -def import_field_config(odcs_property: Dict[str, Any], server_type=None) -> Dict[str, Any]: +def convert_quality_list(odcs_quality_list): + """Convert a list of ODCS DataQuality objects to datacontract Quality objects""" + quality_list = [] + + if odcs_quality_list is not None: + for odcs_quality in odcs_quality_list: + quality = Quality(type=odcs_quality.type) + + if odcs_quality.description is not None: + quality.description = odcs_quality.description + if odcs_quality.query is not None: + quality.query = odcs_quality.query + if odcs_quality.mustBe is not None: + quality.mustBe = odcs_quality.mustBe + if odcs_quality.mustNotBe is not None: + quality.mustNotBe = odcs_quality.mustNotBe + if odcs_quality.mustBeGreaterThan is not None: + quality.mustBeGreaterThan = odcs_quality.mustBeGreaterThan + if odcs_quality.mustBeGreaterOrEqualTo is not None: + quality.mustBeGreaterThanOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo + if odcs_quality.mustBeLessThan is not None: + quality.mustBeLessThan = odcs_quality.mustBeLessThan + if odcs_quality.mustBeLessOrEqualTo is not None: + quality.mustBeLessThanOrEqualTo = odcs_quality.mustBeLessOrEqualTo + if odcs_quality.mustBeBetween is not None: + quality.mustBeBetween = odcs_quality.mustBeBetween + if odcs_quality.mustNotBeBetween is not None: + quality.mustNotBeBetween = odcs_quality.mustNotBeBetween + if odcs_quality.engine is not None: + quality.engine = odcs_quality.engine + if odcs_quality.implementation is not None: + quality.implementation = odcs_quality.implementation + if odcs_quality.businessImpact is not None: + quality.model_extra["businessImpact"] = odcs_quality.businessImpact + if odcs_quality.dimension is not None: + quality.model_extra["dimension"] = odcs_quality.dimension + if odcs_quality.rule is not None: + quality.model_extra["rule"] = odcs_quality.rule + if odcs_quality.schedule is not None: + quality.model_extra["schedule"] = odcs_quality.schedule + if odcs_quality.scheduler is not None: + quality.model_extra["scheduler"] = odcs_quality.scheduler + if odcs_quality.severity is not None: + 
quality.model_extra["severity"] = odcs_quality.severity + if odcs_quality.method is not None: + quality.model_extra["method"] = odcs_quality.method + if odcs_quality.customProperties is not None: + quality.model_extra["customProperties"] = [] + for item in odcs_quality.customProperties: + quality.model_extra["customProperties"].append( + { + "property": item.property, + "value": item.value, + } + ) + + quality_list.append(quality) + + return quality_list + + +def import_field_config(odcs_property: SchemaProperty, server_type=None) -> Dict[str, Any]: config = {} - if odcs_property.get("criticalDataElement") is not None: - config["criticalDataElement"] = odcs_property.get("criticalDataElement") - if odcs_property.get("encryptedName") is not None: - config["encryptedName"] = odcs_property.get("encryptedName") - if odcs_property.get("partitionKeyPosition") is not None: - config["partitionKeyPosition"] = odcs_property.get("partitionKeyPosition") - if odcs_property.get("partitioned") is not None: - config["partitioned"] = odcs_property.get("partitioned") - - if odcs_property.get("customProperties") is not None and isinstance(odcs_property.get("customProperties"), list): - for item in odcs_property.get("customProperties"): - config[item["property"]] = item["value"] - - physical_type = odcs_property.get("physicalType") + if odcs_property.criticalDataElement is not None: + config["criticalDataElement"] = odcs_property.criticalDataElement + if odcs_property.encryptedName is not None: + config["encryptedName"] = odcs_property.encryptedName + if odcs_property.partitionKeyPosition is not None: + config["partitionKeyPosition"] = odcs_property.partitionKeyPosition + if odcs_property.partitioned is not None: + config["partitioned"] = odcs_property.partitioned + + if odcs_property.customProperties is not None: + for item in odcs_property.customProperties: + config[item.property] = item.value + + physical_type = odcs_property.physicalType if physical_type is not None: if server_type == "postgres" or server_type == "postgresql": config["postgresType"] = physical_type @@ -254,13 +311,13 @@ def import_field_config(odcs_property: Dict[str, Any], server_type=None) -> Dict return config -def has_composite_primary_key(odcs_properties) -> bool: - primary_keys = [prop for prop in odcs_properties if prop.get("primaryKey") is not None and prop.get("primaryKey")] +def has_composite_primary_key(odcs_properties: List[SchemaProperty]) -> bool: + primary_keys = [prop for prop in odcs_properties if prop.primaryKey is not None and prop.primaryKey] return len(primary_keys) > 1 def import_fields( - odcs_properties: Dict[str, Any], custom_type_mappings: Dict[str, str], server_type + odcs_properties: List[SchemaProperty], custom_type_mappings: Dict[str, str], server_type ) -> Dict[str, Field]: logger = logging.getLogger(__name__) result = {} @@ -269,31 +326,51 @@ def import_fields( return result for odcs_property in odcs_properties: - mapped_type = map_type(odcs_property.get("logicalType"), custom_type_mappings) + mapped_type = map_type(odcs_property.logicalType, custom_type_mappings) if mapped_type is not None: - property_name = odcs_property["name"] - description = odcs_property.get("description") if odcs_property.get("description") is not None else None + property_name = odcs_property.name + description = odcs_property.description if odcs_property.description is not None else None field = Field( description=" ".join(description.splitlines()) if description is not None else None, type=mapped_type, - 
title=odcs_property.get("businessName"), - required=odcs_property.get("required") if odcs_property.get("required") is not None else None, - primaryKey=odcs_property.get("primaryKey") - if not has_composite_primary_key(odcs_properties) and odcs_property.get("primaryKey") is not None + title=odcs_property.businessName, + required=odcs_property.required if odcs_property.required is not None else None, + primaryKey=odcs_property.primaryKey + if not has_composite_primary_key(odcs_properties) and odcs_property.primaryKey is not None else False, - unique=odcs_property.get("unique"), - examples=odcs_property.get("examples") if odcs_property.get("examples") is not None else None, - classification=odcs_property.get("classification") - if odcs_property.get("classification") is not None - else None, - tags=odcs_property.get("tags") if odcs_property.get("tags") is not None else None, - quality=odcs_property.get("quality") if odcs_property.get("quality") is not None else [], + unique=odcs_property.unique if odcs_property.unique else None, + examples=odcs_property.examples if odcs_property.examples is not None else None, + classification=odcs_property.classification if odcs_property.classification is not None else None, + tags=odcs_property.tags if odcs_property.tags is not None else None, + quality=convert_quality_list(odcs_property.quality), + fields=import_fields(odcs_property.properties, custom_type_mappings, server_type) + if odcs_property.properties is not None + else {}, config=import_field_config(odcs_property, server_type), + format=getattr(odcs_property, "format", None), ) + # mapped_type is array + if field.type == "array" and odcs_property.items is not None: + # nested array object + if odcs_property.items.logicalType == "object": + field.items = Field( + type="object", + fields=import_fields(odcs_property.items.properties, custom_type_mappings, server_type), + ) + # array of simple type + elif odcs_property.items.logicalType is not None: + field.items = Field(type=odcs_property.items.logicalType) + + # enum from quality validValues as enum + if field.type == "string": + for q in field.quality: + if hasattr(q, "validValues"): + field.enum = q.validValues + result[property_name] = field else: logger.info( - f"Can't map {odcs_property.get('column')} to the Datacontract Mapping types, as there is no equivalent or special mapping. Consider introducing a customProperty 'dc_mapping_{odcs_property.get('logicalName')}' that defines your expected type as the 'value'" + f"Can't map {odcs_property.name} to the Datacontract Mapping types, as there is no equivalent or special mapping. 
Consider introducing a customProperty 'dc_mapping_{odcs_property.logicalType}' that defines your expected type as the 'value'" ) return result @@ -311,28 +388,28 @@ def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None: return None -def get_custom_type_mappings(odcs_custom_properties: List[Any]) -> Dict[str, str]: +def get_custom_type_mappings(odcs_custom_properties: List[CustomProperty]) -> Dict[str, str]: result = {} if odcs_custom_properties is not None: for prop in odcs_custom_properties: - if prop["property"].startswith("dc_mapping_"): - odcs_type_name = prop["property"].substring(11) - datacontract_type = prop["value"] + if prop.property.startswith("dc_mapping_"): + odcs_type_name = prop.property[11:] # Changed substring to slice + datacontract_type = prop.value result[odcs_type_name] = datacontract_type return result -def get_owner(odcs_custom_properties: List[Any]) -> str | None: +def get_owner(odcs_custom_properties: List[CustomProperty]) -> str | None: if odcs_custom_properties is not None: for prop in odcs_custom_properties: - if prop["property"] == "owner": - return prop["value"] + if prop.property == "owner": + return prop.value return None -def import_tags(odcs_contract) -> List[str] | None: - if odcs_contract.get("tags") is None: +def import_tags(odcs: OpenDataContractStandard) -> List[str] | None: + if odcs.tags is None: return None - return odcs_contract.get("tags") + return odcs.tags diff --git a/datacontract/imports/protobuf_importer.py b/datacontract/imports/protobuf_importer.py index 1152f6cee..cea238898 100644 --- a/datacontract/imports/protobuf_importer.py +++ b/datacontract/imports/protobuf_importer.py @@ -238,7 +238,6 @@ def import_protobuf( os.remove(descriptor_file) - class ProtoBufImporter(Importer): def __init__(self, name): # 'name' is passed by the importer factory. @@ -263,4 +262,3 @@ def import_source( """ # Wrap the source in a list because import_protobuf expects a list of sources. return import_protobuf(data_contract_specification, [source], import_args) - diff --git a/datacontract/imports/spark_importer.py b/datacontract/imports/spark_importer.py index bca3f0aae..e74177bec 100644 --- a/datacontract/imports/spark_importer.py +++ b/datacontract/imports/spark_importer.py @@ -1,3 +1,6 @@ +import logging + +from databricks.sdk import WorkspaceClient from pyspark.sql import DataFrame, SparkSession, types from datacontract.imports.importer import Importer @@ -8,6 +11,8 @@ Server, ) +logger = logging.getLogger(__name__) + class SparkImporter(Importer): def import_source( @@ -23,39 +28,60 @@ def import_source( data_contract_specification: The data contract specification object. source: The source string indicating the Spark tables to read. import_args: Additional arguments for the import process. - Returns: dict: The updated data contract specification. """ - return import_spark(data_contract_specification, source) + dataframe = import_args.get("dataframe", None) + description = import_args.get("description", None) + return import_spark(data_contract_specification, source, dataframe, description) -def import_spark(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification: +def import_spark( + data_contract_specification: DataContractSpecification, + source: str, + dataframe: DataFrame | None = None, + description: str | None = None, +) -> DataContractSpecification: """ - Reads Spark tables and updates the data contract specification with their schemas. 
+ Imports schema(s) from Spark into a Data Contract Specification. Args: - data_contract_specification: The data contract specification to update. - source: A comma-separated string of Spark temporary views to read. + data_contract_specification (DataContractSpecification): The contract spec to update. + source (str): Comma-separated Spark table/view names. + dataframe (DataFrame | None): Optional Spark DataFrame to import. + description (str | None): Optional table-level description. Returns: - DataContractSpecification: The updated data contract specification. + DataContractSpecification: The updated contract spec with imported models. """ spark = SparkSession.builder.getOrCreate() data_contract_specification.servers["local"] = Server(type="dataframe") - for temp_view in source.split(","): - temp_view = temp_view.strip() - df = spark.read.table(temp_view) - data_contract_specification.models[temp_view] = import_from_spark_df(df) + + if dataframe is not None: + if not isinstance(dataframe, DataFrame): + raise TypeError("Expected 'dataframe' to be a pyspark.sql.DataFrame") + data_contract_specification.models[source] = import_from_spark_df(spark, source, dataframe, description) + return data_contract_specification + + if not source: + raise ValueError("Either 'dataframe' or a valid 'source' must be provided") + + for table_name in map(str.strip, source.split(",")): + df = spark.read.table(table_name) + data_contract_specification.models[table_name] = import_from_spark_df(spark, table_name, df, description) + return data_contract_specification -def import_from_spark_df(df: DataFrame) -> Model: +def import_from_spark_df(spark: SparkSession, source: str, df: DataFrame, description: str) -> Model: """ Converts a Spark DataFrame into a Model. Args: + spark: SparkSession + source: A comma-separated string of Spark temporary views to read. df: The Spark DataFrame to convert. + description: Table level comment Returns: Model: The generated data contract model. @@ -63,6 +89,11 @@ def import_from_spark_df(df: DataFrame) -> Model: model = Model() schema = df.schema + if description is None: + model.description = _table_comment_from_spark(spark, source) + else: + model.description = description + for field in schema: model.fields[field.name] = _field_from_struct_type(field) @@ -154,5 +185,67 @@ def _data_type_from_spark(spark_type: types.DataType) -> str: return "null" elif isinstance(spark_type, types.VarcharType): return "varchar" + elif isinstance(spark_type, types.VariantType): + return "variant" else: raise ValueError(f"Unsupported Spark type: {spark_type}") + + +def _table_comment_from_spark(spark: SparkSession, source: str): + """ + Attempts to retrieve the table-level comment from a Spark table using multiple fallback methods. + + Args: + spark (SparkSession): The active Spark session. + source (str): The name of the table (without catalog or schema). + + Returns: + str or None: The table-level comment, if found. 
+ """ + + # Get Current Catalog and Schema from Spark Session + try: + current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0] + except Exception: + current_catalog = "hive_metastore" # Fallback for non-Unity Catalog clusters + try: + current_schema = spark.catalog.currentDatabase() + except Exception: + current_schema = spark.sql("SELECT current_database()").collect()[0][0] + + # Get table comment if it exists + table_comment = "" + source = f"{current_catalog}.{current_schema}.{source}" + try: + # Initialize WorkspaceClient for Unity Catalog API calls + workspace_client = WorkspaceClient() + created_table = workspace_client.tables.get(full_name=f"{source}") + table_comment = created_table.comment + logger.info(f"'{source}' table comment retrieved using 'WorkspaceClient.tables.get({source})'") + return table_comment + except Exception: + pass + + # Fallback to Spark Catalog API for Hive Metastore or Non-UC Tables + try: + table_comment = spark.catalog.getTable(f"{source}").description + logger.info(f"'{source}' table comment retrieved using 'spark.catalog.getTable({source}).description'") + return table_comment + except Exception: + pass + + # Final Fallback Using DESCRIBE TABLE EXTENDED + try: + rows = spark.sql(f"DESCRIBE TABLE EXTENDED {source}").collect() + for row in rows: + if row.col_name.strip().lower() == "comment": + table_comment = row.data_type + break + logger.info(f"'{source}' table comment retrieved using 'DESCRIBE TABLE EXTENDED {source}'") + return table_comment + except Exception: + pass + + logger.info(f"{source} table comment could not be retrieved") + + return None diff --git a/datacontract/imports/sql_importer.py b/datacontract/imports/sql_importer.py index c51e4272c..c08efaee6 100644 --- a/datacontract/imports/sql_importer.py +++ b/datacontract/imports/sql_importer.py @@ -105,7 +105,7 @@ def to_dialect(import_args: dict) -> Dialects | None: return None -def to_physical_type_key(dialect: Dialects | None) -> str: +def to_physical_type_key(dialect: Dialects | str | None) -> str: dialect_map = { Dialects.TSQL: "sqlserverType", Dialects.POSTGRES: "postgresType", @@ -116,6 +116,8 @@ def to_physical_type_key(dialect: Dialects | None) -> str: Dialects.MYSQL: "mysqlType", Dialects.DATABRICKS: "databricksType", } + if isinstance(dialect, str): + dialect = Dialects[dialect.upper()] if dialect.upper() in Dialects.__members__ else None return dialect_map.get(dialect, "physicalType") @@ -198,7 +200,7 @@ def get_precision_scale(column): return None, None -def map_type_from_sql(sql_type: str): +def map_type_from_sql(sql_type: str) -> str | None: if sql_type is None: return None diff --git a/datacontract/imports/unity_importer.py b/datacontract/imports/unity_importer.py index 403438cb6..e85e9aae3 100644 --- a/datacontract/imports/unity_importer.py +++ b/datacontract/imports/unity_importer.py @@ -1,14 +1,14 @@ import json import os -from typing import List, Optional +from typing import List from databricks.sdk import WorkspaceClient from databricks.sdk.service.catalog import ColumnInfo, TableInfo -from pyspark.sql import types +from open_data_contract_standard.model import OpenDataContractStandard from datacontract.imports.importer import Importer -from datacontract.imports.spark_importer import _field_from_struct_type -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model +from datacontract.imports.sql_importer import map_type_from_sql, to_physical_type_key +from datacontract.model.data_contract_specification import 
DataContractSpecification, Field, Model, Server from datacontract.model.exceptions import DataContractException @@ -18,8 +18,11 @@ class UnityImporter(Importer): """ def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: + self, + data_contract_specification: DataContractSpecification | OpenDataContractStandard, + source: str, + import_args: dict, + ) -> DataContractSpecification | OpenDataContractStandard: """ Import data contract specification from a source. @@ -35,15 +38,14 @@ def import_source( if source is not None: data_contract_specification = import_unity_from_json(data_contract_specification, source) else: - data_contract_specification = import_unity_from_api( - data_contract_specification, import_args.get("unity_table_full_name") - ) + unity_table_full_name_list = import_args.get("unity_table_full_name") + data_contract_specification = import_unity_from_api(data_contract_specification, unity_table_full_name_list) return data_contract_specification def import_unity_from_json( - data_contract_specification: DataContractSpecification, source: str -) -> DataContractSpecification: + data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str +) -> DataContractSpecification | OpenDataContractStandard: """ Import data contract specification from a JSON file. @@ -71,39 +73,66 @@ def import_unity_from_json( def import_unity_from_api( - data_contract_specification: DataContractSpecification, unity_table_full_name: Optional[str] = None + data_contract_specification: DataContractSpecification, unity_table_full_name_list: List[str] = None ) -> DataContractSpecification: """ Import data contract specification from Unity Catalog API. :param data_contract_specification: The data contract specification to be imported. :type data_contract_specification: DataContractSpecification - :param unity_table_full_name: The full name of the Unity table. - :type unity_table_full_name: Optional[str] + :param unity_table_full_name_list: The full name of the Unity table. + :type unity_table_full_name_list: list[str] :return: The imported data contract specification. :rtype: DataContractSpecification :raises DataContractException: If there is an error retrieving the schema from the API. 
""" try: - workspace_client = WorkspaceClient() - unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name) + # print(f"Retrieving Unity Catalog schema for table: {unity_table_full_name}") + host, token = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"), os.getenv("DATACONTRACT_DATABRICKS_TOKEN") + # print(f"Databricks host: {host}, token: {'***' if token else 'not set'}") + if not host: + raise DataContractException( + type="configuration", + name="Databricks configuration", + reason="DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set", + engine="datacontract", + ) + if not token: + raise DataContractException( + type="configuration", + name="Databricks configuration", + reason="DATACONTRACT_DATABRICKS_TOKEN environment variable is not set", + engine="datacontract", + ) + workspace_client = WorkspaceClient(host=host, token=token) except Exception as e: raise DataContractException( type="schema", name="Retrieve unity catalog schema", - reason=f"Failed to retrieve unity catalog schema from databricks profile: {os.getenv('DATABRICKS_CONFIG_PROFILE')}", + reason="Failed to connect to unity catalog schema", engine="datacontract", original_exception=e, ) - convert_unity_schema(data_contract_specification, unity_schema) + for unity_table_full_name in unity_table_full_name_list: + try: + unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name) + except Exception as e: + raise DataContractException( + type="schema", + name="Retrieve unity catalog schema", + reason=f"Unity table {unity_table_full_name} not found", + engine="datacontract", + original_exception=e, + ) + data_contract_specification = convert_unity_schema(data_contract_specification, unity_schema) return data_contract_specification def convert_unity_schema( - data_contract_specification: DataContractSpecification, unity_schema: TableInfo -) -> DataContractSpecification: + data_contract_specification: DataContractSpecification | OpenDataContractStandard, unity_schema: TableInfo +) -> DataContractSpecification | OpenDataContractStandard: """ Convert Unity schema to data contract specification. @@ -117,6 +146,21 @@ def convert_unity_schema( if data_contract_specification.models is None: data_contract_specification.models = {} + if data_contract_specification.servers is None: + data_contract_specification.servers = {} + + # Configure databricks server with catalog and schema from Unity table info + schema_name = unity_schema.schema_name + catalog_name = unity_schema.catalog_name + if catalog_name and schema_name: + server_name = "myserver" # Default server name + + data_contract_specification.servers[server_name] = Server( + type="databricks", + catalog=catalog_name, + schema=schema_name, + ) + fields = import_table_fields(unity_schema.columns) table_id = unity_schema.name or unity_schema.table_id @@ -149,25 +193,21 @@ def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]: imported_fields = {} for column in columns: - struct_field: types.StructField = _type_json_to_spark_field(column.type_json) - imported_fields[column.name] = _field_from_struct_type(struct_field) + imported_fields[column.name] = _to_field(column) return imported_fields -def _type_json_to_spark_field(type_json: str) -> types.StructField: - """ - Parses a JSON string representing a Spark field and returns a StructField object. 
+def _to_field(column: ColumnInfo) -> Field: + field = Field() + if column.type_name is not None: + sql_type = str(column.type_text) + field.type = map_type_from_sql(sql_type) + physical_type_key = to_physical_type_key("databricks") + field.config = { + physical_type_key: sql_type, + } + field.required = column.nullable is None or not column.nullable + field.description = column.comment if column.comment else None - The reason we do this is to leverage the Spark JSON schema parser to handle the - complexity of the Spark field types. The field `type_json` in the Unity API is - the output of a `StructField.jsonValue()` call. - - :param type_json: The JSON string representing the Spark field. - :type type_json: str - - :return: The StructField object. - :rtype: types.StructField - """ - type_dict = json.loads(type_json) - return types.StructField.fromJson(type_dict) + return field diff --git a/datacontract/integration/datamesh_manager.py b/datacontract/integration/datamesh_manager.py index f314f1570..52020bf02 100644 --- a/datacontract/integration/datamesh_manager.py +++ b/datacontract/integration/datamesh_manager.py @@ -4,6 +4,9 @@ from datacontract.model.run import Run +# used to retrieve the HTML location of the published data contract or test results +RESPONSE_HEADER_LOCATION_HTML = "location-html" + def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_verification: bool): try: @@ -38,7 +41,12 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_ver if response.status_code != 200: run.log_error(f"Error publishing test results to Data Mesh Manager: {response.text}") return - run.log_info(f"Published test results to {url}") + run.log_info("Published test results successfully") + + location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML) + if location_html is not None and len(location_html) > 0: + print(f"🚀 Open {location_html}") + except Exception as e: run.log_error(f"Failed publishing test results. Error: {str(e)}") @@ -67,6 +75,12 @@ def publish_data_contract_to_datamesh_manager(data_contract_dict: dict, ssl_veri if response.status_code != 200: print(f"Error publishing data contract to Data Mesh Manager: {response.text}") exit(1) - print(f"Published data contract to {url}") + + print("✅ Published data contract successfully") + + location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML) + if location_html is not None and len(location_html) > 0: + print(f"🚀 Open {location_html}") + except Exception as e: print(f"Failed publishing data contract. Error: {str(e)}") diff --git a/datacontract/lint/linters/description_linter.py b/datacontract/lint/linters/description_linter.py index 615dd3edc..2f88615b8 100644 --- a/datacontract/lint/linters/description_linter.py +++ b/datacontract/lint/linters/description_linter.py @@ -23,9 +23,7 @@ def lint_implementation(self, contract: DataContractSpecification) -> LinterResu result = result.with_error(f"Model '{model_name}' has empty description.") for field_name, field in model.fields.items(): if not field.description: - result = result.with_error( - f"Field '{field_name}' in model '{model_name}'" f" has empty description." 
- ) + result = result.with_error(f"Field '{field_name}' in model '{model_name}' has empty description.") for definition_name, definition in contract.definitions.items(): if not definition.description: result = result.with_error(f"Definition '{definition_name}' has empty description.") diff --git a/datacontract/lint/linters/field_reference_linter.py b/datacontract/lint/linters/field_reference_linter.py index 97213cd72..0a9a52435 100644 --- a/datacontract/lint/linters/field_reference_linter.py +++ b/datacontract/lint/linters/field_reference_linter.py @@ -34,8 +34,7 @@ def lint_implementation(self, contract: DataContractSpecification) -> LinterResu if ref_model not in contract.models: result = result.with_error( - f"Field '{field_name}' in model '{model_name}'" - f" references non-existing model '{ref_model}'." + f"Field '{field_name}' in model '{model_name}' references non-existing model '{ref_model}'." ) else: ref_model_obj = contract.models[ref_model] diff --git a/datacontract/lint/linters/notice_period_linter.py b/datacontract/lint/linters/notice_period_linter.py index bb09703e1..7051bc4f6 100644 --- a/datacontract/lint/linters/notice_period_linter.py +++ b/datacontract/lint/linters/notice_period_linter.py @@ -41,10 +41,10 @@ def lint_implementation(self, contract: DataContractSpecification) -> LinterResu if not period: return LinterResult.cautious("No notice period defined.") if not period.startswith("P"): - return LinterResult.erroneous(f"Notice period '{period}' is not a valid" "ISO8601 duration.") + return LinterResult.erroneous(f"Notice period '{period}' is not a valid ISO8601 duration.") if period == "P": return LinterResult.erroneous( - "Notice period 'P' is not a valid" "ISO8601 duration, requires at least one" "duration to be specified." + "Notice period 'P' is not a valid ISO8601 duration, requires at least one duration to be specified." 
) if ( not self.simple.fullmatch(period) diff --git a/datacontract/lint/linters/quality_schema_linter.py b/datacontract/lint/linters/quality_schema_linter.py deleted file mode 100644 index a2c17de3c..000000000 --- a/datacontract/lint/linters/quality_schema_linter.py +++ /dev/null @@ -1,52 +0,0 @@ -import yaml - -from datacontract.model.data_contract_specification import DataContractSpecification, Model - -from ..lint import Linter, LinterResult - - -class QualityUsesSchemaLinter(Linter): - @property - def name(self) -> str: - return "Quality check(s) use model" - - @property - def id(self) -> str: - return "quality-schema" - - def lint_sodacl(self, check, models: dict[str, Model]) -> LinterResult: - result = LinterResult() - for sodacl_check in check.keys(): - table_name = sodacl_check[len("checks for ") :] - if table_name not in models: - result = result.with_error(f"Quality check on unknown model '{table_name}'") - return result - - def lint_montecarlo(self, check, models: dict[str, Model]) -> LinterResult: - return LinterResult().with_warning("Linting montecarlo checks is not currently implemented") - - def lint_great_expectations(self, check, models: dict[str, Model]) -> LinterResult: - return LinterResult().with_warning("Linting great expectations checks is not currently implemented") - - def lint_implementation(self, contract: DataContractSpecification) -> LinterResult: - result = LinterResult() - models = contract.models - check = contract.quality - if not check: - return LinterResult() - if not check.specification: - return LinterResult.cautious("Quality check without specification.") - if isinstance(check.specification, str): - check_specification = yaml.safe_load(check.specification) - else: - check_specification = check.specification - match check.type: - case "SodaCL": - result = result.combine(self.lint_sodacl(check_specification, models)) - case "montecarlo": - result = result.combine(self.lint_montecarlo(check_specification, models)) - case "great-expectations": - result = result.combine(self.lint_great_expectations(check_specification, models)) - case _: - result = result.with_warning("Can't lint quality check " f"with type '{check.type}'") - return result diff --git a/datacontract/lint/linters/valid_constraints_linter.py b/datacontract/lint/linters/valid_constraints_linter.py index c1f764787..54afea84d 100644 --- a/datacontract/lint/linters/valid_constraints_linter.py +++ b/datacontract/lint/linters/valid_constraints_linter.py @@ -40,7 +40,7 @@ def check_minimum_maximum(self, field: Field, field_name: str, model_name: str) ): case (True, True, _, _) if min > max: return LinterResult.erroneous( - f"Minimum {min} is greater than maximum {max} on " f"field '{field_name}' in model '{model_name}'." + f"Minimum {min} is greater than maximum {max} on field '{field_name}' in model '{model_name}'." ) case (_, _, True, True) if xmin >= xmax: return LinterResult.erroneous( @@ -68,11 +68,11 @@ def check_string_constraints(self, field: Field, field_name: str, model_name: st result = LinterResult() if field.minLength and field.maxLength and field.minLength > field.maxLength: result = result.with_error( - f"Minimum length is greater that maximum length on" f" field '{field_name}' in model '{model_name}'." + f"Minimum length is greater that maximum length on field '{field_name}' in model '{model_name}'." ) if field.pattern and field.format: result = result.with_error( - f"Both a pattern and a format are defined for field" f" '{field_name}' in model '{model_name}'." 
+ f"Both a pattern and a format are defined for field '{field_name}' in model '{model_name}'." ) return result diff --git a/datacontract/lint/resolve.py b/datacontract/lint/resolve.py index 78eb5f170..a116246ab 100644 --- a/datacontract/lint/resolve.py +++ b/datacontract/lint/resolve.py @@ -1,11 +1,13 @@ import logging import os +import warnings import fastjsonschema import yaml from fastjsonschema import JsonSchemaValueException +from open_data_contract_standard.model import OpenDataContractStandard -from datacontract.imports.odcs_v3_importer import import_odcs_v3_from_str +from datacontract.imports.odcs_v3_importer import import_from_odcs, parse_odcs_v3_from_str from datacontract.lint.resources import read_resource from datacontract.lint.schema import fetch_schema from datacontract.lint.urls import fetch_resource @@ -16,6 +18,7 @@ ) from datacontract.model.exceptions import DataContractException from datacontract.model.odcs import is_open_data_contract_standard +from datacontract.model.run import ResultEnum def resolve_data_contract( @@ -37,7 +40,35 @@ def resolve_data_contract( else: raise DataContractException( type="lint", - result="failed", + result=ResultEnum.failed, + name="Check that data contract YAML is valid", + reason="Data contract needs to be provided", + engine="datacontract", + ) + + +def resolve_data_contract_v2( + data_contract_location: str = None, + data_contract_str: str = None, + data_contract: DataContractSpecification | OpenDataContractStandard = None, + schema_location: str = None, + inline_definitions: bool = False, + inline_quality: bool = False, +) -> DataContractSpecification | OpenDataContractStandard: + if data_contract_location is not None: + return resolve_data_contract_from_location_v2( + data_contract_location, schema_location, inline_definitions, inline_quality + ) + elif data_contract_str is not None: + return _resolve_data_contract_from_str_v2( + data_contract_str, schema_location, inline_definitions, inline_quality + ) + elif data_contract is not None: + return data_contract + else: + raise DataContractException( + type="lint", + result=ResultEnum.failed, name="Check that data contract YAML is valid", reason="Data contract needs to be provided", engine="datacontract", @@ -58,13 +89,20 @@ def resolve_data_contract_dict( else: raise DataContractException( type="lint", - result="failed", + result=ResultEnum.failed, name="Check that data contract YAML is valid", reason="Data contract needs to be provided", engine="datacontract", ) +def resolve_data_contract_from_location_v2( + location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False +) -> DataContractSpecification | OpenDataContractStandard: + data_contract_str = read_resource(location) + return _resolve_data_contract_from_str_v2(data_contract_str, schema_location, inline_definitions, inline_quality) + + def resolve_data_contract_from_location( location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False ) -> DataContractSpecification: @@ -152,7 +190,7 @@ def _resolve_definition_ref(ref, spec) -> Definition: else: raise DataContractException( type="lint", - result="failed", + result=ResultEnum.failed, name="Check that data contract YAML is valid", reason=f"Cannot resolve reference {ref}", engine="datacontract", @@ -165,7 +203,7 @@ def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification) if definition_key not in spec.definitions: raise DataContractException( type="lint", - result="failed", + 
result=ResultEnum.failed, name="Check that data contract YAML is valid", reason=f"Cannot resolve definition {definition_key}", engine="datacontract", @@ -195,7 +233,7 @@ def _fetch_file(path) -> str: if not os.path.exists(path): raise DataContractException( type="export", - result="failed", + result=ResultEnum.failed, name="Check that data contract definition is valid", reason=f"Cannot resolve reference {path}", engine="datacontract", @@ -230,7 +268,7 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object: if not os.path.exists(ref): raise DataContractException( type="export", - result="failed", + result=ResultEnum.failed, name="Check that data contract quality is valid", reason=f"Cannot resolve reference {ref}", engine="datacontract", @@ -240,6 +278,21 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object: return quality_spec +def _resolve_data_contract_from_str_v2( + data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False +) -> DataContractSpecification | OpenDataContractStandard: + yaml_dict = _to_yaml(data_contract_str) + + if is_open_data_contract_standard(yaml_dict): + logging.info("Importing ODCS v3") + # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly + odcs = parse_odcs_v3_from_str(data_contract_str) + return odcs + + logging.info("Importing DCS") + return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict) + + def _resolve_data_contract_from_str( data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False ) -> DataContractSpecification: @@ -248,20 +301,36 @@ def _resolve_data_contract_from_str( if is_open_data_contract_standard(yaml_dict): logging.info("Importing ODCS v3") # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly + odcs = parse_odcs_v3_from_str(data_contract_str) + data_contract_specification = DataContractSpecification(dataContractSpecification="1.1.0") - return import_odcs_v3_from_str(data_contract_specification, source_str=data_contract_str) - else: - logging.info("Importing DCS") + return import_from_odcs(data_contract_specification, odcs) + logging.info("Importing DCS") + return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict) + + +def _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict): _validate_data_contract_specification_schema(yaml_dict, schema_location) data_contract_specification = yaml_dict spec = DataContractSpecification(**data_contract_specification) - if inline_definitions: inline_definitions_into_data_contract(spec) - if spec.quality and inline_quality: - _resolve_quality_ref(spec.quality) - + ## Suppress DeprecationWarning when accessing spec.quality, + ## iif it is in fact *not* used. 
+ with warnings.catch_warnings(record=True) as recorded_warnings: + spec_quality = spec.quality + for w in recorded_warnings: + if not issubclass(w.category, DeprecationWarning) or spec_quality is not None: + warnings.warn_explicit( + message=w.message, + category=w.category, + filename=w.filename, + lineno=w.lineno, + source=w.source, + ) + if spec_quality and inline_quality: + _resolve_quality_ref(spec_quality) return spec diff --git a/datacontract/model/data_contract_specification.py b/datacontract/model/data_contract_specification.py deleted file mode 100644 index 0a3c1cd01..000000000 --- a/datacontract/model/data_contract_specification.py +++ /dev/null @@ -1,327 +0,0 @@ -import os -from typing import Any, Dict, List - -import pydantic as pyd -import yaml - -DATACONTRACT_TYPES = [ - "string", - "text", - "varchar", - "number", - "decimal", - "numeric", - "int", - "integer", - "long", - "bigint", - "float", - "double", - "boolean", - "timestamp", - "timestamp_tz", - "timestamp_ntz", - "date", - "array", - "bytes", - "object", - "record", - "struct", - "null", -] - - -class Contact(pyd.BaseModel): - name: str | None = None - url: str | None = None - email: str | None = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class ServerRole(pyd.BaseModel): - name: str | None = None - description: str | None = None - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Server(pyd.BaseModel): - type: str | None = None - description: str | None = None - environment: str | None = None - format: str | None = None - project: str | None = None - dataset: str | None = None - path: str | None = None - delimiter: str | None = None - endpointUrl: str | None = None - location: str | None = None - account: str | None = None - database: str | None = None - schema_: str | None = pyd.Field(default=None, alias="schema") - host: str | None = None - port: int | None = None - catalog: str | None = None - topic: str | None = None - http_path: str | None = None # Use ENV variable - token: str | None = None # Use ENV variable - dataProductId: str | None = None - outputPortId: str | None = None - driver: str | None = None - storageAccount: str | None = None - roles: List[ServerRole] = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Terms(pyd.BaseModel): - usage: str | None = None - limitations: str | None = None - billing: str | None = None - noticePeriod: str | None = None - description: str | None = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Definition(pyd.BaseModel): - domain: str | None = None - name: str | None = None - title: str | None = None - description: str | None = None - type: str | None = None - enum: List[str] = [] - format: str | None = None - minLength: int | None = None - maxLength: int | None = None - pattern: str | None = None - minimum: int | None = None - exclusiveMinimum: int | None = None - maximum: int | None = None - exclusiveMaximum: int | None = None - pii: bool | None = None - classification: str | None = None - fields: Dict[str, "Field"] = {} - items: "Field" = None - tags: List[str] = [] - links: Dict[str, str] = {} - example: str | None = None - examples: List[Any] | None = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Quality(pyd.BaseModel): - type: str | None = None - description: str | None = None - query: str | None = None - dialect: str | None = None - mustBe: int | None = None - mustNotBe: int | None = None - mustBeGreaterThan: int | None = None - mustBeGreaterThanOrEqualTo: 
int | None = None - mustBeLessThan: int | None = None - mustBeLessThanOrEqualTo: int | None = None - mustBeBetween: List[int] = None - mustNotBeBetween: List[int] = None - engine: str | None = None - implementation: str | Dict[str, Any] | None = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Field(pyd.BaseModel): - ref: str = pyd.Field(default=None, alias="$ref") - title: str | None = None - type: str | None = None - format: str | None = None - required: bool | None = None - primary: bool = pyd.Field( - default=None, - deprecated="Removed in Data Contract Specification v1.1.0. Use primaryKey instead.", - ) - primaryKey: bool | None = None - unique: bool | None = None - references: str | None = None - description: str | None = None - pii: bool | None = None - classification: str | None = None - pattern: str | None = None - minLength: int | None = None - maxLength: int | None = None - minimum: int | None = None - exclusiveMinimum: int | None = None - maximum: int | None = None - exclusiveMaximum: int | None = None - enum: List[str] | None = [] - tags: List[str] | None = [] - links: Dict[str, str] = {} - fields: Dict[str, "Field"] = {} - items: "Field" = None - keys: "Field" = None - values: "Field" = None - precision: int | None = None - scale: int | None = None - example: Any | None = pyd.Field( - default=None, - deprecated="Removed in Data Contract Specification v1.1.0. Use examples instead.", - ) - examples: List[Any] | None = None - quality: List[Quality] | None = [] - config: Dict[str, Any] | None = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Model(pyd.BaseModel): - description: str | None = None - type: str | None = None - namespace: str | None = None - title: str | None = None - fields: Dict[str, Field] = {} - quality: List[Quality] | None = [] - primaryKey: List[str] | None = [] - examples: List[Any] | None = None - config: Dict[str, Any] = None - tags: List[str] | None = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Info(pyd.BaseModel): - title: str | None = None - version: str | None = None - status: str | None = None - description: str | None = None - owner: str | None = None - contact: Contact | None = None - - model_config = pyd.ConfigDict( - extra="allow", - ) - - -class Example(pyd.BaseModel): - type: str | None = None - description: str | None = None - model: str | None = None - data: str | object = None - - -# Deprecated Quality class -class DeprecatedQuality(pyd.BaseModel): - type: str | None = None - specification: str | object = None - - -class Availability(pyd.BaseModel): - description: str | None = None - percentage: str | None = None - - -class Retention(pyd.BaseModel): - description: str | None = None - period: str | None = None - unlimited: bool | None = None - timestampField: str | None = None - - -class Latency(pyd.BaseModel): - description: str | None = None - threshold: str | None = None - sourceTimestampField: str | None = None - processedTimestampField: str | None = None - - -class Freshness(pyd.BaseModel): - description: str | None = None - threshold: str | None = None - timestampField: str | None = None - - -class Frequency(pyd.BaseModel): - description: str | None = None - type: str | None = None - interval: str | None = None - cron: str | None = None - - -class Support(pyd.BaseModel): - description: str | None = None - time: str | None = None - responseTime: str | None = None - - -class Backup(pyd.BaseModel): - description: str | None = None - interval: str | None = None - cron: 
str | None = None - recoveryTime: str | None = None - recoveryPoint: str | None = None - - -class ServiceLevel(pyd.BaseModel): - availability: Availability | None = None - retention: Retention | None = None - latency: Latency | None = None - freshness: Freshness | None = None - frequency: Frequency | None = None - support: Support | None = None - backup: Backup | None = None - - -class DataContractSpecification(pyd.BaseModel): - dataContractSpecification: str | None = None - id: str | None = None - info: Info | None = None - servers: Dict[str, Server] = {} - terms: Terms | None = None - models: Dict[str, Model] = {} - definitions: Dict[str, Definition] = {} - examples: List[Example] = pyd.Field( - default_factory=list, - deprecated="Removed in Data Contract Specification " "v1.1.0. Use models.examples instead.", - ) - quality: DeprecatedQuality | None = pyd.Field( - default=None, - deprecated="Removed in Data Contract Specification v1.1.0. Use " "model-level and field-level quality instead.", - ) - servicelevels: ServiceLevel | None = None - links: Dict[str, str] = {} - tags: List[str] = [] - - @classmethod - def from_file(cls, file): - if not os.path.exists(file): - raise FileNotFoundError(f"The file '{file}' does not exist.") - with open(file, "r") as file: - file_content = file.read() - return DataContractSpecification.from_string(file_content) - - @classmethod - def from_string(cls, data_contract_str): - data = yaml.safe_load(data_contract_str) - return DataContractSpecification(**data) - - def to_yaml(self): - return yaml.safe_dump( - self.model_dump(mode="json", exclude_defaults=True, exclude_none=True, by_alias=True), - sort_keys=False, - allow_unicode=True, - ) diff --git a/datacontract/model/data_contract_specification/__init__.py b/datacontract/model/data_contract_specification/__init__.py new file mode 100644 index 000000000..2141d2da0 --- /dev/null +++ b/datacontract/model/data_contract_specification/__init__.py @@ -0,0 +1 @@ +from datacontract_specification.model import * diff --git a/datacontract/templates/datacontract.html b/datacontract/templates/datacontract.html index 31aa60a28..24d65357f 100644 --- a/datacontract/templates/datacontract.html +++ b/datacontract/templates/datacontract.html @@ -5,6 +5,8 @@ {# #} + + @@ -29,7 +31,6 @@ -
@@ -40,6 +41,10 @@

{{ datacontract.id }} + + + Data Contract Specification v{{ datacontract.dataContractSpecification }} +

{% if datacontract.tags %}
@@ -77,7 +82,6 @@

@@ -103,6 +107,52 @@

Servers {% endif %} +
+
+

Entity Relationship + Diagram

+

Visual representation of data model relationships

+
+
+
+
+
+                    {{ mermaid_diagram }}
+                  
+
+
+
+ +
+
diff --git a/datacontract/templates/datacontract_odcs.html b/datacontract/templates/datacontract_odcs.html
new file mode 100644
index 000000000..cd9ab779d
--- /dev/null
+++ b/datacontract/templates/datacontract_odcs.html
@@ -0,0 +1,666 @@
+ + + + Data Contract + + + {# #} + + + + + + +
+ + +
+ +
+
+
+
+

+ Data Contract

+
+ {{ datacontract.id }} +
+
+ + Open Data Contract Standard {{ datacontract.apiVersion }} + + + {% if datacontract.tags %} +
+ {% for tag in datacontract.tags %} + + {{ tag }} + + {% endfor %} +
+ {% endif %} +
+
+
+ +
+
+
+ +
+
+
+
+

Fundamentals

+

Basic information about the data contract

+
+ +
+
+
+ + {% if datacontract.name %} +
+
Name
+
{{ datacontract.name }}
+
+ {% endif %} + + {% if datacontract.version %} +
+
Version
+
{{ datacontract.version }}
+
+ {% endif %} + + {% if datacontract.status %} +
+
Status
+
{{ datacontract.status }}
+
+ {% endif %} + + {% if datacontract.dataProduct %} +
+
Data Product
+
{{ datacontract.dataProduct }}
+
+ {% endif %} + + {% if datacontract.tenant %} +
+
Tenant
+
{{ datacontract.tenant }}
+
+ {% endif %} + + {% if datacontract.description %} + {% if datacontract.description.purpose %} +
+
Purpose
+
{{ datacontract.description.purpose }}
+
+ {% endif %} + + {% if datacontract.description.usage %} +
+
Usage
+
{{ datacontract.description.usage }}
+
+ {% endif %} + + {% if datacontract.description.limitations %} +
+
Limitations
+
{{ datacontract.description.limitations }}
+
+ {% endif %} + {% endif %} + + {% if datacontract.contractCreatedTs %} +
+
Contract Created
+
{{ datacontract.contractCreatedTs }}
+
+ {% endif %} +
+
+
+
+ +
+
+

Entity Relationship + Diagram

+

Visual representation of data model relationships

+
+
+
+
+
+                    {{ mermaid_diagram }}
+                  
+
+
+
+ +
+ +
+
+
+

+ Schema +

+

The data schema and structure

+
+
+ + {% for schema in datacontract.schema_ %} + +
+
+
+
+ + + + + + + + + + + + + + + + {% for property in schema.properties %} + + + + + + + + {% endfor %} + + {% if schema.quality %} + + {% for quality in schema.quality %} + + + + {% endfor %} + + {% endif %} +
+ {% if schema.name %} + {{ schema.name }} + {% endif %} + {% if schema.physicalName and schema.physicalName != schema.name %} + ({{ schema.physicalName }}) + {% endif %} + {{ schema.physicalType or schema.logicalType }} +
{{ schema.description }} +
+ {% if schema.dataGranularityDescription %} +
Granularity: {{ + schema.dataGranularityDescription }} +
+ {% endif %} +
+ Property + + Business Name + + Type + + Required + + Description +
+
{{ property.name }}
+ {% if property.primaryKey %} + Primary Key + {% endif %} + {% if property.partitioned %} + Partitioned + {% endif %} + {% if property.criticalDataElement %} + Critical + {% endif %} +
{{ + property.businessName or "-" }} + +
{{ property.logicalType }}
+ {% if property.physicalType and property.physicalType != + property.logicalType %} +
{{ property.physicalType }}
+ {% endif %} +
+ {% if property.required %} + Yes + {% else %} + No + {% endif %} + {{ property.description or "-" + }} +
+
+
{{ quality.rule }}
+
{{ quality.description }}
+ {% if quality.dimension %} + {{ quality.dimension }} + {% endif %} +
+
+
+
+
+
+ {% endfor %} +
+ + {% if datacontract.support %} +
+
+

Support

+

Support channels and resources

+
+ +
    + {% for support in datacontract.support %} +
  • +
    +
    +
    +

    {{ support.channel }}

    + {% if support.description %} +

    {{ support.description }}

    + {% endif %} + {% if support.tool %} + {{ support.tool }} + {% endif %} +
    +
    + {% if support.url %} + + {% endif %} +
    +
  • + {% endfor %} +
+
+ {% endif %} + + {% if datacontract.price %} +
+
+

Pricing

+

Data contract pricing information

+
+ +
+
+
+
+ {{ datacontract.price.priceAmount }} {{ datacontract.price.priceCurrency }} +
+
+ per {{ datacontract.price.priceUnit }} +
+
+
+
+
+ {% endif %} + + + {% if datacontract.team %} +
+
+

Team

+

Team members and their roles

+
+ +
+ + + + + + + + + + + + {% for member in datacontract.team %} + + + + + + + + {% endfor %} + +
+ Username + + Role + + Date In + + Date Out + + Comment +
{{ + member.username }} + {{ member.role }} + {{ member.dateIn or + "-" }} + {{ member.dateOut or + "-" }} + {{ member.comment or + "-" }} +
+
+
+ {% endif %} + + {% if datacontract.roles %} +
+
+

Access Roles

+

Access roles and approval workflows

+
+ +
+ + + + + + + + + + + {% for role in datacontract.roles %} + + + + + + + {% endfor %} + +
+ Role + + Access + + First Level Approver + + Second Level Approver +
{{ + role.role }} + + + {{ role.access or "-" }} + + {{ + role.firstLevelApprovers or "-" }} + {{ + role.secondLevelApprovers or "-" }} +
+
+
+ {% endif %} + + + {% if datacontract.slaProperties %} +
+
+

Service Level Agreements

+

SLA properties and commitments

+
+ +
+ + + + + + + + + + + + {% for sla in datacontract.slaProperties %} + + + + + + + + {% endfor %} + +
PropertyValueUnitElementDriver
{{ sla.property }}{{ sla.value }}{{ sla.unit or "-" }}{{ sla.element or "-" }}{{ sla.driver or "-" }}
+
+
+ {% endif %} + + + {% if datacontract.servers %} +
+
+

Servers

+

Infrastructure servers of the data contract

+
+ +
    + {% for server in datacontract.servers %} + {{ render_partial('partials/server.html', server_name = server.server, server = + server) }} + {% endfor %} +
+ +
+ {% endif %} + + + {% if datacontract.customProperties %} +
+
+

Custom Properties

+

Additional custom properties and metadata

+
+ +
+ + + + + + + + + {% for prop in datacontract.customProperties %} + + + + + {% endfor %} + +
PropertyValue
{{ prop.property }} + {% if prop.value is iterable and prop.value is not string %} + {% for item in prop.value %}{{ item }}{% if not loop.last %}, {% endif %}{% endfor %} + {% else %} + {{ prop.value }} + {% endif %} +
+
+
+ {% endif %} + +
+
+ +
+ Created at {{formatted_date}} with Data Contract CLI v{{datacontract_cli_version}} +
+ +
+
+ + +
+ +
+
+
+
+ +
+
+
+ + +
+
{{datacontract_yaml}}
+
+
+ +
+
+
+
+
+ + + + +
+
+
+
diff --git a/datacontract/templates/index.html b/datacontract/templates/index.html
index 7621eb00d..fb4a2bb1d 100644
--- a/datacontract/templates/index.html
+++ b/datacontract/templates/index.html
@@ -80,6 +80,7 @@

Filters

data-search="{{ contract.spec.info.title|lower|e }} {{ contract.spec.info.owner|lower|e if contract.spec.info.owner else '' }} {{ + contract.spec.info.version|lower|e if contract.spec.info.version else '' }} {{ contract.spec.info.description|lower|e }} {% for model_name, model in contract.spec.models.items() %} {{ model.description|lower|e }} {% @@ -94,6 +95,7 @@

Filters

{{contract.spec.info.title}}

+

{{ contract.spec.info.version }}

{% if contract.spec.info.owner %}
diff --git a/datacontract/templates/partials/server.html b/datacontract/templates/partials/server.html
index dc49cfc6f..b36096e4b 100644
--- a/datacontract/templates/partials/server.html
+++ b/datacontract/templates/partials/server.html
@@ -195,6 +195,7 @@
{% endif %} + {% if server.model_extra %} {% for key, value in server.model_extra.items() %}
@@ -205,5 +206,6 @@
{% endfor %} + {% endif %} \ No newline at end of file diff --git a/datacontract/templates/style/output.css b/datacontract/templates/style/output.css index 6d1d5c70a..744e9313b 100644 --- a/datacontract/templates/style/output.css +++ b/datacontract/templates/style/output.css @@ -1,113 +1,5 @@ -*, ::before, ::after { - --tw-border-spacing-x: 0; - --tw-border-spacing-y: 0; - --tw-translate-x: 0; - --tw-translate-y: 0; - --tw-rotate: 0; - --tw-skew-x: 0; - --tw-skew-y: 0; - --tw-scale-x: 1; - --tw-scale-y: 1; - --tw-pan-x: ; - --tw-pan-y: ; - --tw-pinch-zoom: ; - --tw-scroll-snap-strictness: proximity; - --tw-gradient-from-position: ; - --tw-gradient-via-position: ; - --tw-gradient-to-position: ; - --tw-ordinal: ; - --tw-slashed-zero: ; - --tw-numeric-figure: ; - --tw-numeric-spacing: ; - --tw-numeric-fraction: ; - --tw-ring-inset: ; - --tw-ring-offset-width: 0px; - --tw-ring-offset-color: #fff; - --tw-ring-color: rgb(59 130 246 / 0.5); - --tw-ring-offset-shadow: 0 0 #0000; - --tw-ring-shadow: 0 0 #0000; - --tw-shadow: 0 0 #0000; - --tw-shadow-colored: 0 0 #0000; - --tw-blur: ; - --tw-brightness: ; - --tw-contrast: ; - --tw-grayscale: ; - --tw-hue-rotate: ; - --tw-invert: ; - --tw-saturate: ; - --tw-sepia: ; - --tw-drop-shadow: ; - --tw-backdrop-blur: ; - --tw-backdrop-brightness: ; - --tw-backdrop-contrast: ; - --tw-backdrop-grayscale: ; - --tw-backdrop-hue-rotate: ; - --tw-backdrop-invert: ; - --tw-backdrop-opacity: ; - --tw-backdrop-saturate: ; - --tw-backdrop-sepia: ; - --tw-contain-size: ; - --tw-contain-layout: ; - --tw-contain-paint: ; - --tw-contain-style: ; -} - -::backdrop { - --tw-border-spacing-x: 0; - --tw-border-spacing-y: 0; - --tw-translate-x: 0; - --tw-translate-y: 0; - --tw-rotate: 0; - --tw-skew-x: 0; - --tw-skew-y: 0; - --tw-scale-x: 1; - --tw-scale-y: 1; - --tw-pan-x: ; - --tw-pan-y: ; - --tw-pinch-zoom: ; - --tw-scroll-snap-strictness: proximity; - --tw-gradient-from-position: ; - --tw-gradient-via-position: ; - --tw-gradient-to-position: ; - --tw-ordinal: ; - --tw-slashed-zero: ; - --tw-numeric-figure: ; - --tw-numeric-spacing: ; - --tw-numeric-fraction: ; - --tw-ring-inset: ; - --tw-ring-offset-width: 0px; - --tw-ring-offset-color: #fff; - --tw-ring-color: rgb(59 130 246 / 0.5); - --tw-ring-offset-shadow: 0 0 #0000; - --tw-ring-shadow: 0 0 #0000; - --tw-shadow: 0 0 #0000; - --tw-shadow-colored: 0 0 #0000; - --tw-blur: ; - --tw-brightness: ; - --tw-contrast: ; - --tw-grayscale: ; - --tw-hue-rotate: ; - --tw-invert: ; - --tw-saturate: ; - --tw-sepia: ; - --tw-drop-shadow: ; - --tw-backdrop-blur: ; - --tw-backdrop-brightness: ; - --tw-backdrop-contrast: ; - --tw-backdrop-grayscale: ; - --tw-backdrop-hue-rotate: ; - --tw-backdrop-invert: ; - --tw-backdrop-opacity: ; - --tw-backdrop-saturate: ; - --tw-backdrop-sepia: ; - --tw-contain-size: ; - --tw-contain-layout: ; - --tw-contain-paint: ; - --tw-contain-style: ; -} - /* -! tailwindcss v3.4.17 | MIT License | https://tailwindcss.com +! 
tailwindcss v3.4.3 | MIT License | https://tailwindcss.com */ /* @@ -550,10 +442,152 @@ video { /* Make elements with the HTML hidden attribute stay hidden by default */ -[hidden]:where(:not([hidden="until-found"])) { +[hidden] { display: none; } +*, ::before, ::after { + --tw-border-spacing-x: 0; + --tw-border-spacing-y: 0; + --tw-translate-x: 0; + --tw-translate-y: 0; + --tw-rotate: 0; + --tw-skew-x: 0; + --tw-skew-y: 0; + --tw-scale-x: 1; + --tw-scale-y: 1; + --tw-pan-x: ; + --tw-pan-y: ; + --tw-pinch-zoom: ; + --tw-scroll-snap-strictness: proximity; + --tw-gradient-from-position: ; + --tw-gradient-via-position: ; + --tw-gradient-to-position: ; + --tw-ordinal: ; + --tw-slashed-zero: ; + --tw-numeric-figure: ; + --tw-numeric-spacing: ; + --tw-numeric-fraction: ; + --tw-ring-inset: ; + --tw-ring-offset-width: 0px; + --tw-ring-offset-color: #fff; + --tw-ring-color: rgb(59 130 246 / 0.5); + --tw-ring-offset-shadow: 0 0 #0000; + --tw-ring-shadow: 0 0 #0000; + --tw-shadow: 0 0 #0000; + --tw-shadow-colored: 0 0 #0000; + --tw-blur: ; + --tw-brightness: ; + --tw-contrast: ; + --tw-grayscale: ; + --tw-hue-rotate: ; + --tw-invert: ; + --tw-saturate: ; + --tw-sepia: ; + --tw-drop-shadow: ; + --tw-backdrop-blur: ; + --tw-backdrop-brightness: ; + --tw-backdrop-contrast: ; + --tw-backdrop-grayscale: ; + --tw-backdrop-hue-rotate: ; + --tw-backdrop-invert: ; + --tw-backdrop-opacity: ; + --tw-backdrop-saturate: ; + --tw-backdrop-sepia: ; + --tw-contain-size: ; + --tw-contain-layout: ; + --tw-contain-paint: ; + --tw-contain-style: ; +} + +::backdrop { + --tw-border-spacing-x: 0; + --tw-border-spacing-y: 0; + --tw-translate-x: 0; + --tw-translate-y: 0; + --tw-rotate: 0; + --tw-skew-x: 0; + --tw-skew-y: 0; + --tw-scale-x: 1; + --tw-scale-y: 1; + --tw-pan-x: ; + --tw-pan-y: ; + --tw-pinch-zoom: ; + --tw-scroll-snap-strictness: proximity; + --tw-gradient-from-position: ; + --tw-gradient-via-position: ; + --tw-gradient-to-position: ; + --tw-ordinal: ; + --tw-slashed-zero: ; + --tw-numeric-figure: ; + --tw-numeric-spacing: ; + --tw-numeric-fraction: ; + --tw-ring-inset: ; + --tw-ring-offset-width: 0px; + --tw-ring-offset-color: #fff; + --tw-ring-color: rgb(59 130 246 / 0.5); + --tw-ring-offset-shadow: 0 0 #0000; + --tw-ring-shadow: 0 0 #0000; + --tw-shadow: 0 0 #0000; + --tw-shadow-colored: 0 0 #0000; + --tw-blur: ; + --tw-brightness: ; + --tw-contrast: ; + --tw-grayscale: ; + --tw-hue-rotate: ; + --tw-invert: ; + --tw-saturate: ; + --tw-sepia: ; + --tw-drop-shadow: ; + --tw-backdrop-blur: ; + --tw-backdrop-brightness: ; + --tw-backdrop-contrast: ; + --tw-backdrop-grayscale: ; + --tw-backdrop-hue-rotate: ; + --tw-backdrop-invert: ; + --tw-backdrop-opacity: ; + --tw-backdrop-saturate: ; + --tw-backdrop-sepia: ; + --tw-contain-size: ; + --tw-contain-layout: ; + --tw-contain-paint: ; + --tw-contain-style: ; +} + +.container { + width: 100%; +} + +@media (min-width: 640px) { + .container { + max-width: 640px; + } +} + +@media (min-width: 768px) { + .container { + max-width: 768px; + } +} + +@media (min-width: 1024px) { + .container { + max-width: 1024px; + } +} + +@media (min-width: 1280px) { + .container { + max-width: 1280px; + } +} + +@media (min-width: 1536px) { + .container { + max-width: 1536px; + } +} + .sr-only { position: absolute; width: 1px; @@ -635,12 +669,12 @@ video { margin-bottom: 0.5rem; } -.-ml-0\.5 { - margin-left: -0.125rem; +.-ml-0 { + margin-left: -0px; } -.mb-2 { - margin-bottom: 0.5rem; +.-ml-0\.5 { + margin-left: -0.125rem; } .mb-3 { @@ -651,6 +685,10 @@ video { margin-bottom: 1.5rem; } 
+.ml-3 { + margin-left: 0.75rem; +} + .mr-1 { margin-right: 0.25rem; } @@ -798,6 +836,11 @@ video { width: 2rem; } +.w-fit { + width: -moz-fit-content; + width: fit-content; +} + .w-full { width: 100%; } @@ -900,6 +943,12 @@ video { margin-left: calc(0.75rem * calc(1 - var(--tw-space-x-reverse))); } +.space-x-4 > :not([hidden]) ~ :not([hidden]) { + --tw-space-x-reverse: 0; + margin-right: calc(1rem * var(--tw-space-x-reverse)); + margin-left: calc(1rem * calc(1 - var(--tw-space-x-reverse))); +} + .space-x-6 > :not([hidden]) ~ :not([hidden]) { --tw-space-x-reverse: 0; margin-right: calc(1.5rem * var(--tw-space-x-reverse)); @@ -912,6 +961,18 @@ video { margin-left: calc(2rem * calc(1 - var(--tw-space-x-reverse))); } +.space-y-1 > :not([hidden]) ~ :not([hidden]) { + --tw-space-y-reverse: 0; + margin-top: calc(0.25rem * calc(1 - var(--tw-space-y-reverse))); + margin-bottom: calc(0.25rem * var(--tw-space-y-reverse)); +} + +.space-y-4 > :not([hidden]) ~ :not([hidden]) { + --tw-space-y-reverse: 0; + margin-top: calc(1rem * calc(1 - var(--tw-space-y-reverse))); + margin-bottom: calc(1rem * var(--tw-space-y-reverse)); +} + .space-y-6 > :not([hidden]) ~ :not([hidden]) { --tw-space-y-reverse: 0; margin-top: calc(1.5rem * calc(1 - var(--tw-space-y-reverse))); @@ -926,17 +987,17 @@ video { .divide-gray-100 > :not([hidden]) ~ :not([hidden]) { --tw-divide-opacity: 1; - border-color: rgb(243 244 246 / var(--tw-divide-opacity, 1)); + border-color: rgb(243 244 246 / var(--tw-divide-opacity)); } .divide-gray-200 > :not([hidden]) ~ :not([hidden]) { --tw-divide-opacity: 1; - border-color: rgb(229 231 235 / var(--tw-divide-opacity, 1)); + border-color: rgb(229 231 235 / var(--tw-divide-opacity)); } .divide-gray-300 > :not([hidden]) ~ :not([hidden]) { --tw-divide-opacity: 1; - border-color: rgb(209 213 219 / var(--tw-divide-opacity, 1)); + border-color: rgb(209 213 219 / var(--tw-divide-opacity)); } .overflow-hidden { @@ -983,42 +1044,57 @@ video { .bg-blue-50 { --tw-bg-opacity: 1; - background-color: rgb(239 246 255 / var(--tw-bg-opacity, 1)); + background-color: rgb(239 246 255 / var(--tw-bg-opacity)); } .bg-gray-100 { --tw-bg-opacity: 1; - background-color: rgb(243 244 246 / var(--tw-bg-opacity, 1)); + background-color: rgb(243 244 246 / var(--tw-bg-opacity)); } .bg-gray-50 { --tw-bg-opacity: 1; - background-color: rgb(249 250 251 / var(--tw-bg-opacity, 1)); + background-color: rgb(249 250 251 / var(--tw-bg-opacity)); } .bg-gray-500 { --tw-bg-opacity: 1; - background-color: rgb(107 114 128 / var(--tw-bg-opacity, 1)); + background-color: rgb(107 114 128 / var(--tw-bg-opacity)); +} + +.bg-green-50 { + --tw-bg-opacity: 1; + background-color: rgb(240 253 244 / var(--tw-bg-opacity)); } .bg-indigo-100 { --tw-bg-opacity: 1; - background-color: rgb(224 231 255 / var(--tw-bg-opacity, 1)); + background-color: rgb(224 231 255 / var(--tw-bg-opacity)); } .bg-indigo-600 { --tw-bg-opacity: 1; - background-color: rgb(79 70 229 / var(--tw-bg-opacity, 1)); + background-color: rgb(79 70 229 / var(--tw-bg-opacity)); +} + +.bg-purple-50 { + --tw-bg-opacity: 1; + background-color: rgb(250 245 255 / var(--tw-bg-opacity)); +} + +.bg-red-50 { + --tw-bg-opacity: 1; + background-color: rgb(254 242 242 / var(--tw-bg-opacity)); } .bg-white { --tw-bg-opacity: 1; - background-color: rgb(255 255 255 / var(--tw-bg-opacity, 1)); + background-color: rgb(255 255 255 / var(--tw-bg-opacity)); } .bg-yellow-50 { --tw-bg-opacity: 1; - background-color: rgb(254 252 232 / var(--tw-bg-opacity, 1)); + background-color: rgb(254 252 232 / 
var(--tw-bg-opacity)); } .bg-opacity-75 { @@ -1072,6 +1148,11 @@ video { padding-right: 1.5rem; } +.py-0 { + padding-top: 0px; + padding-bottom: 0px; +} + .py-0\.5 { padding-top: 0.125rem; padding-bottom: 0.125rem; @@ -1092,6 +1173,16 @@ video { padding-bottom: 0.5rem; } +.py-3 { + padding-top: 0.75rem; + padding-bottom: 0.75rem; +} + +.py-4 { + padding-top: 1rem; + padding-bottom: 1rem; +} + .py-5 { padding-top: 1.25rem; padding-bottom: 1.25rem; @@ -1190,12 +1281,16 @@ video { font-weight: 500; } +.font-normal { + font-weight: 400; +} + .font-semibold { font-weight: 600; } -.font-normal { - font-weight: 400; +.uppercase { + text-transform: uppercase; } .italic { @@ -1214,49 +1309,93 @@ video { line-height: 1.75rem; } +.tracking-wider { + letter-spacing: 0.05em; +} + .text-blue-600 { --tw-text-opacity: 1; - color: rgb(37 99 235 / var(--tw-text-opacity, 1)); + color: rgb(37 99 235 / var(--tw-text-opacity)); +} + +.text-blue-700 { + --tw-text-opacity: 1; + color: rgb(29 78 216 / var(--tw-text-opacity)); +} + +.text-blue-800 { + --tw-text-opacity: 1; + color: rgb(30 64 175 / var(--tw-text-opacity)); } .text-gray-400 { --tw-text-opacity: 1; - color: rgb(156 163 175 / var(--tw-text-opacity, 1)); + color: rgb(156 163 175 / var(--tw-text-opacity)); } .text-gray-500 { --tw-text-opacity: 1; - color: rgb(107 114 128 / var(--tw-text-opacity, 1)); + color: rgb(107 114 128 / var(--tw-text-opacity)); } .text-gray-600 { --tw-text-opacity: 1; - color: rgb(75 85 99 / var(--tw-text-opacity, 1)); + color: rgb(75 85 99 / var(--tw-text-opacity)); +} + +.text-gray-700 { + --tw-text-opacity: 1; + color: rgb(55 65 81 / var(--tw-text-opacity)); } .text-gray-800 { --tw-text-opacity: 1; - color: rgb(31 41 55 / var(--tw-text-opacity, 1)); + color: rgb(31 41 55 / var(--tw-text-opacity)); } .text-gray-900 { --tw-text-opacity: 1; - color: rgb(17 24 39 / var(--tw-text-opacity, 1)); + color: rgb(17 24 39 / var(--tw-text-opacity)); +} + +.text-green-600 { + --tw-text-opacity: 1; + color: rgb(22 163 74 / var(--tw-text-opacity)); +} + +.text-green-700 { + --tw-text-opacity: 1; + color: rgb(21 128 61 / var(--tw-text-opacity)); +} + +.text-purple-800 { + --tw-text-opacity: 1; + color: rgb(107 33 168 / var(--tw-text-opacity)); +} + +.text-red-800 { + --tw-text-opacity: 1; + color: rgb(153 27 27 / var(--tw-text-opacity)); } .text-sky-500 { --tw-text-opacity: 1; - color: rgb(14 165 233 / var(--tw-text-opacity, 1)); + color: rgb(14 165 233 / var(--tw-text-opacity)); } .text-white { --tw-text-opacity: 1; - color: rgb(255 255 255 / var(--tw-text-opacity, 1)); + color: rgb(255 255 255 / var(--tw-text-opacity)); } .text-yellow-600 { --tw-text-opacity: 1; - color: rgb(202 138 4 / var(--tw-text-opacity, 1)); + color: rgb(202 138 4 / var(--tw-text-opacity)); +} + +.text-yellow-800 { + --tw-text-opacity: 1; + color: rgb(133 77 14 / var(--tw-text-opacity)); } .shadow { @@ -1289,30 +1428,54 @@ video { .ring-black { --tw-ring-opacity: 1; - --tw-ring-color: rgb(0 0 0 / var(--tw-ring-opacity, 1)); + --tw-ring-color: rgb(0 0 0 / var(--tw-ring-opacity)); } .ring-blue-500\/10 { --tw-ring-color: rgb(59 130 246 / 0.1); } +.ring-blue-600\/20 { + --tw-ring-color: rgb(37 99 235 / 0.2); +} + .ring-gray-300 { --tw-ring-opacity: 1; - --tw-ring-color: rgb(209 213 219 / var(--tw-ring-opacity, 1)); + --tw-ring-color: rgb(209 213 219 / var(--tw-ring-opacity)); } .ring-gray-500\/10 { --tw-ring-color: rgb(107 114 128 / 0.1); } +.ring-gray-600\/20 { + --tw-ring-color: rgb(75 85 99 / 0.2); +} + .ring-gray-900\/5 { --tw-ring-color: rgb(17 24 39 / 0.05); } 
+.ring-green-600\/20 { + --tw-ring-color: rgb(22 163 74 / 0.2); +} + +.ring-purple-600\/20 { + --tw-ring-color: rgb(147 51 234 / 0.2); +} + +.ring-red-600\/20 { + --tw-ring-color: rgb(220 38 38 / 0.2); +} + .ring-yellow-500\/10 { --tw-ring-color: rgb(234 179 8 / 0.1); } +.ring-yellow-600\/20 { + --tw-ring-color: rgb(202 138 4 / 0.2); +} + .ring-opacity-5 { --tw-ring-opacity: 0.05; } @@ -1331,32 +1494,37 @@ video { .placeholder\:text-gray-400::-moz-placeholder { --tw-text-opacity: 1; - color: rgb(156 163 175 / var(--tw-text-opacity, 1)); + color: rgb(156 163 175 / var(--tw-text-opacity)); } .placeholder\:text-gray-400::placeholder { --tw-text-opacity: 1; - color: rgb(156 163 175 / var(--tw-text-opacity, 1)); + color: rgb(156 163 175 / var(--tw-text-opacity)); } .hover\:bg-gray-50:hover { --tw-bg-opacity: 1; - background-color: rgb(249 250 251 / var(--tw-bg-opacity, 1)); + background-color: rgb(249 250 251 / var(--tw-bg-opacity)); } .hover\:bg-indigo-500:hover { --tw-bg-opacity: 1; - background-color: rgb(99 102 241 / var(--tw-bg-opacity, 1)); + background-color: rgb(99 102 241 / var(--tw-bg-opacity)); } .hover\:text-gray-500:hover { --tw-text-opacity: 1; - color: rgb(107 114 128 / var(--tw-text-opacity, 1)); + color: rgb(107 114 128 / var(--tw-text-opacity)); } .hover\:text-gray-700:hover { --tw-text-opacity: 1; - color: rgb(55 65 81 / var(--tw-text-opacity, 1)); + color: rgb(55 65 81 / var(--tw-text-opacity)); +} + +.hover\:text-sky-700:hover { + --tw-text-opacity: 1; + color: rgb(3 105 161 / var(--tw-text-opacity)); } .focus\:ring-2:focus { @@ -1371,7 +1539,7 @@ video { .focus\:ring-indigo-600:focus { --tw-ring-opacity: 1; - --tw-ring-color: rgb(79 70 229 / var(--tw-ring-opacity, 1)); + --tw-ring-color: rgb(79 70 229 / var(--tw-ring-opacity)); } .focus-visible\:outline:focus-visible { @@ -1457,6 +1625,12 @@ video { align-items: center; } + .sm\:space-x-4 > :not([hidden]) ~ :not([hidden]) { + --tw-space-x-reverse: 0; + margin-right: calc(1rem * var(--tw-space-x-reverse)); + margin-left: calc(1rem * calc(1 - var(--tw-space-x-reverse))); + } + .sm\:space-x-6 > :not([hidden]) ~ :not([hidden]) { --tw-space-x-reverse: 0; margin-right: calc(1.5rem * var(--tw-space-x-reverse)); diff --git a/datacontract/templates/style/tailwind.config.js b/datacontract/templates/style/tailwind.config.js index 9e5c25588..2fbf6773a 100644 --- a/datacontract/templates/style/tailwind.config.js +++ b/datacontract/templates/style/tailwind.config.js @@ -1,6 +1,7 @@ module.exports = { content: [ "../datacontract.html", + "../datacontract_odcs.html", "../index.html", "../partials/model_field.html", "../partials/server.html", diff --git a/pyproject.toml b/pyproject.toml index 20ca1b69d..32e2e909b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,8 @@ [project] name = "datacontract-cli" -version = "0.10.23" +version = "0.10.28" description = "The datacontract CLI is an open source command-line tool for working with Data Contracts. It uses data contract YAML files to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library." 
+license = "MIT" readme = "README.md" authors = [ { name = "Jochen Christ", email = "jochen.christ@innoq.com" }, @@ -10,29 +11,30 @@ authors = [ ] classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] requires-python = ">=3.10" dependencies = [ "typer>=0.15.1,<0.16", - "pydantic>=2.8.2,<2.11.0", + "pydantic>=2.8.2,<2.12.0", "pyyaml~=6.0.1", "requests>=2.31,<2.33", "fastjsonschema>=2.19.1,<2.22.0", "fastparquet>=2024.5.0,<2025.0.0", "numpy>=1.26.4,<2.0.0", # transitive dependency, needs to be <2.0.0 https://github.com/datacontract/datacontract-cli/issues/575 - "python-multipart==0.0.20", - "rich>=13.7,<13.10", + "python-multipart>=0.0.20,<1.0.0", + "rich>=13.7,<15.0", "sqlglot>=26.6.0,<27.0.0", "duckdb>=1.0.0,<2.0.0", - "soda-core-duckdb>=3.3.20,<3.5.0", + "soda-core-duckdb>=3.3.20,<3.6.0", # remove setuptools when https://github.com/sodadata/soda-core/issues/2091 is resolved "setuptools>=60", - "python-dotenv~=1.0.0", - "boto3>=1.34.41,<1.37.23", - "Jinja2>=3.1.5", - "jinja_partials >= 0.2.1", + "python-dotenv>=1.0.0,<2.0.0", + "boto3>=1.34.41,<2.0.0", + "Jinja2>=3.1.5,<4.0.0", + "jinja_partials>=0.2.1,<1.0.0", + "datacontract-specification>=1.1.1,<2.0.0", + "open-data-contract-standard>=3.0.4,<4.0.0", ] [project.optional-dependencies] @@ -46,15 +48,20 @@ bigquery = [ ] csv = [ - "clevercsv >= 0.8.2", "pandas >= 2.0.0", ] +excel = [ + "openpyxl>=3.1.5,<4.0.0", +] + + databricks = [ - "soda-core-spark-df>=3.3.20,<3.5.0", - "soda-core-spark[databricks]>=3.3.20,<3.5.0", - "databricks-sql-connector>=3.7.0,<3.8.0", - "databricks-sdk<0.50.0", + "soda-core-spark-df>=3.3.20,<3.6.0", + "soda-core-spark[databricks]>=3.3.20,<3.6.0", + "databricks-sql-connector>=3.7.0,<4.1.0", + "databricks-sdk<0.55.0", + "pyspark==3.5.5", ] iceberg = [ @@ -63,29 +70,30 @@ iceberg = [ kafka = [ "datacontract-cli[avro]", - "soda-core-spark-df>=3.3.20,<3.5.0" + "soda-core-spark-df>=3.3.20,<3.6.0", + "pyspark==3.5.5", ] postgres = [ - "soda-core-postgres>=3.3.20,<3.5.0" + "soda-core-postgres>=3.3.20,<3.6.0" ] s3 = [ - "s3fs==2025.2.0", - "aiobotocore>=2.17.0,<2.20.0", + "s3fs>=2025.2.0,<2026.0.0", + "aiobotocore>=2.17.0,<2.23.0", ] snowflake = [ "snowflake-connector-python[pandas]>=3.6,<3.15", - "soda-core-snowflake>=3.3.20,<3.5.0" + "soda-core-snowflake>=3.3.20,<3.6.0" ] sqlserver = [ - "soda-core-sqlserver>=3.3.20,<3.5.0" + "soda-core-sqlserver>=3.3.20,<3.6.0" ] trino = [ - "soda-core-trino>=3.3.20,<3.5.0" + "soda-core-trino>=3.3.20,<3.6.0" ] dbt = [ @@ -106,7 +114,7 @@ rdf = [ api = [ "fastapi==0.115.12", - "uvicorn==0.34.0", + "uvicorn==0.34.2", ] protobuf = [ @@ -114,21 +122,22 @@ protobuf = [ ] all = [ - "datacontract-cli[kafka,bigquery,csv,snowflake,postgres,databricks,sqlserver,s3,trino,dbt,dbml,iceberg,parquet,rdf,api,protobuf]" + "datacontract-cli[kafka,bigquery,csv,excel,snowflake,postgres,databricks,sqlserver,s3,trino,dbt,dbml,iceberg,parquet,rdf,api,protobuf]" ] +# for development, we pin all libraries to an exact version dev = [ "datacontract-cli[all]", "httpx==0.28.1", "kafka-python", - "moto==5.1.1", + "moto==5.1.5", "pandas>=2.1.0", - "pre-commit>=3.7.1,<4.2.0", + "pre-commit>=3.7.1,<4.3.0", "pytest", "pytest-xdist", - "pymssql==2.3.2", + "pymssql==2.3.4", "ruff", - "testcontainers[minio,postgres,kafka,mssql]==4.9.2", + "testcontainers[minio,postgres,kafka,mssql]==4.10.0", "trino==0.333.0", ] @@ -156,3 +165,6 @@ line-length = 120 extend-select = [ "I", # re-order imports in alphabetic order ] + 
+[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401", "F403"] diff --git a/release b/release index 7c3ed60f1..1ca2ba651 100755 --- a/release +++ b/release @@ -1,8 +1,14 @@ #!/bin/bash set -e +# Release steps: +# 1. Update release version in pyproject.toml +# 2. Update CHANGELOG.md header +# 3. Run ./release +# 4. Update release notes in Github + # pip install toml-cli -VERSION=$(toml get --toml-path pyproject.toml project.version) +VERSION=$(uvx --from toml-cli toml get --toml-path pyproject.toml project.version) TAG_VERSION=v$VERSION echo "Checking that everything is committed" diff --git a/tests/fixtures/bigquery/export/bq_table_schema.json b/tests/fixtures/bigquery/export/bq_table_schema.json index c0d9df5c1..e4a33567c 100644 --- a/tests/fixtures/bigquery/export/bq_table_schema.json +++ b/tests/fixtures/bigquery/export/bq_table_schema.json @@ -106,7 +106,7 @@ }, { "name": "timestamp_ntz_field", - "type": "TIME", + "type": "DATETIME", "mode": "NULLABLE", "description": "a simple timestamp_ntz field" }, diff --git a/tests/fixtures/csv/data/sample_data_5_column.csv b/tests/fixtures/csv/data/sample_data_5_column.csv new file mode 100644 index 000000000..7603ba5a7 --- /dev/null +++ b/tests/fixtures/csv/data/sample_data_5_column.csv @@ -0,0 +1,11 @@ +field_one,field_two,field_three,field_four,field_five,field_six +CX-263-DU,50,2023-06-16 13:12:56,,true,test1@gmail.com +IK-894-MN,47,2023-10-08 22:40:57,,true,test1@gmail.com +ER-399-JY,22,2023-05-16 01:08:22,,true,test1@gmail.com +MT-939-FH,47,2023-03-15 05:15:21,,false,test1@gmail.com +LV-849-MI,50,2023-09-08 20:08:43,,false,test1@gmail.com +VS-079-OH,22,2023-04-15 00:50:32,,false,test1@gmail.com +DN-297-XY,50,2023-11-08 12:55:42,,false,test1@gmail.com +ZE-172-FP,14,,,true,test1@gmail.com +ID-840-EG,89,2023-10-02 17:17:58,,true, +FK-230-KZ,64,2023-11-27 15:21:48,,true,test1@gmail.com diff --git a/tests/fixtures/databricks-unity/import/datacontract.yaml b/tests/fixtures/databricks-unity/import/datacontract.yaml index efa2039a5..d3e98c5f7 100644 --- a/tests/fixtures/databricks-unity/import/datacontract.yaml +++ b/tests/fixtures/databricks-unity/import/datacontract.yaml @@ -10,24 +10,42 @@ models: title: test_table fields: id: - type: integer + type: int required: true + config: + databricksType: int name: - type: varchar + type: string required: false + config: + databricksType: varchar(255) age: - type: integer + type: int required: false + config: + databricksType: smallint salary: type: decimal required: false + config: + databricksType: decimal(10,2) join_date: type: date required: false + config: + databricksType: date updated_at: - type: timestamp + type: timestamp_ntz required: false + config: + databricksType: timestamp is_active: type: boolean required: false - + config: + databricksType: boolean +servers: + myserver: + type: databricks + catalog: mycatalog + schema: myschema diff --git a/tests/fixtures/databricks-unity/import/unity_table_schema.json b/tests/fixtures/databricks-unity/import/unity_table_schema.json index bb330c425..1fb55c687 100644 --- a/tests/fixtures/databricks-unity/import/unity_table_schema.json +++ b/tests/fixtures/databricks-unity/import/unity_table_schema.json @@ -1,7 +1,7 @@ { "name": "test_table", - "catalog_name": "string", - "schema_name": "string", + "catalog_name": "mycatalog", + "schema_name": "myschema", "table_type": "MANAGED", "data_source_format": "DELTA", "columns": [ diff --git a/tests/fixtures/excel/shipments-odcs.xlsx b/tests/fixtures/excel/shipments-odcs.xlsx new file mode 100644 index 
000000000..e3a943518 Binary files /dev/null and b/tests/fixtures/excel/shipments-odcs.xlsx differ diff --git a/tests/fixtures/excel/shipments-odcs.yaml b/tests/fixtures/excel/shipments-odcs.yaml new file mode 100644 index 000000000..251c40a34 --- /dev/null +++ b/tests/fixtures/excel/shipments-odcs.yaml @@ -0,0 +1,176 @@ +apiVersion: v3.0.0 +kind: DataContract +id: fulfillment_shipments_v1 +name: Shipments +version: 1.0.0 +domain: sales-team +status: draft +tenant: company-A +tags: + - datalocation:EU +description: + purpose: This data can be used for analytical purposes + usage: Use this to analyze shipments + limitations: Not suitable for real-time use cases +customProperties: + - property: owner + value: controlling-team + - property: additionalField + value: some value +schema: + - name: shipments + logicalType: object + physicalType: table + physicalName: shipments_v1 + businessName: Shipments + description: "This table contains shipment data, including details about shipment IDs, associated orders, delivery dates, carriers, tracking numbers, statuses, and additional shipment information in JSON format." + dataGranularityDescription: Not Aggregated + tags: + - pii + properties: + - name: shipment_id + businessName: Shipment ID + physicalName: sid + logicalType: string + description: Unique identifier for each shipment. + physicalType: uuid + primaryKey: true + classification: internal + examples: + - 123e4567-e89b-12d3-a456-426614174000 + tags: + - businesskey + authoritativeDefinitions: + - type: businessDefinition + url: https://example.com/definitions/shipment_id + - name: order_id + businessName: Order ID + physicalName: oid + logicalType: string + logicalTypeOptions: + minLength: 8 + maxLength: 10 + description: Identifier for the order associated with the shipment. + required: true + physicalType: text + primaryKey: false + classification: internal + examples: + - ORD12345 + - name: delivery_date + businessName: Delivery Date + logicalType: date + description: The actual or expected delivery date of the shipment. + physicalType: timestamp_tz + primaryKey: false + classification: internal + examples: + - "2023-10-01T10:00:00Z" + - name: carrier + businessName: Carrier + logicalType: string + description: The shipping carrier used for the delivery. + physicalType: text + primaryKey: false + classification: internal + examples: + - FedEx + - UPS + - name: tracking_number + businessName: Tracking Number + logicalType: string + description: Tracking number provided by the carrier. + physicalType: text + primaryKey: false + classification: restricted + examples: + - 1Z999AA10123456784 + - name: status + businessName: Status + logicalType: string + description: Current status of the shipment. + physicalType: text + primaryKey: false + classification: internal + examples: + - Delivered + - In Transit + - name: inline_object_definition + businessName: Inline Object Definition + logicalType: object + description: A JSON representation of additional shipment info + physicalType: json + primaryKey: false + partitionKeyPosition: -1 + classification: internal + examples: + - "{\"destination\": \"New York\"}" + quality: + - type: text + description: "{field} must contain the field \"destination\"" + - name: address + businessName: Shipment Address + logicalType: object + description: Shipping address details. + required: true + physicalType: JSON + classification: restricted + properties: + - name: street + businessName: Street + logicalType: string + description: Street address. 
+ required: true + physicalType: text + primaryKey: false + classification: restricted + examples: + - Marienplatz 1 + - name: city + businessName: City + logicalType: string + description: City of the shipping address. + required: true + physicalType: text + primaryKey: false + classification: restricted + examples: + - Munich + - name: country + businessName: Country + logicalType: string + description: Country of the shipping address. + required: true + physicalType: text + primaryKey: false + classification: restricted + examples: + - DE +servers: + - server: production + type: bigquery + environment: production + dataset: shipments_v1 + project: acme_shipments_prod +support: + - channel: slackname + url: http://find.me.here + tool: slack + scope: interactive +price: + priceAmount: 1.0 + priceCurrency: EUR + priceUnit: Per 1000 requests +team: + - username: vimportant + role: administrator + dateIn: "2020-01-01" + - username: nimportant + role: reader + dateIn: "2020-01-01" + dateOut: "2024-10-10" +slaProperties: + - property: availability + value: 95% + unit: '%' + driver: operational diff --git a/tests/fixtures/export/datacontract_no_model_type.yaml b/tests/fixtures/export/datacontract_no_model_type.yaml new file mode 100644 index 000000000..cd652b929 --- /dev/null +++ b/tests/fixtures/export/datacontract_no_model_type.yaml @@ -0,0 +1,91 @@ +dataContractSpecification: 1.1.0 +id: orders-unit-test +info: + title: Orders Unit Test + version: 1.0.0 + status: active + owner: checkout + description: The orders data contract + contact: + email: team-orders@example.com + url: https://wiki.example.com/teams/checkout + otherField: otherValue +terms: + usage: This data contract serves to demo datacontract CLI export. + limitations: Not intended to use in production + billing: free + noticePeriod: P3M +servers: + production: + type: snowflake + environment: production + account: my-account + database: my-database + schema: my-schema + roles: + - name: analyst_us + description: Access to the data for US region +models: + orders: + title: Webshop Orders + description: The orders model + primaryKey: + - order_id + - order_status + customModelProperty1: customModelProperty1Value + fields: + order_id: + title: Order ID + type: varchar + unique: true + required: true + minLength: 8 + maxLength: 10 + pii: true + classification: sensitive + tags: + - order_id + pattern: ^B[0-9]+$ + examples: + - B12345678 + - B12345679 + customFieldProperty1: customFieldProperty1Value + order_total: + type: bigint + required: true + description: The order_total field + minimum: 0 + maximum: 1000000 + quality: + - type: sql + description: 95% of all order total values are expected to be between 10 and 499 EUR. + query: | + SELECT quantile_cont(order_total, 0.95) AS percentile_95 + FROM orders + mustBeBetween: [1000, 49900] + order_status: + type: text + required: true + enum: + - pending + - shipped + - delivered + quality: + - type: sql + description: Row Count + query: | + SELECT COUNT(*) AS row_count + FROM orders + mustBeGreaterThan: 1000 +definitions: + customer_id: + title: Customer ID + type: string + format: uuid + description: Unique identifier for the customer. 
+ examples: + - acbd1a47-9dca-4cb8-893e-87aa0aa0f243 + - 5742637f-bb8b-4f0c-8ed1-afb1a91300a9 + tags: + - features + - pii \ No newline at end of file diff --git a/tests/fixtures/great-expectations/datacontract_quality_column.yaml b/tests/fixtures/great-expectations/datacontract_quality_column.yaml new file mode 100644 index 000000000..dcd59ff07 --- /dev/null +++ b/tests/fixtures/great-expectations/datacontract_quality_column.yaml @@ -0,0 +1,35 @@ +dataContractSpecification: 1.1.0 +id: my-data-contract-id +info: + title: Orders Unit Test + version: 1.1.1 + owner: checkout + description: The orders data contract + contact: + email: team-orders@example.com + url: https://wiki.example.com/teams/checkout +models: + orders: + description: test + fields: + id: + description: Unique identifier for each alert. + type: string + required: true + primaryKey: true + unique: true + type: + description: The type of alert that has fired. + type: string + required: true + enum: [ "A", "B", "C", "D", "E" ] + quality: + - type: custom + engine: great-expectations + description: "Accepted Values for type" + implementation: + expectation_type: expect_column_value_lengths_to_equal + kwargs: + value: 1 + meta: + notes: "Ensures that column length is 1." \ No newline at end of file diff --git a/tests/fixtures/great-expectations/datacontract_quality_yaml.yaml b/tests/fixtures/great-expectations/datacontract_quality_yaml.yaml new file mode 100644 index 000000000..a122019c5 --- /dev/null +++ b/tests/fixtures/great-expectations/datacontract_quality_yaml.yaml @@ -0,0 +1,26 @@ +dataContractSpecification: 0.9.1 +id: my-data-contract-id + +info: + title: Orders Unit Test + version: 1.0.0 + owner: checkout + description: The orders data contract + contact: + email: team-orders@example.com + url: https://wiki.example.com/teams/checkout +models: + orders: + description: test + fields: + order_id: + type: string + required: true + quality: + - type: custom + engine: great-expectations + implementation: + expectation_type: expect_table_row_count_to_be_between + kwargs: + min_value: 10 + meta: {} diff --git a/tests/fixtures/local-json/data/nested_types.json b/tests/fixtures/local-json/data/nested_types.json new file mode 100644 index 000000000..2354407ea --- /dev/null +++ b/tests/fixtures/local-json/data/nested_types.json @@ -0,0 +1,28 @@ +[ + { + "id": 1, + "tags": [ + { + "foo": "bar", + "arr": [ 1, 2, 3 ] + }, + { + "foo": "lap", + "arr": [ 4 ] + } + ], + "name": { + "first": "John", + "last": "Doe" + } + }, + { + "id": 2, + "tags": [ + { + "foo": "zap", + "arr": [ ] + } + ] + } +] \ No newline at end of file diff --git a/tests/fixtures/odcs_v3/adventureworks.datacontract.yml b/tests/fixtures/odcs_v3/adventureworks.datacontract.yml index fbe37277e..c9d9d86ef 100644 --- a/tests/fixtures/odcs_v3/adventureworks.datacontract.yml +++ b/tests/fixtures/odcs_v3/adventureworks.datacontract.yml @@ -4407,3 +4407,81 @@ models: criticalDataElement: false partitioned: false physicalType: timestamp + StoreHolidayHours: + title: StoreHolidayHours + type: array + required: false + primaryKey: false + items: + type: object + fields: + Date: + title: Date + type: date + required: false + primaryKey: false + examples: + - '2024-08-13' + config: + criticalDataElement: false + partitioned: false + physicalType: string + Close: + title: Close + type: date + required: false + primaryKey: false + examples: + - 02:00 PM + config: + criticalDataElement: false + partitioned: false + physicalType: string + Open: + title: Open + type: date + required: 
false + primaryKey: false + examples: + - 10:00 AM + config: + criticalDataElement: false + partitioned: false + physicalType: string + config: + criticalDataElement: false + partitioned: false + physicalType: array + extendedData: + title: extendedData + type: object + required: true + primaryKey: false + fields: + pharmacyUUID: + title: pharmacyUUID + type: string + required: true + primaryKey: false + unique: true + examples: + - ec43dd63-c258-4506-8965-88a9e0c130ad + config: + criticalDataElement: false + partitioned: false + physicalType: string + config: + criticalDataElement: false + partitioned: false + physicalType: object + ArrayComments: + title: ArrayComments + type: array + required: false + primaryKey: false + items: + type: string + config: + criticalDataElement: false + partitioned: false + physicalType: array \ No newline at end of file diff --git a/tests/fixtures/odcs_v3/adventureworks.odcs.yaml b/tests/fixtures/odcs_v3/adventureworks.odcs.yaml index c5e2a42dc..b1fed2cbc 100644 --- a/tests/fixtures/odcs_v3/adventureworks.odcs.yaml +++ b/tests/fixtures/odcs_v3/adventureworks.odcs.yaml @@ -5320,4 +5320,54 @@ schema: criticalDataElement: false primaryKey: false required: false + - name: StoreHolidayHours + businessName: StoreHolidayHours + logicalType: array + physicalType: array + required: false + unique: false + items: + logicalType: object + properties: + - name: Date + businessName: Date + logicalType: date + physicalType: string + examples: + - "2024-08-13" + - name: Close + businessName: Close + logicalType: date + physicalType: string + examples: + - "02:00 PM" + - name: Open + businessName: Open + logicalType: date + physicalType: string + examples: + - "10:00 AM" + - name: extendedData + businessName: extendedData + logicalType: object + physicalType: object + required: true + unique: false + properties: + - name : pharmacyUUID + businessName: pharmacyUUID + logicalType: string + physicalType: string + required: true + unique: true + examples: + - "ec43dd63-c258-4506-8965-88a9e0c130ad" + - name: ArrayComments + businessName: ArrayComments + logicalType: array + physicalType: array + required: false + unique: false + items: + logicalType: string contractCreatedTs: "2023-09-28T20:24:49.331+00:00" diff --git a/tests/fixtures/spark/import/users_datacontract_desc.yml b/tests/fixtures/spark/import/users_datacontract_desc.yml new file mode 100644 index 000000000..c93d614d7 --- /dev/null +++ b/tests/fixtures/spark/import/users_datacontract_desc.yml @@ -0,0 +1,59 @@ +dataContractSpecification: 1.1.0 +id: my-data-contract-id +info: + title: My Data Contract + version: 0.0.1 +servers: + local: + type: dataframe +models: + users: + description: description + fields: + id: + type: string + required: false + name: + type: string + required: false + address: + type: struct + required: false + fields: + number: + type: integer + required: false + street: + type: string + required: false + city: + type: string + required: false + tags: + type: array + required: false + items: + type: string + required: false + metadata: + type: map + required: false + keys: + type: string + required: true + values: + type: struct + required: false + fields: + value: + type: string + required: false + type: + type: string + required: false + timestamp: + type: long + required: false + source: + type: string + required: false \ No newline at end of file diff --git a/tests/fixtures/spark/import/users_datacontract_no_desc.yml b/tests/fixtures/spark/import/users_datacontract_no_desc.yml new file mode 
100644 index 000000000..256e72fe5 --- /dev/null +++ b/tests/fixtures/spark/import/users_datacontract_no_desc.yml @@ -0,0 +1,58 @@ +dataContractSpecification: 1.1.0 +id: my-data-contract-id +info: + title: My Data Contract + version: 0.0.1 +servers: + local: + type: dataframe +models: + users: + fields: + id: + type: string + required: false + name: + type: string + required: false + address: + type: struct + required: false + fields: + number: + type: integer + required: false + street: + type: string + required: false + city: + type: string + required: false + tags: + type: array + required: false + items: + type: string + required: false + metadata: + type: map + required: false + keys: + type: string + required: true + values: + type: struct + required: false + fields: + value: + type: string + required: false + type: + type: string + required: false + timestamp: + type: long + required: false + source: + type: string + required: false \ No newline at end of file diff --git a/tests/test_api.py b/tests/test_api.py index 68098f86e..7db685a0c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -16,7 +16,7 @@ def test_lint(): assert response.status_code == 200 print(response.json()) assert response.json()["result"] == "passed" - assert len(response.json()["checks"]) == 7 + assert len(response.json()["checks"]) == 6 assert all([check["result"] == "passed" for check in response.json()["checks"]]) diff --git a/tests/test_duckdb_json.py b/tests/test_duckdb_json.py new file mode 100644 index 000000000..e7e34f70f --- /dev/null +++ b/tests/test_duckdb_json.py @@ -0,0 +1,77 @@ +from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection +from datacontract.lint import resolve +from datacontract.model.run import Run + + +def test_nested_json(): + data_contract_str = """ +dataContractSpecification: 1.1.0 +id: "61111-0002" +info: + title: Sample data of nested types + version: 1.0.0 +servers: + sample: + type: local + path: ./fixtures/local-json/data/nested_types.json + format: json + delimiter: array +models: + sample_data: + type: object + fields: + id: + type: integer + required: true + tags: + type: array + required: true + items: + type: object + fields: + foo: + type: string + required: true + arr: + type: array + items: + type: integer + name: + type: object + required: false + fields: + first: + type: string + last: + type: string + """ + data_contract = resolve.resolve_data_contract(data_contract_str=data_contract_str) + run = Run.create_run() + con = get_duckdb_connection(data_contract, data_contract.servers["sample"], run) + tbl = con.table("sample_data") + assert tbl.columns == ["id", "tags", "name"] + assert [x[1].lower() for x in tbl.description] == ["number", "list", "dict"] + # test that duckdb correctly unpacked the nested structures.
+ assert tbl.fetchone() == ( + 1, + [{"foo": "bar", "arr": [1, 2, 3]}, {"foo": "lap", "arr": [4]}], + {"first": "John", "last": "Doe"}, + ) + assert tbl.fetchone() == (2, [{"foo": "zap", "arr": []}], None) + assert tbl.fetchone() is None + ## check nested tables + tbl = con.table("sample_data__tags") + assert tbl.columns == ["foo", "arr"] + assert [x[1].lower() for x in tbl.description] == ["string", "list"] + assert tbl.fetchone() == ("bar", [1, 2, 3]) + assert tbl.fetchone() == ("lap", [4]) + assert tbl.fetchone() == ("zap", []) + assert tbl.fetchone() is None + tbl = con.table("sample_data__tags__arr") + assert tbl.columns == ["arr"] + assert [x[1].lower() for x in tbl.description] == ["number"] + assert tbl.fetchall() == [(1,), (2,), (3,), (4,)] + tbl = con.table("sample_data__name") + assert tbl.columns == ["first", "last"] + assert [x[1].lower() for x in tbl.description] == ["string", "string"] + assert tbl.fetchall() == [("John", "Doe")] diff --git a/tests/test_export_dbt_models.py b/tests/test_export_dbt_models.py index 237fad655..f6c216409 100644 --- a/tests/test_export_dbt_models.py +++ b/tests/test_export_dbt_models.py @@ -131,6 +131,57 @@ def test_to_dbt_models_with_server(): assert result == yaml.safe_load(expected_dbt_model) +def test_to_dbt_models_with_no_model_type(): + data_contract = DataContractSpecification.from_file("fixtures/export/datacontract_no_model_type.yaml") + expected_dbt_model = """ +version: 2 +models: +- name: orders + config: + meta: + data_contract: orders-unit-test + owner: checkout + description: The orders model + columns: + - name: order_id + data_tests: + - not_null + - unique + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 8 + max_value: 10 + - dbt_expectations.expect_column_values_to_match_regex: + regex: ^B[0-9]+$ + data_type: VARCHAR + meta: + pii: true + classification: sensitive + tags: + - order_id + - name: order_total + data_tests: + - not_null + - dbt_expectations.expect_column_values_to_be_between: + min_value: 0 + max_value: 1000000 + data_type: NUMBER + description: The order_total field + - name: order_status + data_tests: + - not_null + - accepted_values: + values: + - pending + - shipped + - delivered + data_type: TEXT +""" + + result = yaml.safe_load(to_dbt_models_yaml(data_contract)) + + assert result == yaml.safe_load(expected_dbt_model) + + def read_file(file): if not os.path.exists(file): print(f"The file '{file}' does not exist.") diff --git a/tests/test_export_great_expectations.py b/tests/test_export_great_expectations.py index 223f4e6ef..d66573832 100644 --- a/tests/test_export_great_expectations.py +++ b/tests/test_export_great_expectations.py @@ -88,6 +88,22 @@ def data_contract_great_expectations_quality_file() -> DataContractSpecification ) +@pytest.fixture +def data_contract_great_expectations_quality_yaml() -> DataContractSpecification: + return resolve.resolve_data_contract_from_location( + "./fixtures/great-expectations/datacontract_quality_yaml.yaml", + inline_quality=True, + ) + + +@pytest.fixture +def data_contract_great_expectations_quality_column() -> DataContractSpecification: + return resolve.resolve_data_contract_from_location( + "./fixtures/great-expectations/datacontract_quality_column.yaml", + inline_quality=True, + ) + + @pytest.fixture def expected_json_suite() -> Dict[str, Any]: return { @@ -119,6 +135,66 @@ def expected_json_suite() -> Dict[str, Any]: } +@pytest.fixture +def expected_json_suite_table_quality() -> Dict[str, Any]: + return { + "data_asset_type": "null", + 
"expectation_suite_name": "orders.1.0.0", + "expectations": [ + {"expectation_type": "expect_table_row_count_to_be_between", "kwargs": {"min_value": 10}, "meta": {}}, + { + "expectation_type": "expect_table_columns_to_match_ordered_list", + "kwargs": {"column_list": ["order_id"]}, + "meta": {}, + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": {"column": "order_id", "type_": "string"}, + "meta": {}, + }, + ], + "meta": {}, + } + + +@pytest.fixture +def expected_json_suite_with_enum() -> Dict[str, Any]: + return { + "data_asset_type": "null", + "expectation_suite_name": "orders.1.1.1", + "expectations": [ + { + "expectation_type": "expect_table_columns_to_match_ordered_list", + "kwargs": {"column_list": ["id", "type"]}, + "meta": {}, + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": {"column": "id", "type_": "string"}, + "meta": {}, + }, + {"expectation_type": "expect_column_values_to_be_unique", "kwargs": {"column": "id"}, "meta": {}}, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": {"column": "type", "type_": "string"}, + "meta": {}, + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": {"column": "type", "value_set": ["A", "B", "C", "D", "E"]}, + "meta": {}, + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": {"value": 1}, + "meta": {"notes": "Ensures that column length is 1."}, + "column": "type", + }, + ], + "meta": {}, + } + + @pytest.fixture def expected_spark_engine() -> Dict[str, Any]: return { @@ -290,6 +366,11 @@ def test_to_great_expectation(data_contract_basic: DataContractSpecification): "kwargs": {"column": "order_status", "type_": "text"}, "meta": {}, }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": {"column": "order_status", "value_set": ["pending", "shipped", "delivered"]}, + "meta": {}, + }, ], "meta": {}, } @@ -602,3 +683,25 @@ def test_to_great_expectation_missing_quality_json_file(): assert False except DataContractException as dataContractException: assert dataContractException.reason == "Cannot resolve reference ./fixtures/great-expectations/missing.json" + + +def test_to_great_expectation_quality_yaml( + data_contract_great_expectations_quality_yaml: DataContractSpecification, + expected_json_suite_table_quality: Dict[str, Any], +): + """ + Test with Quality definition in a model quality list + """ + result = to_great_expectations(data_contract_great_expectations_quality_yaml, "orders") + assert result == json.dumps(expected_json_suite_table_quality, indent=2) + + +def test_to_great_expectation_quality_column( + data_contract_great_expectations_quality_column: DataContractSpecification, + expected_json_suite_with_enum: Dict[str, Any], +): + """ + Test with quality definition in a field quality list + """ + result = to_great_expectations(data_contract_great_expectations_quality_column, "orders") + assert result == json.dumps(expected_json_suite_with_enum, indent=2) diff --git a/tests/test_export_mermaid.py b/tests/test_export_mermaid.py new file mode 100644 index 000000000..9b1e31bd4 --- /dev/null +++ b/tests/test_export_mermaid.py @@ -0,0 +1,56 @@ +import os +from pathlib import Path + +from typer.testing import CliRunner + +from datacontract.cli import app + + +def test_cli(): + runner = CliRunner() + result = runner.invoke(app, ["export", "./fixtures/export/datacontract.yaml", "--format", "mermaid"]) + assert result.exit_code == 0 + + +def test_cli_with_output(tmp_path: Path): + runner = 
CliRunner() + result = runner.invoke( + app, + [ + "export", + "./fixtures/export/datacontract.yaml", + "--format", + "mermaid", + "--output", + tmp_path / "datacontract.mermaid", + ], + ) + assert result.exit_code == 0 + assert os.path.exists(tmp_path / "datacontract.mermaid") + + +def test_mermaid_structure(tmp_path: Path): + datacontract_file = "fixtures/export/datacontract.yaml" + runner = CliRunner() + result = runner.invoke( + app, + [ + "export", + datacontract_file, + "--format", + "mermaid", + "--output", + tmp_path / "datacontract.mermaid", + ], + ) + assert result.exit_code == 0 + + with open(tmp_path / "datacontract.mermaid") as file: + content = file.read() + + # Check structure + assert "erDiagram" in content + assert "orders" in content + assert "order_id" in content + assert "order_total" in content + assert "order_status" in content diff --git a/tests/test_export_odcs_v3.py b/tests/test_export_odcs_v3.py index ee922aee1..6c9d86cd1 100644 --- a/tests/test_export_odcs_v3.py +++ b/tests/test_export_odcs_v3.py @@ -27,7 +27,6 @@ def test_to_odcs(): version: 1.0.0 status: active description: - purpose: null limitations: Not intended to use in production usage: This data contract serves to demo datacontract CLI export. @@ -41,12 +40,12 @@ def test_to_odcs(): - name: order_id businessName: Order ID logicalType: string + physicalType: varchar logicalTypeOptions: minLength: 8 maxLength: 10 pattern: ^B[0-9]+$ - physicalType: varchar - nullable: false + required: true unique: true tags: - "order_id" @@ -61,11 +60,11 @@ def test_to_odcs(): value: true - name: order_total logicalType: integer + physicalType: bigint logicalTypeOptions: minimum: 0 maximum: 1000000 - physicalType: bigint - nullable: false + required: true description: The order_total field quality: - type: sql @@ -77,7 +76,7 @@ def test_to_odcs(): - name: order_status logicalType: string physicalType: text - nullable: false + required: true quality: - type: sql description: Row Count @@ -96,7 +95,7 @@ def test_to_odcs(): database: my-database schema: my-schema roles: - - name: analyst_us + - role: analyst_us description: Access to the data for US region support: @@ -114,7 +113,16 @@ def test_to_odcs(): odcs = to_odcs_v3_yaml(data_contract) - assert yaml.safe_load(odcs) == yaml.safe_load(expected_odcs_model) + assert_equals_odcs_yaml_str(expected_odcs_model, odcs) + + +def assert_equals_odcs_yaml_str(expected, actual): + from open_data_contract_standard.model import OpenDataContractStandard + + expected_yaml = OpenDataContractStandard.from_string(expected).to_yaml() + print(expected_yaml) + assert expected_yaml == actual + assert yaml.safe_load(actual) == yaml.safe_load(expected) def read_file(file): diff --git a/tests/test_export_sql.py b/tests/test_export_sql.py index 885aebb04..0234f3d8e 100644 --- a/tests/test_export_sql.py +++ b/tests/test_export_sql.py @@ -32,18 +32,18 @@ def test_to_sql_ddl_snowflake(): -- Data Contract: urn:datacontract:checkout:snowflake_orders_pii_v2 -- SQL Dialect: snowflake CREATE TABLE orders ( - ORDER_ID TEXT not null, - ORDER_TIMESTAMP TIMESTAMP_TZ not null, - ORDER_TOTAL NUMBER not null, - CUSTOMER_ID TEXT, - CUSTOMER_EMAIL_ADDRESS TEXT not null, - PROCESSING_TIMESTAMP TIMESTAMP_LTZ not null -); + ORDER_ID TEXT not null COMMENT 'An internal ID that identifies an order in the online shop.', + ORDER_TIMESTAMP TIMESTAMP_TZ not null COMMENT 'The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful.', + ORDER_TOTAL NUMBER not 
null COMMENT 'Total amount the smallest monetary unit (e.g., cents).', + CUSTOMER_ID TEXT COMMENT 'Unique identifier for the customer.', + CUSTOMER_EMAIL_ADDRESS TEXT not null COMMENT 'The email address, as entered by the customer. The email address was not verified.', + PROCESSING_TIMESTAMP TIMESTAMP_LTZ not null COMMENT 'The processing timestamp in the current session’s time zone.' +) COMMENT='One record per order. Includes cancelled and deleted orders.'; CREATE TABLE line_items ( - LINE_ITEM_ID TEXT not null, - ORDER_ID TEXT, - SKU TEXT -); + LINE_ITEM_ID TEXT not null COMMENT 'Primary key of the lines_item_id table', + ORDER_ID TEXT COMMENT 'An internal ID that identifies an order in the online shop.', + SKU TEXT COMMENT 'The purchased article number' +) COMMENT='A single article that is part of an order.'; """.strip() assert actual == expected diff --git a/tests/test_import_csv.py b/tests/test_import_csv.py index 2e8b75388..9ce8eda33 100644 --- a/tests/test_import_csv.py +++ b/tests/test_import_csv.py @@ -24,10 +24,20 @@ def test_cli(): assert result.exit_code == 0 -def test_import_sql(): - result = DataContract().import_from_source("csv", csv_file_path) +def test_import_csv(): + source = "fixtures/csv/data/sample_data_5_column.csv" + result = DataContract().import_from_source("csv", source) + model = result.models["sample_data_5_column"] + assert model is not None + assert len(model.fields["field_one"].examples) == 5 + assert len(model.fields["field_two"].examples) > 0 + assert len(model.fields["field_three"].examples) > 0 + assert model.fields["field_four"].examples is None + assert model.fields["field_five"].examples is None + for k in model.fields.keys(): + model.fields[k].examples = None - expected = """dataContractSpecification: 1.1.0 + expected = f"""dataContractSpecification: 1.1.0 id: my-data-contract-id info: title: My Data Contract @@ -36,19 +46,33 @@ def test_import_sql(): production: type: local format: csv - path: fixtures/csv/data/sample_data.csv + path: {source} delimiter: ',' models: - sample_data: - description: Csv file with encoding ascii + sample_data_5_column: + description: Generated model of fixtures/csv/data/sample_data_5_column.csv type: table fields: field_one: type: string + required: true + unique: true field_two: type: integer + required: true + minimum: 14 + maximum: 89 field_three: + type: timestamp + unique: true + field_four: type: string + field_five: + type: boolean + required: true + field_six: + type: string + format: email """ print("Result", result.to_yaml()) assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected) diff --git a/tests/test_import_excel.py b/tests/test_import_excel.py new file mode 100644 index 000000000..33c84e4a3 --- /dev/null +++ b/tests/test_import_excel.py @@ -0,0 +1,40 @@ +import os +import sys + +import yaml +from typer.testing import CliRunner + +from datacontract.cli import app +from datacontract.imports.excel_importer import import_excel_as_odcs + +# logging.basicConfig(level=logging.DEBUG, force=True) + + +def test_cli(): + runner = CliRunner() + result = runner.invoke( + app, + [ + "import", + "--format", + "excel", + "--source", + "./fixtures/excel/shipments-odcs.xlsx", + ], + ) + assert result.exit_code == 0 + + +def test_import_excel_odcs(): + result = import_excel_as_odcs("./fixtures/excel/shipments-odcs.xlsx") + expected_datacontract = read_file("fixtures/excel/shipments-odcs.yaml") + assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected_datacontract) + + +def read_file(file): + if not 
os.path.exists(file): + print(f"The file '{file}' does not exist.") + sys.exit(1) + with open(file, "r") as file: + file_content = file.read() + return file_content diff --git a/tests/test_import_spark.py b/tests/test_import_spark.py index c09cff62c..fe7d073f1 100644 --- a/tests/test_import_spark.py +++ b/tests/test_import_spark.py @@ -6,67 +6,6 @@ from datacontract.cli import app from datacontract.data_contract import DataContract -expected = """ -dataContractSpecification: 1.1.0 -id: my-data-contract-id -info: - title: My Data Contract - version: 0.0.1 -servers: - local: - type: dataframe -models: - users: - fields: - id: - type: string - required: false - name: - type: string - required: false - address: - type: struct - required: false - fields: - number: - type: integer - required: false - street: - type: string - required: false - city: - type: string - required: false - tags: - type: array - required: false - items: - type: string - required: false - metadata: - type: map - required: false - keys: - type: string - required: true - values: - type: struct - required: false - fields: - value: - type: string - required: false - type: - type: string - required: false - timestamp: - type: long - required: false - source: - type: string - required: false - """ - @pytest.fixture(scope="session") def spark(tmp_path_factory) -> SparkSession: @@ -89,62 +28,87 @@ def spark(tmp_path_factory) -> SparkSession: return spark -def test_cli(spark: SparkSession): - df_user = spark.createDataFrame( - data=[ - { - "id": "1", - "name": "John Doe", - "address": { - "number": 123, - "street": "Maple Street", - "city": "Anytown", - }, - "tags": ["tag1", "tag2"], - "metadata": { - "my-source-metadata": { - "value": "1234567890", - "type": "STRING", - "timestamp": 1646053400, - "source": "my-source", - } - }, +@pytest.fixture() +def user_datacontract_desc(): + with open("fixtures/spark/import/users_datacontract_desc.yml", "r") as f: + data_contract_str = f.read() + return data_contract_str + + +@pytest.fixture() +def user_datacontract_no_desc(): + with open("fixtures/spark/import/users_datacontract_no_desc.yml", "r") as f: + data_contract_str = f.read() + return data_contract_str + + +@pytest.fixture() +def user_row(): + return { + "id": "1", + "name": "John Doe", + "address": { + "number": 123, + "street": "Maple Street", + "city": "Anytown", + }, + "tags": ["tag1", "tag2"], + "metadata": { + "my-source-metadata": { + "value": "1234567890", + "type": "STRING", + "timestamp": 1646053400, + "source": "my-source", } - ], - schema=types.StructType( - [ - types.StructField("id", types.StringType()), - types.StructField("name", types.StringType()), - types.StructField( - "address", - types.StructType( + }, + } + + +@pytest.fixture() +def user_schema(): + return types.StructType( + [ + types.StructField("id", types.StringType()), + types.StructField("name", types.StringType()), + types.StructField( + "address", + types.StructType( + [ + types.StructField("number", types.IntegerType()), + types.StructField("street", types.StringType()), + types.StructField("city", types.StringType()), + ] + ), + ), + types.StructField("tags", types.ArrayType(types.StringType())), + types.StructField( + "metadata", + types.MapType( + keyType=types.StringType(), + valueType=types.StructType( [ - types.StructField("number", types.IntegerType()), - types.StructField("street", types.StringType()), - types.StructField("city", types.StringType()), + types.StructField("value", types.StringType()), + types.StructField("type", 
types.StringType()), + types.StructField("timestamp", types.LongType()), + types.StructField("source", types.StringType()), ] ), ), - types.StructField("tags", types.ArrayType(types.StringType())), - types.StructField( - "metadata", - types.MapType( - keyType=types.StringType(), - valueType=types.StructType( - [ - types.StructField("value", types.StringType()), - types.StructField("type", types.StringType()), - types.StructField("timestamp", types.LongType()), - types.StructField("source", types.StringType()), - ] - ), - ), - ), - ] - ), + ), + ] ) - df_user.createOrReplaceTempView("users") + +@pytest.fixture() +def df_user(spark: SparkSession, user_row, user_schema): + return spark.createDataFrame(data=[user_row], schema=user_schema) + + +def test_cli(spark: SparkSession, df_user, user_datacontract_no_desc): + df_user.write.mode("overwrite").saveAsTable("users") + + expected_no_desc = user_datacontract_no_desc + runner = CliRunner() result = runner.invoke( app, @@ -159,7 +123,7 @@ def test_cli(spark: SparkSession): output = result.stdout assert result.exit_code == 0 - assert output.strip() == expected.strip() + assert output.strip() == expected_no_desc.strip() def test_table_not_exists(): @@ -178,62 +142,24 @@ def test_table_not_exists(): assert result.exit_code == 1 -def test_prog(spark: SparkSession): - df_user = spark.createDataFrame( - data=[ - { - "id": "1", - "name": "John Doe", - "address": { - "number": 123, - "street": "Maple Street", - "city": "Anytown", - }, - "tags": ["tag1", "tag2"], - "metadata": { - "my-source-metadata": { - "value": "1234567890", - "type": "STRING", - "timestamp": 1646053400, - "source": "my-source", - } - }, - } - ], - schema=types.StructType( - [ - types.StructField("id", types.StringType()), - types.StructField("name", types.StringType()), - types.StructField( - "address", - types.StructType( - [ - types.StructField("number", types.IntegerType()), - types.StructField("street", types.StringType()), - types.StructField("city", types.StringType()), - ] - ), - ), - types.StructField("tags", types.ArrayType(types.StringType())), - types.StructField( - "metadata", - types.MapType( - keyType=types.StringType(), - valueType=types.StructType( - [ - types.StructField("value", types.StringType()), - types.StructField("type", types.StringType()), - types.StructField("timestamp", types.LongType()), - types.StructField("source", types.StringType()), - ] - ), - ), - ), - ] - ), - ) +def test_prog(spark: SparkSession, df_user, user_datacontract_no_desc, user_datacontract_desc): + df_user.write.mode("overwrite").saveAsTable("users") + + expected_desc = user_datacontract_desc + expected_no_desc = user_datacontract_no_desc + + # does not include a table level description (table method) + result1 = DataContract().import_from_source("spark", "users") + assert yaml.safe_load(result1.to_yaml()) == yaml.safe_load(expected_no_desc) + + # does include a table level description (table method) + result2 = DataContract().import_from_source("spark", "users", description="description") + assert yaml.safe_load(result2.to_yaml()) == yaml.safe_load(expected_desc) - df_user.createOrReplaceTempView("users") - result = DataContract().import_from_source("spark", "users") + # does not include a table level description (dataframe object method) + result3 = DataContract().import_from_source("spark", "users", dataframe=df_user) + assert yaml.safe_load(result3.to_yaml()) == yaml.safe_load(expected_no_desc) - assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected) + # does include a 
table level description (dataframe object method) + result4 = DataContract().import_from_source("spark", "users", dataframe=df_user, description="description") + assert yaml.safe_load(result4.to_yaml()) == yaml.safe_load(expected_desc) diff --git a/tests/test_import_unity_file.py b/tests/test_import_unity_file.py index 050a88fdb..3f9a14a36 100644 --- a/tests/test_import_unity_file.py +++ b/tests/test_import_unity_file.py @@ -1,3 +1,4 @@ +import pytest import yaml from typer.testing import CliRunner @@ -30,8 +31,9 @@ def test_import_unity(): with open("fixtures/databricks-unity/import/datacontract.yaml") as file: expected = file.read() - print("Result:\n", result.to_yaml()) - assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected) + result_yaml = result.to_yaml() + print("Result:\n", result_yaml) + assert yaml.safe_load(result_yaml) == yaml.safe_load(expected) assert DataContract(data_contract_str=expected).lint(enabled_linters="none").has_passed() @@ -51,6 +53,7 @@ def test_cli_complex_types(): assert result.exit_code == 0 +@pytest.mark.skip(reason="Complex types are not perfectly supported for the unity catalog import") def test_import_unity_complex_types(): print("running test_import_unity_complex_types") result = DataContract().import_from_source( @@ -63,3 +66,55 @@ def test_import_unity_complex_types(): print("Result:\n", result.to_yaml()) assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected) assert DataContract(data_contract_str=expected).lint(enabled_linters="none").has_passed() + + +def test_import_unity_with_owner_and_id(): + print("running test_import_unity_with_owner_and_id") + result = DataContract().import_from_source( + "unity", "fixtures/databricks-unity/import/unity_table_schema.json", owner="sales-team", id="orders-v1" + ) + + # Verify owner and id are set correctly + assert result.id == "orders-v1" + assert result.info.owner == "sales-team" + + # Verify the rest of the contract is imported correctly + with open("fixtures/databricks-unity/import/datacontract.yaml") as file: + expected = file.read() + expected_dict = yaml.safe_load(expected) + result_dict = yaml.safe_load(result.to_yaml()) + + # Remove owner and id from comparison since we set them differently + expected_dict.pop("id", None) + expected_dict["info"].pop("owner", None) + result_dict.pop("id", None) + result_dict["info"].pop("owner", None) + + assert result_dict == expected_dict + + +def test_cli_with_owner_and_id(): + print("running test_cli_with_owner_and_id") + runner = CliRunner() + result = runner.invoke( + app, + [ + "import", + "--format", + "unity", + "--source", + "fixtures/databricks-unity/import/unity_table_schema.json", + "--owner", + "sales-team", + "--id", + "orders-v1", + ], + ) + assert result.exit_code == 0 + + # Parse the output YAML + output_dict = yaml.safe_load(result.stdout) + + # Verify owner and id are set correctly + assert output_dict["id"] == "orders-v1" + assert output_dict["info"]["owner"] == "sales-team" diff --git a/tests/test_lint.py b/tests/test_lint.py index 5bb6552f2..5cc1fb514 100644 --- a/tests/test_lint.py +++ b/tests/test_lint.py @@ -27,7 +27,7 @@ def test_lint_invalid_data_contract(): def test_lint_cli_valid(): data_contract_file = "fixtures/lint/valid_datacontract.yaml" - expected_output = "🟢 data contract is valid. Run 7 checks." + expected_output = "🟢 data contract is valid. Run 6 checks." 
result = runner.invoke(app, ["lint", data_contract_file]) diff --git a/tests/test_quality_schema_linter.py b/tests/test_quality_schema_linter.py deleted file mode 100644 index 278f87fb8..000000000 --- a/tests/test_quality_schema_linter.py +++ /dev/null @@ -1,35 +0,0 @@ -import datacontract.lint.resolve as resolve -from datacontract.lint.linters.quality_schema_linter import QualityUsesSchemaLinter -from datacontract.model.run import Check - - -def construct_error_check(msg: str) -> Check: - return Check( - type="lint", - name="Linter 'Quality check(s) use model'", - result="warning", - engine="datacontract", - reason=msg, - ) - - -success_check = Check(type="lint", name="Linter 'Quality check(s) use model'", result="passed", engine="datacontract") - -data_contract_file = "fixtures/lint/datacontract_quality_schema.yaml" - - -def test_lint_correct_sodacl(): - base_contract_sodacl = resolve.resolve_data_contract_from_location(data_contract_file) - result = QualityUsesSchemaLinter().lint(base_contract_sodacl) - assert result == [success_check] - - -def test_lint_incorrect_sodacl(): - base_contract_sodacl = resolve.resolve_data_contract_from_location(data_contract_file) - incorrect_contract = base_contract_sodacl.model_copy(deep=True) - incorrect_contract.quality.specification = """ - checks for tests: - - freshness(column_1) < 1d - """ - result = QualityUsesSchemaLinter().lint(incorrect_contract) - assert result == [construct_error_check("Quality check on unknown model 'tests'")] diff --git a/update_help.py b/update_help.py index e9515ab71..546524a58 100644 --- a/update_help.py +++ b/update_help.py @@ -24,7 +24,7 @@ def fetch_help(command: str) -> str: print(f"Fetching help text for command: {command}") - env = {"COLUMNS": "80"} # Set terminal width to 80 columns (or your preferred width) + env = {"COLUMNS": "100"} # Set terminal width to 100 columns (or your preferred width) result = subprocess.run( ["datacontract", command, "--help"], capture_output=True,