diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d0994df22..260547716 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -64,16 +64,19 @@ jobs: run: datacontract --version - name: Test datacontract init run: datacontract init new-datacontract.yaml + - name: Test datacontract export + run: datacontract export --format sql tests/fixtures/export/datacontract.odcs.yaml + - name: Install dependencies with duckdb + run: | + pip install -e '.[duckdb]' - name: Test datacontract test run: cd tests && datacontract test fixtures/local-delta/datacontract.yaml - - name: Test datacontract export - run: datacontract export --format odcs tests/fixtures/export/datacontract.yaml docker: runs-on: ubuntu-latest needs: - test - if: github.event_name != 'pull_request' + if: github.event_name != 'pull_request' && github.repository == 'datacontract/datacontract-cli' steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 568c27ce5..ce25bea1a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -100,6 +100,14 @@ jobs: id-token: write # IMPORTANT: mandatory for sigstore steps: + - uses: actions/checkout@v4 + - name: Extract changelog for version + run: | + VERSION="${{ github.ref_name }}" + VERSION="${VERSION#v}" # Strip 'v' prefix + # Extract section between this version header and next version header + sed -n "/^## \[$VERSION\]/,/^## \[/p" CHANGELOG.md | head -n -1 > release_notes.md + cat release_notes.md - name: Download all the dists uses: actions/download-artifact@v4 with: @@ -118,7 +126,7 @@ jobs: gh release create '${{ github.ref_name }}' --repo '${{ github.repository }}' - --notes "" + --notes-file release_notes.md - name: Upload artifact signatures to GitHub Release env: GITHUB_TOKEN: ${{ github.token }} @@ -192,3 +200,37 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} sbom: true + + push-to-ecr: + runs-on: ubuntu-latest + needs: + - docker + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Set up AWS CLI + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Login to AWS ECR + run: | + aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/s4e5k7s9 + + - name: Copy Multi-Arch Image from Docker Hub to AWS ECR + run: | + VERSION="${{ github.ref_name }}" + VERSION="${VERSION#v}" # Strip 'v' prefix to match Docker Hub tag + docker buildx imagetools create \ + --tag public.ecr.aws/s4e5k7s9/datacontract-cli:latest \ + --tag public.ecr.aws/s4e5k7s9/datacontract-cli:${VERSION} \ + docker.io/datacontract/cli:${VERSION} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a2251d7d7..e53648d7c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.4.7 + rev: v0.14.3 hooks: # Run the linter. 
- id: ruff diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml deleted file mode 100644 index e5158f1d8..000000000 --- a/.pre-commit-hooks.yaml +++ /dev/null @@ -1,17 +0,0 @@ -- id: datacontract-lint - name: Data Contract Linter - description: This hook lint the data contract. - entry: datacontract lint - files: "datacontract*.yaml" - language: python - additional_dependencies: ['.[all]'] - types: [yaml] - -- id: datacontract-test - name: Data Contract Tester - description: This hook test the data contract. - entry: datacontract test - files: "datacontract*.yaml" - language: python - additional_dependencies: ['.[all]'] - types: [yaml] diff --git a/API.md b/API.md index dc6d4c491..f44074632 100644 --- a/API.md +++ b/API.md @@ -39,7 +39,7 @@ curl -X 'POST' \ 'http://localhost:4242/test?server=production' \ -H 'accept: application/json' \ -H 'Content-Type: application/yaml' \ - -d 'dataContractSpecification: 1.2.0 + -d 'dataContractSpecification: 1.2.1 id: urn:datacontract:checkout:orders-latest info: title: Orders Latest diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a422b2ec..048d38895 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,185 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- Fix BigQuery import for repeated fields +- Make Markdown export compatible with XHTML by replacing `
<br>` with `<br/>
` (#1030) + +## [0.11.4] - 2026-01-19 + +### Changed + +- Made `duckdb` an optional dependency. Install with `pip install datacontract-cli[duckdb]` for local/S3/GCS/Azure file testing. +- Removed unused `fastparquet` and `numpy` core dependencies. + +### Added + +- Include searchable tags in catalog index.html + +### Fixed + +- Fixed example(s) field mapping for Data Contract Specification importer (#992). +- Spark exporter now supports decimal precision/scale via `customProperties` or parsing from `physicalType` (e.g., `decimal(10,2)`) (#996) +- Fix catalog/HTML export failing on ODCS contracts with no schema or no properties (#971) + +## [0.11.3] - 2026-01-10 + +### Fixed + +- Fix `datacontract init` to generate ODCS format instead of deprecated Data Contract Specification (#984) +- Fix ODCS lint failing on optional relationship `type` field by updating open-data-contract-standard to v3.1.2 (#971) +- Restrict DuckDB dependency to < 1.4.0 (#972) +- Fixed schema evolution support for optional fields in CSV and Parquet formats. Optional fields marked with `required: false` are no longer incorrectly treated as required during validation, enabling proper schema evolution where optional fields can be added to contracts without breaking validation of historical data files (#977) +- Fixed decimals in pydantic model export. Fields marked with `type: decimal` will be mapped to `decimal.Decimal` instead of `float`. +- Fix BigQuery test failure for fields with FLOAT or BOOLEAN types by mapping them to equivalent types (BOOL and FLOAT64) + +## [0.11.2] - 2025-12-15 + +### Added +- Add Impala engine support for Soda scans via ODCS `impala` server type. +### Fixed +- Restrict DuckDB dependency to < 1.4.0 (#972) + +## [0.11.1] - 2025-12-14 + +This is a major release with breaking changes: +We switched the internal data model from [Data Contract Specification](https://datacontract-specification.com) to [Open Data Contract Standard](https://datacontract.com/#odcs) (ODCS). + +Not all previously available features are supported in this version, as some are not supported by the Open Data Contract Standard, such as: + +- Internal definitions using `$ref` (you can refer to external definitions via `authoritativeDefinition`) +- Lineage (no real workaround, use customProperties or transformation object if needed) +- Support for different physical types (no real workaround, use customProperties if needed) +- Support for enums (use quality metric `invalidValues`) +- Support for properties with type map and defining `keys` and `values` (use logical type map) +- Support for `scale` and `precision` (define them in `physicalType`) + +The reason for this change is that the Data Contract Specification is deprecated, and we now focus on the best possible support for the Open Data Contract Standard. +We try to make this transition as seamless as possible. +If you face issues, please open an issue on GitHub. + +We continue to support reading [Data Contract Specification](https://datacontract-specification.com) data contracts during v0.11.x releases until the end of 2026. +To migrate existing data contracts to the Open Data Contract Standard, follow these instructions: https://datacontract-specification.com/#migration + +### Changed + +- ODCS v3.1.0 is now the default format for all imports. +- Renamed `--model` option to `--schema-name` in the `export` command to align with ODCS terminology. +- Renamed exporter files from `*_converter.py` to `*_exporter.py` for consistency (internal change).
+ +### Added + +- If an ODCS slaProperty "freshness" is defined with a reference to the element (column), the CLI will now test freshness of the data. +- If an ODCS slaProperty "retention" is defined with a reference to the element (column), the CLI will now test retention of the data. +- Support for custom Soda quality checks in ODCS using `type: custom` and `engine: soda` with raw SodaCL implementation. + +### Fixed + +- Oracle: Fix `service_name` attribute access to use ODCS field name `serviceName` + +### Removed + +- The `breaking`, `changelog`, and `diff` commands are now deleted (#925). +- The `terraform` export format has been removed. + + +## [0.10.41] - 2025-12-02 + +### Changed + +- Great Expectations export: Update to Great Expectations 1.x format (#919) + - Changed `expectation_suite_name` to `name` in suite output + - Changed `expectation_type` to `type` in expectations + - Removed `data_asset_type` field from suite output + - **Breaking**: Users with custom quality definitions using `expectation_type` must update to use `type` + +### Added + +- test: Log server name and type in output (#963) +- api: CORS is now enabled for all origins +- quality: Support `{schema}` and `${schema}` placeholder in SQL quality checks to reference the server's database schema (#957) +- SQL Server: Support `DATACONTRACT_SQLSERVER_DRIVER` environment variable to specify the ODBC driver (#959) +- Excel: Add Oracle server type support for Excel export/import (#960) +- Excel: Add local/CSV server type support for Excel export/import (#961) +- Excel Export: Complete server types (glue, kafka, postgres, s3, snowflake, sqlserver, custom) + +### Fixed + +- Protobuf import: Fix transitive imports across subdirectories (#943) +- Protobuf export now works without error (#951) +- lint: YAML date values (e.g., `2022-01-15`) are now kept as strings instead of being converted to datetime objects, fixing ODCS schema validation +- export: field annotation now matches to number/numeric/decimal types +- Excel: Server port is now correctly parsed as integer instead of string for all server types +- Excel: Remove invalid `table` and `view` fields from custom server import +- Fixed DuckDB DDL generation to use `JSON` type instead of invalid empty `STRUCT()` for objects without defined properties ([#940](https://github.com/datacontract/datacontract-cli/issues/940)) + +### Deprecated + +- The `breaking`, `changelog`, and `diff` commands are now deprecated and will be removed in a future version (#925) + +## [0.10.40] - 2025-11-25 + +### Added + +- Support for ODCS v3.1.0 + +## [0.10.39] - 2025-11-20 + +### Added + +- Oracle DB: Client Directory for Connection Mode 'Thick' can now be specified in the `DATACONTRACT_ORACLE_CLIENT_DIR` environment variable (#949) + +### Fixed + +- Import composite primary keys from open data contract spec + +## [0.10.38] - 2025-11-11 + +### Added + +- Support for Oracle Database (>= 19C) + +### Fixed + +- Athena: Now correctly uses the (optional) AWS session token specified in the `DATACONTRACT_S3_SESSION_TOKEN' environment variable when testing contracts (#934) + +## [0.10.37] - 2025-11-03 + +### Added + +- import: Support for nested arrays in odcs v3 importer +- lint: ODCS schema is now checked before converting +- --debug flag for all commands + +### Fixed + +- export: Excel exporter now exports critical data element + + +## [0.10.36] - 2025-10-17 + +### Added + +- Support for Data Contract Specification v1.2.1 (Data Quality Metrics) +- Support for decimal testing in spark and databricks 
(#902) +- Support for BigQuery Flexible Schema in Data Contract Checks (#909) + +### Changed + +- `DataContract().import_from_source()` as an instance method is now deprecated. Use `DataContract.import_from_source()` as a class method instead. + +### Fixed + +- Export to DQX: Correct DQX format for global-level quality check of data contract export. (#877) +- Import the table tags from a open data contract spec v3 (#895) +- dbt export: Enhanced model-level primaryKey support with automatic test generation for single and multiple column primary keys (#898) +- ODCS: field discarded when no logicalType defined (#891) + +### Removed + +- Removed specific linters, as the linters did not support ODCS (#913) + ## [0.10.35] - 2025-08-25 ### Added @@ -19,7 +198,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Improved regex for extracting Azure storage account names from URLs with containerName@storageAccountName format (#848) - JSON Schema Check: Add globbing support for local JSON files - +- Fixed server section rendering for markdown exporter ## [0.10.34] - 2025-08-06 @@ -74,7 +253,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - `pytest tests\test_api.py`: Fixed an issue where special characters were not read correctly from file. -- `datacontract export --format mermaid`: Fixed an issue where the `mermaid` export did not handle references correctly +- `datacontract export --format mermaid`: Fixed an issue where the `mermaid` export did not handle references correctly ## [0.10.28] - 2025-06-05 @@ -287,7 +466,7 @@ Code for proto to datacontract (#696) ### Fixed - SQL Server: cannot escape reserved word on model (#557) -- Export dbt-staging-sql error on multi models contracts (#587) +- Export dbt-staging-sql error on multi models contracts (#587) ### Removed - OpenTelemetry publisher, as it was hardly used diff --git a/README.md b/README.md index 94fcd8715..c7ff8da06 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,10 @@ Slack Status

-The `datacontract` CLI is an open-source command-line tool for working with data contracts. -It uses data contract YAML files as [Data Contract Specification](https://datacontract.com/) or [ODCS](https://bitol-io.github.io/open-data-contract-standard/latest/) to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library. +The `datacontract` CLI is an open-source command-line tool for working with [data contracts](https://datacontract.com). +It natively supports the [Open Data Contract Standard](https://bitol-io.github.io/open-data-contract-standard/latest/) to lint data contracts, connect to data sources and execute schema and quality tests, and export to different formats. +The tool is written in Python. +It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library. ![Main features of the Data Contract CLI](datacontractcli.png) @@ -17,133 +19,125 @@ It uses data contract YAML files as [Data Contract Specification](https://dataco ## Getting started Let's look at this data contract: -[https://datacontract.com/examples/orders-latest/datacontract.yaml](https://datacontract.com/examples/orders-latest/datacontract.yaml) +[https://datacontract.com/orders-v1.odcs.yaml](https://datacontract.com/orders-v1.odcs.yaml) -We have a _servers_ section with endpoint details to the S3 bucket, _models_ for the structure of the data, _servicelevels_ and _quality_ attributes that describe the expected freshness and number of rows. +We have a _servers_ section with endpoint details for a Postgres database, _schema_ for the structure and semantics of the data, _service levels_ and _quality_ attributes that describe the expected freshness and number of rows. -This data contract contains all information to connect to S3 and check that the actual data meets the defined schema and quality requirements. We can use this information to test if the actual data product in S3 is compliant to the data contract. +This data contract contains all the information needed to connect to the database and check that the actual data meets the defined schema specification and quality expectations. +We can use this information to test if the actual data product is compliant with the data contract.
-Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI (or use the [Docker image](#docker)), +Let's use [uv](https://docs.astral.sh/uv/) to install the CLI (or use the [Docker image](#docker)), ```bash -$ python3 -m pip install 'datacontract-cli[all]' +$ uv tool install --python python3.11 --upgrade 'datacontract-cli[all]' ``` -now, let's run the tests: +Now, let's run the tests: ```bash -$ datacontract test https://datacontract.com/examples/orders-latest/datacontract.yaml +$ export DATACONTRACT_POSTGRES_USERNAME=datacontract_cli.egzhawjonpfweuutedfy +$ export DATACONTRACT_POSTGRES_PASSWORD=jio10JuQfDfl9JCCPdaCCpuZ1YO +$ datacontract test https://datacontract.com/orders-v1.odcs.yaml # returns: -Testing https://datacontract.com/examples/orders-latest/datacontract.yaml -╭────────┬─────────────────────────────────────────────────────────────────────┬───────────────────────────────┬─────────╮ -│ Result │ Check │ Field │ Details │ -├────────┼─────────────────────────────────────────────────────────────────────┼───────────────────────────────┼─────────┤ -│ passed │ Check that JSON has valid schema │ orders │ │ -│ passed │ Check that JSON has valid schema │ line_items │ │ -│ passed │ Check that field order_id is present │ orders │ │ -│ passed │ Check that field order_timestamp is present │ orders │ │ -│ passed │ Check that field order_total is present │ orders │ │ -│ passed │ Check that field customer_id is present │ orders │ │ -│ passed │ Check that field customer_email_address is present │ orders │ │ -│ passed │ row_count >= 5000 │ orders │ │ -│ passed │ Check that required field order_id has no null values │ orders.order_id │ │ -│ passed │ Check that unique field order_id has no duplicate values │ orders.order_id │ │ -│ passed │ duplicate_count(order_id) = 0 │ orders.order_id │ │ -│ passed │ Check that required field order_timestamp has no null values │ orders.order_timestamp │ │ -│ passed │ freshness(order_timestamp) < 24h │ orders.order_timestamp │ │ -│ passed │ Check that required field order_total has no null values │ orders.order_total │ │ -│ passed │ Check that required field customer_email_address has no null values │ orders.customer_email_address │ │ -│ passed │ Check that field lines_item_id is present │ line_items │ │ -│ passed │ Check that field order_id is present │ line_items │ │ -│ passed │ Check that field sku is present │ line_items │ │ -│ passed │ values in (order_id) must exist in orders (order_id) │ line_items.order_id │ │ -│ passed │ row_count >= 5000 │ line_items │ │ -│ passed │ Check that required field lines_item_id has no null values │ line_items.lines_item_id │ │ -│ passed │ Check that unique field lines_item_id has no duplicate values │ line_items.lines_item_id │ │ -╰────────┴─────────────────────────────────────────────────────────────────────┴───────────────────────────────┴─────────╯ -🟢 data contract is valid. Run 22 checks. Took 6.739514 seconds. -``` - -Voilà, the CLI tested that the _datacontract.yaml_ itself is valid, all records comply with the schema, and all quality attributes are met. 
- -We can also use the datacontract.yaml to export in many [formats](#format), e.g., to generate a SQL DDL: +Testing https://datacontract.com/orders-v1.odcs.yaml +Server: production (type=postgres, host=aws-1-eu-central-2.pooler.supabase.com, port=6543, database=postgres, schema=dp_orders_v1) +╭────────┬──────────────────────────────────────────────────────────┬─────────────────────────┬─────────╮ +│ Result │ Check │ Field │ Details │ +├────────┼──────────────────────────────────────────────────────────┼─────────────────────────┼─────────┤ +│ passed │ Check that field 'line_item_id' is present │ line_items.line_item_id │ │ +│ passed │ Check that field line_item_id has type UUID │ line_items.line_item_id │ │ +│ passed │ Check that field line_item_id has no missing values │ line_items.line_item_id │ │ +│ passed │ Check that field 'order_id' is present │ line_items.order_id │ │ +│ passed │ Check that field order_id has type UUID │ line_items.order_id │ │ +│ passed │ Check that field 'price' is present │ line_items.price │ │ +│ passed │ Check that field price has type INTEGER │ line_items.price │ │ +│ passed │ Check that field price has no missing values │ line_items.price │ │ +│ passed │ Check that field 'sku' is present │ line_items.sku │ │ +│ passed │ Check that field sku has type TEXT │ line_items.sku │ │ +│ passed │ Check that field sku has no missing values │ line_items.sku │ │ +│ passed │ Check that field 'customer_id' is present │ orders.customer_id │ │ +│ passed │ Check that field customer_id has type TEXT │ orders.customer_id │ │ +│ passed │ Check that field customer_id has no missing values │ orders.customer_id │ │ +│ passed │ Check that field 'order_id' is present │ orders.order_id │ │ +│ passed │ Check that field order_id has type UUID │ orders.order_id │ │ +│ passed │ Check that field order_id has no missing values │ orders.order_id │ │ +│ passed │ Check that unique field order_id has no duplicate values │ orders.order_id │ │ +│ passed │ Check that field 'order_status' is present │ orders.order_status │ │ +│ passed │ Check that field order_status has type TEXT │ orders.order_status │ │ +│ passed │ Check that field 'order_timestamp' is present │ orders.order_timestamp │ │ +│ passed │ Check that field order_timestamp has type TIMESTAMPTZ │ orders.order_timestamp │ │ +│ passed │ Check that field 'order_total' is present │ orders.order_total │ │ +│ passed │ Check that field order_total has type INTEGER │ orders.order_total │ │ +│ passed │ Check that field order_total has no missing values │ orders.order_total │ │ +╰────────┴──────────────────────────────────────────────────────────┴─────────────────────────┴─────────╯ +🟢 data contract is valid. Run 25 checks. Took 3.938887 seconds. +``` + +Voilà, the CLI tested that the YAML itself is valid, all records comply with the schema, and all quality attributes are met. 
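The same checks can also be run from Python through the programmatic API shown in the Programmatic (Python) section below. A minimal sketch, assuming the contract has been downloaded to a local `orders-v1.odcs.yaml` and the `DATACONTRACT_POSTGRES_*` environment variables from the example above are set:

```python
from datacontract.data_contract import DataContract

# Uses the same contract and credentials as the CLI example above;
# the Postgres username/password are read from the environment variables.
data_contract = DataContract(data_contract_file="orders-v1.odcs.yaml")
run = data_contract.test()

if not run.has_passed():
    print("Data quality validation failed.")
```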
+ +We can also use the data contract metadata to export in many [formats](#format), e.g., to generate a SQL DDL: ```bash -$ datacontract export --format sql https://datacontract.com/examples/orders-latest/datacontract.yaml +$ datacontract export --format sql https://datacontract.com/orders-v1.odcs.yaml # returns: --- Data Contract: urn:datacontract:checkout:orders-latest --- SQL Dialect: snowflake +-- Data Contract: orders +-- SQL Dialect: postgres CREATE TABLE orders ( - order_id TEXT not null primary key, - order_timestamp TIMESTAMP_TZ not null, - order_total NUMBER not null, - customer_id TEXT, - customer_email_address TEXT not null, - processed_timestamp TIMESTAMP_TZ not null + order_id None not null primary key, + customer_id text not null, + order_total integer not null, + order_timestamp None, + order_status text ); CREATE TABLE line_items ( - lines_item_id TEXT not null primary key, - order_id TEXT, - sku TEXT + line_item_id None not null primary key, + sku text not null, + price integer not null, + order_id None ); ``` Or generate an HTML export: ```bash -$ datacontract export --format html https://datacontract.com/examples/orders-latest/datacontract.yaml > datacontract.html +$ datacontract export --format html --output orders-v1.odcs.html https://datacontract.com/orders-v1.odcs.yaml ``` -which will create this [HTML export](https://datacontract.com/examples/orders-latest/datacontract.html). +[//]: # (which will create this [HTML export](https://datacontract.com/examples/orders-latest/datacontract.html).) ## Usage ```bash -# create a new data contract from example and write it to datacontract.yaml -$ datacontract init datacontract.yaml +# create a new data contract from example and write it to odcs.yaml +$ datacontract init odcs.yaml -# lint the datacontract.yaml -$ datacontract lint datacontract.yaml +# lint the odcs.yaml +$ datacontract lint odcs.yaml # execute schema and quality checks (define credentials as environment variables) -$ datacontract test datacontract.yaml +$ datacontract test odcs.yaml # export data contract as html (other formats: avro, dbt, dbt-sources, dbt-staging-sql, jsonschema, odcs, rdf, sql, sodacl, terraform, ...) -$ datacontract export --format html datacontract.yaml --output datacontract.html - -# export data contract to ODCS -$ datacontract export --format odcs datacontract.yaml --output odcs.yaml - -# import ODCS to data contract -$ datacontract import --format odcs odcs.yaml --output datacontract.yaml +$ datacontract export --format html datacontract.yaml --output odcs.html # import sql (other formats: avro, glue, bigquery, jsonschema, excel ...) -$ datacontract import --format sql --source my-ddl.sql --dialect postgres --output datacontract.yaml +$ datacontract import --format sql --source my-ddl.sql --dialect postgres --output odcs.yaml # import from Excel template -$ datacontract import --format excel --source odcs.xlsx --output datacontract.yaml +$ datacontract import --format excel --source odcs.xlsx --output odcs.yaml # export to Excel template -$ datacontract export --format excel --output odcs.xlsx datacontract.yaml - -# find differences between two data contracts -$ datacontract diff datacontract-v1.yaml datacontract-v2.yaml - -# find differences between two data contracts categorized into error, warning, and info. -$ datacontract changelog datacontract-v1.yaml datacontract-v2.yaml - -# fail pipeline on breaking changes. Uses changelog internally and showing only error and warning. 
-$ datacontract breaking datacontract-v1.yaml datacontract-v2.yaml +$ datacontract export --format excel --output odcs.xlsx odcs.yaml ``` ## Programmatic (Python) ```python from datacontract.data_contract import DataContract -data_contract = DataContract(data_contract_file="datacontract.yaml") +data_contract = DataContract(data_contract_file="odcs.yaml") run = data_contract.test() if not run.has_passed(): print("Data quality validation failed.") @@ -163,6 +157,14 @@ Choose the most appropriate installation method for your needs: ### uv +The preferred way to install is [uv](https://docs.astral.sh/uv/): + +``` +uv tool install --python python3.11 --upgrade 'datacontract-cli[all]' +``` + +### uvx + If you have [uv](https://docs.astral.sh/uv/) installed, you can run datacontract-cli directly without installing: ``` @@ -170,7 +172,7 @@ uv run --with 'datacontract-cli[all]' datacontract --version ``` ### pip -Python 3.10, 3.11, and 3.12 are supported. We recommend to use Python 3.11. +Python 3.10, 3.11, and 3.12 are supported. We recommend using Python 3.11. ```bash python3 -m pip install 'datacontract-cli[all]' @@ -223,7 +225,7 @@ The CLI tool defines several optional dependencies (also known as extras) that c With _all_, all server dependencies are included. ```bash -pip install datacontract-cli[all] +uv tool install --python python3.11 --upgrade 'datacontract-cli[all]' ``` A list of available extras: @@ -234,6 +236,7 @@ A list of available extras: | Avro Support | `pip install datacontract-cli[avro]` | | Google BigQuery | `pip install datacontract-cli[bigquery]` | | Databricks Integration | `pip install datacontract-cli[databricks]` | +| DuckDB (local/S3/GCS/Azure file testing) | `pip install datacontract-cli[duckdb]` | | Iceberg | `pip install datacontract-cli[iceberg]` | | Kafka Integration | `pip install datacontract-cli[kafka]` | | PostgreSQL Integration | `pip install datacontract-cli[postgres]` | @@ -241,6 +244,7 @@ A list of available extras: | Snowflake Integration | `pip install datacontract-cli[snowflake]` | | Microsoft SQL Server | `pip install datacontract-cli[sqlserver]` | | Trino | `pip install datacontract-cli[trino]` | +| Impala | `pip install datacontract-cli[impala]` | | dbt | `pip install datacontract-cli[dbt]` | | DBML | `pip install datacontract-cli[dbml]` | | Parquet | `pip install datacontract-cli[parquet]` | @@ -258,9 +262,6 @@ Commands - [test](#test) - [export](#export) - [import](#import) -- [breaking](#breaking) -- [changelog](#changelog) -- [diff](#diff) - [catalog](#catalog) - [publish](#publish) - [api](#api) @@ -272,6 +273,7 @@ Commands Create an empty data contract. + ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ │ location [LOCATION] The location of the data contract file to create. │ │ [default: datacontract.yaml] │ @@ -280,6 +282,7 @@ Commands │ --template TEXT URL of a template or data contract [default: None] │ │ --overwrite --no-overwrite Replace the existing datacontract.yaml │ │ [default: no-overwrite] │ +│ --debug --no-debug Enable debug logging [default: no-debug] │ │ --help Show this message and exit. │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ @@ -292,20 +295,23 @@ Commands Validate that the datacontract.yaml is correctly formatted. + ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ │ location [LOCATION] The location (url or path) of the data contract yaml. 
│ │ [default: datacontract.yaml] │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ --schema TEXT The location (url or path) of the Data Contract Specification │ -│ JSON Schema │ -│ [default: None] │ -│ --output PATH Specify the file path where the test results should be written │ -│ to (e.g., './test-results/TEST-datacontract.xml'). If no path is │ -│ provided, the output will be printed to stdout. │ -│ [default: None] │ -│ --output-format [junit] The target format for the test results. [default: None] │ -│ --help Show this message and exit. │ +│ --schema TEXT The location (url or path) of the ODCS JSON Schema │ +│ [default: None] │ +│ --output PATH Specify the file path where the test results should be │ +│ written to (e.g., │ +│ './test-results/TEST-datacontract.xml'). If no path is │ +│ provided, the output will be printed to stdout. │ +│ [default: None] │ +│ --output-format [junit] The target format for the test results. │ +│ [default: None] │ +│ --debug --no-debug Enable debug logging [default: no-debug] │ +│ --help Show this message and exit. │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -317,14 +323,14 @@ Commands Run schema and quality tests on configured servers. + ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ │ location [LOCATION] The location (url or path) of the data contract yaml. │ │ [default: datacontract.yaml] │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ │ --schema TEXT The location (url or path) of │ -│ the Data Contract Specification │ -│ JSON Schema │ +│ the ODCS JSON Schema │ │ [default: None] │ │ --server TEXT The server configuration to run │ │ the schema and quality tests. │ @@ -334,12 +340,13 @@ Commands │ `production`, or `all` for all │ │ servers (default). │ │ [default: all] │ -│ --publish-test-results --no-publish-test-results Publish the results after the │ -│ test │ +│ --publish-test-results --no-publish-test-results Deprecated. Use publish │ +│ parameter. Publish the results │ +│ after the test │ │ [default: │ │ no-publish-test-results] │ -│ --publish TEXT DEPRECATED. The url to publish │ -│ the results after the test. │ +│ --publish TEXT The url to publish the results │ +│ after the test. │ │ [default: None] │ │ --output PATH Specify the file path where the │ │ test results should be written │ @@ -353,6 +360,8 @@ Commands │ --ssl-verification --no-ssl-verification SSL verification when publishing │ │ the data contract. │ │ [default: ssl-verification] │ +│ --debug --no-debug Enable debug logging │ +│ [default: no-debug] │ │ --help Show this message and exit. 
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ @@ -379,6 +388,7 @@ Supported server types: - [bigquery](#bigquery) - [azure](#azure) - [sqlserver](#sqlserver) +- [oracle](#oracle) - [databricks](#databricks) - [databricks (programmatic)](#databricks-programmatic) - [dataframe (programmatic)](#dataframe-programmatic) @@ -386,6 +396,7 @@ Supported server types: - [kafka](#kafka) - [postgres](#postgres) - [trino](#trino) +- [impala](#impala) - [api](#api) - [local](#local) @@ -463,7 +474,7 @@ servers: regionName: eu-central-1 stagingDir: s3://my-bucket/athena-results/ models: - my_table: # corresponds to a table of view name + my_table: # corresponds to a table or view name type: table fields: my_column_1: # corresponds to a column @@ -548,8 +559,7 @@ datacontract.yaml servers: production: type: azure - storageAccount: datameshdatabricksdemo - location: abfss://dataproducts/inventory_events/*.parquet + location: abfss://datameshdatabricksdemo.dfs.core.windows.net/inventory_events/*.parquet format: parquet ``` @@ -598,6 +608,56 @@ models: | `DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION` | `True` | Use windows authentication, instead of login | | `DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE` | `True` | Trust self-signed certificate | | `DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION` | `True` | Use SSL | +| `DATACONTRACT_SQLSERVER_DRIVER` | `ODBC Driver 18 for SQL Server` | ODBC driver name | + + + + +#### Oracle + +Data Contract CLI can test data in Oracle Database. + +##### Example + +datacontract.yaml +```yaml +servers: + oracle: + type: oracle + host: localhost + port: 1521 + service_name: ORCL + schema: ADMIN +models: + my_table_1: # corresponds to a table + type: table + fields: + my_column_1: # corresponds to a column + type: decimal + description: Decimal number + my_column_2: # corresponds to another column + type: text + description: Unicode text string + config: + oracleType: NVARCHAR2 # optional: can be used to explicitly define the type used in the database + # if not set a default mapping will be used +``` + +##### Environment Variables + +These environment variable specify the credentials used by the datacontract tool to connect to the database. +If you've started the database from a container, e.g. [oracle-free](https://hub.docker.com/r/gvenzl/oracle-free) +this should match either `system` and what you specified as `ORACLE_PASSWORD` on the container or +alternatively what you've specified under `APP_USER` and `APP_USER_PASSWORD`. +If you require thick mode to connect to the database, you need to have an Oracle Instant Client +installed on the system and specify the path to the installation within the environment variable +`DATACONTRACT_ORACLE_CLIENT_DIR`. + +| Environment Variable | Example | Description | +|--------------------------------------------------|--------------------|--------------------------------------------| +| `DATACONTRACT_ORACLE_USERNAME` | `system` | Username | +| `DATACONTRACT_ORACLE_PASSWORD` | `0x162e53` | Password | +| `DATACONTRACT_ORACLE_CLIENT_DIR` | `C:\oracle\client` | Path to Oracle Instant Client installation | @@ -656,19 +716,37 @@ models: fields: ... ``` -Notebook -```python -%pip install datacontract-cli[databricks] -dbutils.library.restartPython() +##### Installing on Databricks Compute -from datacontract.data_contract import DataContract +**Important:** When using Databricks LTS ML runtimes (15.4, 16.4), installing via `%pip install` in notebooks can cause issues. 
-data_contract = DataContract( - data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml", - spark=spark) -run = data_contract.test() -run.result -``` +**Recommended approach:** Use Databricks' native library management instead: + +1. **Create or configure your compute cluster:** + - Navigate to **Compute** in the Databricks workspace + - Create a new cluster or select an existing one + - Go to the **Libraries** tab + +2. **Add the datacontract-cli library:** + - Click **Install new** + - Select **PyPI** as the library source + - Enter package name: `datacontract-cli[databricks]` + - Click **Install** + +3. **Restart the cluster** to apply the library installation + +4. **Use in your notebook** without additional installation: + ```python + from datacontract.data_contract import DataContract + + data_contract = DataContract( + data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml", + spark=spark) + run = data_contract.test() + run.result + ``` + +Databricks' library management properly resolves dependencies during cluster initialization, rather than at runtime in the notebook. #### Dataframe (programmatic) @@ -848,6 +926,53 @@ models: | `DATACONTRACT_TRINO_PASSWORD` | `mysecretpassword` | Password | +#### Impala + +Data Contract CLI can run Soda checks against an Apache Impala cluster. + +##### Example + +datacontract.yaml +```yaml +servers: + impala: + type: impala + host: my-impala-host + port: 443 + # Optional default database used for Soda scans + database: my_database +models: + my_table_1: # corresponds to a table + type: table + # fields as usual … +``` + +##### Environment Variables + +| Environment Variable | Example | Description | +|------------------------------- |-------------------- |------------- | +| `DATACONTRACT_IMPALA_USERNAME` | `analytics_user` | Username used to connect to Impala | +| `DATACONTRACT_IMPALA_PASSWORD` | `mysecretpassword` | Password for the Impala user | +| `DATACONTRACT_IMPALA_USE_SSL` | `true` | Whether to use SSL; defaults to true if unset | +| `DATACONTRACT_IMPALA_AUTH_MECHANISM` | `LDAP` | Authentication mechanism; defaults to LDAP | +| `DATACONTRACT_IMPALA_USE_HTTP_TRANSPORT` | `true` | Whether to use the HTTP transport; defaults to true | +| `DATACONTRACT_IMPALA_HTTP_PATH` | `cliservice` | HTTP path for the Impala service; defaults to cliservice | + +### Type-mapping note (logicalType → Impala type) + +If `physicalType` is not specified in the schema, we recommend the following mapping from `logicalType` to Impala column types: + +|logicalType | Recommended Impala type | +|------------|-------------------------| +| `integer` | `INT` or `BIGINT` | +| `number` | `DOUBLE`/`decimal(..)` | +| `string` | `STRING` or `VARCHAR` | +| `boolean` | `BOOLEAN` | +| `date` | `DATE` | +| `datetime` | `TIMESTAMP` | + +This keeps the Impala schema compatible with the expectations of the Soda checks generated by datacontract-cli. + #### API Data Contract CLI can test APIs that return data in JSON format. @@ -912,43 +1037,48 @@ models: Convert data contract to a specific format. Saves to file specified by `output` option if present, otherwise prints to stdout. + ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ │ location [LOCATION] The location (url or path) of the data contract yaml. 
│ │ [default: datacontract.yaml] │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ * --format [jsonschema|pydantic-model|sodacl|db The export format. [default: None] │ -│ t|dbt-sources|dbt-staging-sql|odcs|r [required] │ -│ df|avro|protobuf|great-expectations| │ -│ terraform|avro-idl|sql|sql-query|mer │ -│ maid|html|go|bigquery|dbml|spark|sql │ -│ alchemy|data-caterer|dcs|markdown|ic │ -│ eberg|custom|excel|dqx] │ -│ --output PATH Specify the file path where the │ -│ exported data will be saved. If no │ -│ path is provided, the output will be │ -│ printed to stdout. │ -│ [default: None] │ -│ --server TEXT The server name to export. │ -│ [default: None] │ -│ --model TEXT Use the key of the model in the data │ -│ contract yaml file to refer to a │ -│ model, e.g., `orders`, or `all` for │ -│ all models (default). │ -│ [default: all] │ -│ --schema TEXT The location (url or path) of the │ -│ Data Contract Specification JSON │ -│ Schema │ -│ [default: None] │ -│ --engine TEXT [engine] The engine used for great │ -│ expection run. │ -│ [default: None] │ -│ --template PATH The file path or URL of a template. │ -│ For Excel format: path/URL to custom │ -│ Excel template. For custom format: │ -│ path to Jinja template. │ -│ [default: None] │ -│ --help Show this message and exit. │ +│ * --format [jsonschema|pydantic-model|sod The export format. │ +│ acl|dbt|dbt-sources|dbt-stagin [default: None] │ +│ g-sql|odcs|rdf|avro|protobuf|g [required] │ +│ reat-expectations|avro-idl|sql │ +│ |sql-query|mermaid|html|go|big │ +│ query|dbml|spark|sqlalchemy|da │ +│ ta-caterer|dcs|markdown|iceber │ +│ g|custom|excel|dqx] │ +│ --output PATH Specify the file path where │ +│ the exported data will be │ +│ saved. If no path is provided, │ +│ the output will be printed to │ +│ stdout. │ +│ [default: None] │ +│ --server TEXT The server name to export. │ +│ [default: None] │ +│ --schema-name TEXT The name of the schema to │ +│ export, e.g., `orders`, or │ +│ `all` for all schemas │ +│ (default). │ +│ [default: all] │ +│ --schema TEXT The location (url or path) of │ +│ the ODCS JSON Schema │ +│ [default: None] │ +│ --engine TEXT [engine] The engine used for │ +│ great expection run. │ +│ [default: None] │ +│ --template PATH The file path or URL of a │ +│ template. For Excel format: │ +│ path/URL to custom Excel │ +│ template. For custom format: │ +│ path to Jinja template. │ +│ [default: None] │ +│ --debug --no-debug Enable debug logging │ +│ [default: no-debug] │ +│ --help Show this message and exit. │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ RDF Options ────────────────────────────────────────────────────────────────────────────────────╮ │ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. [default: None] │ @@ -1017,7 +1147,7 @@ spark.conf.set(“spark.databricks.delta.schema.typeCheck.enabled”, “false #### Great Expectations The `export` function transforms a specified data contract into a comprehensive Great Expectations JSON suite. -If the contract includes multiple models, you need to specify the names of the model you wish to export. +If the contract includes multiple models, you need to specify the names of the schema/models you wish to export. 
```shell datacontract export datacontract.yaml --format great-expectations --model orders @@ -1075,7 +1205,7 @@ logical data types are exported. #### DBT & DBT-SOURCES -The export funciton converts the datacontract to dbt models in YAML format, with support for SQL dialects. +The export function converts the datacontract to dbt models in YAML format, with support for SQL dialects. If a server is selected via the `--server` option (based on the `type` of that server) then the DBT column `data_types` match the expected data types of the server. If no server is selected, then it defaults to `snowflake`. @@ -1268,7 +1398,7 @@ FROM {{ ref('orders') }} ``` -#### ODCS Excel Templace +#### ODCS Excel Template The `export` function converts a data contract into an ODCS (Open Data Contract Standard) Excel template. This creates a user-friendly Excel spreadsheet that can be used for authoring, sharing, and managing data contracts using the familiar Excel interface. @@ -1292,85 +1422,89 @@ For more information about the Excel template structure, visit the [ODCS Excel T Create a data contract from the given source location. Saves to file specified by `output` option if present, otherwise prints to stdout. + ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ * --format [sql|avro|dbt|dbml|glue|jsonsc The format of the source file. │ -│ hema|json|bigquery|odcs|unity| [default: None] │ -│ spark|iceberg|parquet|csv|prot [required] │ -│ obuf|excel] │ -│ --output PATH Specify the file path where │ -│ the Data Contract will be │ -│ saved. If no path is provided, │ -│ the output will be printed to │ -│ stdout. │ -│ [default: None] │ -│ --source TEXT The path to the file that │ -│ should be imported. │ -│ [default: None] │ -│ --spec [datacontract_specification|od The format of the data │ -│ cs] contract to import. │ -│ [default: │ -│ datacontract_specification] │ -│ --dialect TEXT The SQL dialect to use when │ -│ importing SQL files, e.g., │ -│ postgres, tsql, bigquery. │ -│ [default: None] │ -│ --glue-table TEXT List of table ids to import │ -│ from the Glue Database (repeat │ -│ for multiple table ids, leave │ -│ empty for all tables in the │ -│ dataset). │ -│ [default: None] │ -│ --bigquery-project TEXT The bigquery project id. │ -│ [default: None] │ -│ --bigquery-dataset TEXT The bigquery dataset id. │ -│ [default: None] │ -│ --bigquery-table TEXT List of table ids to import │ -│ from the bigquery API (repeat │ -│ for multiple table ids, leave │ -│ empty for all tables in the │ -│ dataset). │ -│ [default: None] │ -│ --unity-table-full-name TEXT Full name of a table in the │ -│ unity catalog │ -│ [default: None] │ -│ --dbt-model TEXT List of models names to import │ -│ from the dbt manifest file │ -│ (repeat for multiple models │ -│ names, leave empty for all │ -│ models in the dataset). │ -│ [default: None] │ -│ --dbml-schema TEXT List of schema names to import │ -│ from the DBML file (repeat for │ -│ multiple schema names, leave │ -│ empty for all tables in the │ -│ file). │ -│ [default: None] │ -│ --dbml-table TEXT List of table names to import │ -│ from the DBML file (repeat for │ -│ multiple table names, leave │ -│ empty for all tables in the │ -│ file). │ -│ [default: None] │ -│ --iceberg-table TEXT Table name to assign to the │ -│ model created from the Iceberg │ -│ schema. 
│ -│ [default: None] │ -│ --template TEXT The location (url or path) of │ -│ the Data Contract │ -│ Specification Template │ -│ [default: None] │ -│ --schema TEXT The location (url or path) of │ -│ the Data Contract │ -│ Specification JSON Schema │ -│ [default: None] │ -│ --owner TEXT The owner or team responsible │ -│ for managing the data │ -│ contract. │ -│ [default: None] │ -│ --id TEXT The identifier for the the │ -│ data contract. │ -│ [default: None] │ -│ --help Show this message and exit. │ +│ * --format [sql|avro|dbt|dbml|glue| The format of the source │ +│ jsonschema|json|bigquery file. │ +│ |odcs|unity|spark|iceber [default: None] │ +│ g|parquet|csv|protobuf|e [required] │ +│ xcel] │ +│ --output PATH Specify the file path │ +│ where the Data Contract │ +│ will be saved. If no path │ +│ is provided, the output │ +│ will be printed to stdout. │ +│ [default: None] │ +│ --source TEXT The path to the file that │ +│ should be imported. │ +│ [default: None] │ +│ --dialect TEXT The SQL dialect to use │ +│ when importing SQL files, │ +│ e.g., postgres, tsql, │ +│ bigquery. │ +│ [default: None] │ +│ --glue-table TEXT List of table ids to │ +│ import from the Glue │ +│ Database (repeat for │ +│ multiple table ids, leave │ +│ empty for all tables in │ +│ the dataset). │ +│ [default: None] │ +│ --bigquery-project TEXT The bigquery project id. │ +│ [default: None] │ +│ --bigquery-dataset TEXT The bigquery dataset id. │ +│ [default: None] │ +│ --bigquery-table TEXT List of table ids to │ +│ import from the bigquery │ +│ API (repeat for multiple │ +│ table ids, leave empty for │ +│ all tables in the │ +│ dataset). │ +│ [default: None] │ +│ --unity-table-full-name TEXT Full name of a table in │ +│ the unity catalog │ +│ [default: None] │ +│ --dbt-model TEXT List of models names to │ +│ import from the dbt │ +│ manifest file (repeat for │ +│ multiple models names, │ +│ leave empty for all models │ +│ in the dataset). │ +│ [default: None] │ +│ --dbml-schema TEXT List of schema names to │ +│ import from the DBML file │ +│ (repeat for multiple │ +│ schema names, leave empty │ +│ for all tables in the │ +│ file). │ +│ [default: None] │ +│ --dbml-table TEXT List of table names to │ +│ import from the DBML file │ +│ (repeat for multiple table │ +│ names, leave empty for all │ +│ tables in the file). │ +│ [default: None] │ +│ --iceberg-table TEXT Table name to assign to │ +│ the model created from the │ +│ Iceberg schema. │ +│ [default: None] │ +│ --template TEXT The location (url or path) │ +│ of the ODCS template │ +│ [default: None] │ +│ --schema TEXT The location (url or path) │ +│ of the ODCS JSON Schema │ +│ [default: None] │ +│ --owner TEXT The owner or team │ +│ responsible for managing │ +│ the data contract. │ +│ [default: None] │ +│ --id TEXT The identifier for the the │ +│ data contract. │ +│ [default: None] │ +│ --debug --no-debug Enable debug logging │ +│ [default: no-debug] │ +│ --help Show this message and │ +│ exit. 
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -1385,38 +1519,25 @@ datacontract import --format sql --source my_ddl.sql --dialect postgres --output Available import options: -| Type | Description | Status | -|--------------------|------------------------------------------------|--------| -| `avro` | Import from AVRO schemas | ✅ | -| `bigquery` | Import from BigQuery Schemas | ✅ | -| `csv` | Import from CSV File | ✅ | -| `dbml` | Import from DBML models | ✅ | -| `dbt` | Import from dbt models | ✅ | -| `excel` | Import from ODCS Excel Template | ✅ | -| `glue` | Import from AWS Glue DataCatalog | ✅ | -| `iceberg` | Import from an Iceberg JSON Schema Definition | partial | -| `jsonschema` | Import from JSON Schemas | ✅ | -| `odcs` | Import from Open Data Contract Standard (ODCS) | ✅ | -| `parquet` | Import from Parquet File Metadata | ✅ | -| `protobuf` | Import from Protobuf schemas | ✅ | -| `spark` | Import from Spark StructTypes, Variant | ✅ | -| `sql` | Import from SQL DDL | ✅ | -| `unity` | Import from Databricks Unity Catalog | partial | -| `excel` | Import from ODCS Excel Template | ✅ | -| Missing something? | Please create an issue on GitHub | TBD | - - -#### ODCS - -Import from Open Data Contract Standard (ODCS) v2 or v3. -The importer automatically detects the ODCS version and imports the data contract. +| Type | Description | Status | +|--------------------|-----------------------------------------------|---------| +| `avro` | Import from AVRO schemas | ✅ | +| `bigquery` | Import from BigQuery Schemas | ✅ | +| `csv` | Import from CSV File | ✅ | +| `dbml` | Import from DBML models | ✅ | +| `dbt` | Import from dbt models | ✅ | +| `excel` | Import from ODCS Excel Template | ✅ | +| `glue` | Import from AWS Glue DataCatalog | ✅ | +| `iceberg` | Import from an Iceberg JSON Schema Definition | partial | +| `jsonschema` | Import from JSON Schemas | ✅ | +| `parquet` | Import from Parquet File Metadata | ✅ | +| `protobuf` | Import from Protobuf schemas | ✅ | +| `spark` | Import from Spark StructTypes, Variant | ✅ | +| `sql` | Import from SQL DDL | ✅ | +| `unity` | Import from Databricks Unity Catalog | partial | +| `excel` | Import from ODCS Excel Template | ✅ | +| Missing something? 
| Please create an issue on GitHub | TBD | -Examples: - -```bash -# Example import from ODCS -datacontract import --format odcs --source my_data_contract.odcs.yaml -``` #### BigQuery @@ -1444,18 +1565,22 @@ datacontract import --format bigquery --bigquery-project --bigquery ``` #### Unity Catalog - ```bash # Example import from a Unity Catalog JSON file datacontract import --format unity --source my_unity_table.json ``` ```bash -# Example import single table from Unity Catalog via HTTP endpoint +# Example import single table from Unity Catalog via HTTP endpoint using PAT export DATACONTRACT_DATABRICKS_SERVER_HOSTNAME="https://xyz.cloud.databricks.com" -export DATACONTRACT_DATABRICKS_HTTP_PATH="/sql/1.0/warehouses/b053a331fa014fb4" export DATACONTRACT_DATABRICKS_TOKEN= datacontract import --format unity --unity-table-full-name +``` + Please refer to [Databricks documentation](https://docs.databricks.com/aws/en/dev-tools/auth/unified-auth) on how to set up a profile +```bash +# Example import single table from Unity Catalog via HTTP endpoint using Profile +export DATACONTRACT_DATABRICKS_PROFILE="my-profile" +datacontract import --format unity --unity-table-full-name ``` #### dbt @@ -1475,7 +1600,7 @@ datacontract import --format dbt --source --dbt-model ``` -### Excel +#### Excel Importing from [ODCS Excel Template](https://github.com/datacontract/open-data-contract-standard-excel-template). @@ -1514,20 +1639,20 @@ datacontract import --format spark --source "users,orders" ```bash # Example: Import Spark table -DataContract().import_from_source("spark", "users") -DataContract().import_from_source(format = "spark", source = "users") +DataContract.import_from_source("spark", "users") +DataContract.import_from_source(format = "spark", source = "users") # Example: Import Spark dataframe -DataContract().import_from_source("spark", "users", dataframe = df_user) -DataContract().import_from_source(format = "spark", source = "users", dataframe = df_user) +DataContract.import_from_source("spark", "users", dataframe = df_user) +DataContract.import_from_source(format = "spark", source = "users", dataframe = df_user) # Example: Import Spark table + table description -DataContract().import_from_source("spark", "users", description = "description") -DataContract().import_from_source(format = "spark", source = "users", description = "description") +DataContract.import_from_source("spark", "users", description = "description") +DataContract.import_from_source(format = "spark", source = "users", description = "description") # Example: Import Spark dataframe + table description -DataContract().import_from_source("spark", "users", dataframe = df_user, description = "description") -DataContract().import_from_source(format = "spark", source = "users", dataframe = df_user, description = "description") +DataContract.import_from_source("spark", "users", dataframe = df_user, description = "description") +DataContract.import_from_source(format = "spark", source = "users", dataframe = df_user, description = "description") ``` #### DBML @@ -1592,69 +1717,6 @@ datacontract import --format protobuf --source "test.proto" ``` -### breaking -``` - - Usage: datacontract breaking [OPTIONS] LOCATION_OLD LOCATION_NEW - - Identifies breaking changes between data contracts. Prints to stdout. - -╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ -│ * location_old TEXT The location (url or path) of the old data contract yaml. 
│ -│ [default: None] │ -│ [required] │ -│ * location_new TEXT The location (url or path) of the new data contract yaml. │ -│ [default: None] │ -│ [required] │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ - -``` - -### changelog -``` - - Usage: datacontract changelog [OPTIONS] LOCATION_OLD LOCATION_NEW - - Generate a changelog between data contracts. Prints to stdout. - -╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ -│ * location_old TEXT The location (url or path) of the old data contract yaml. │ -│ [default: None] │ -│ [required] │ -│ * location_new TEXT The location (url or path) of the new data contract yaml. │ -│ [default: None] │ -│ [required] │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ - -``` - -### diff -``` - - Usage: datacontract diff [OPTIONS] LOCATION_OLD LOCATION_NEW - - PLACEHOLDER. Currently works as 'changelog' does. - -╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ -│ * location_old TEXT The location (url or path) of the old data contract yaml. │ -│ [default: None] │ -│ [required] │ -│ * location_new TEXT The location (url or path) of the new data contract yaml. │ -│ [default: None] │ -│ [required] │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ - -``` - ### catalog ``` @@ -1662,14 +1724,16 @@ datacontract import --format protobuf --source "test.proto" Create a html catalog of data contracts. + ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ --files TEXT Glob pattern for the data contract files to include in the catalog. │ -│ Applies recursively to any subfolders. │ -│ [default: *.yaml] │ -│ --output TEXT Output directory for the catalog html files. [default: catalog/] │ -│ --schema TEXT The location (url or path) of the Data Contract Specification JSON Schema │ -│ [default: None] │ -│ --help Show this message and exit. │ +│ --files TEXT Glob pattern for the data contract files to include in the │ +│ catalog. Applies recursively to any subfolders. │ +│ [default: *.yaml] │ +│ --output TEXT Output directory for the catalog html files. [default: catalog/] │ +│ --schema TEXT The location (url or path) of the ODCS JSON Schema │ +│ [default: None] │ +│ --debug --no-debug Enable debug logging [default: no-debug] │ +│ --help Show this message and exit. │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -1689,19 +1753,21 @@ datacontract catalog --files "*.odcs.yaml" Usage: datacontract publish [OPTIONS] [LOCATION] - Publish the data contract to the Data Mesh Manager. 
+ Publish the data contract to the Entropy Data. + ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮ │ location [LOCATION] The location (url or path) of the data contract yaml. │ │ [default: datacontract.yaml] │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ --schema TEXT The location (url or path) of the Data │ -│ Contract Specification JSON Schema │ +│ --schema TEXT The location (url or path) of the ODCS JSON │ +│ Schema │ │ [default: None] │ │ --ssl-verification --no-ssl-verification SSL verification when publishing the data │ │ contract. │ │ [default: ssl-verification] │ +│ --debug --no-debug Enable debug logging [default: no-debug] │ │ --help Show this message and exit. │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ @@ -1713,6 +1779,7 @@ datacontract catalog --files "*.odcs.yaml" Usage: datacontract api [OPTIONS] Start the datacontract CLI as server application with REST API. + The OpenAPI documentation as Swagger UI is available on http://localhost:4242. You can execute the commands directly from the Swagger UI. To protect the API, you can set the environment variable DATACONTRACT_CLI_API_KEY to a secret API @@ -1725,10 +1792,12 @@ datacontract catalog --files "*.odcs.yaml" `datacontract api --port 1234 --root_path /datacontract`. ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ -│ --port INTEGER Bind socket to this port. [default: 4242] │ -│ --host TEXT Bind socket to this host. Hint: For running in docker, set it to 0.0.0.0 │ -│ [default: 127.0.0.1] │ -│ --help Show this message and exit. │ +│ --port INTEGER Bind socket to this port. [default: 4242] │ +│ --host TEXT Bind socket to this host. Hint: For running in docker, set it │ +│ to 0.0.0.0 │ +│ [default: 127.0.0.1] │ +│ --debug --no-debug Enable debug logging [default: no-debug] │ +│ --help Show this message and exit. │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -1737,19 +1806,18 @@ datacontract catalog --files "*.odcs.yaml" | Integration | Option | Description | |-----------------------|------------------------------|---------------------------------------------------------------------------------------------------------------| -| Data Mesh Manager | `--publish` | Push full results to the [Data Mesh Manager API](https://api.datamesh-manager.com/swagger/index.html) | -| Data Contract Manager | `--publish` | Push full results to the [Data Contract Manager API](https://api.datacontract-manager.com/swagger/index.html) | +| Entropy Data | `--publish` | Push full results to the [Entropy Data API](https://api.entropy-data.com/swagger/index.html) | -### Integration with Data Mesh Manager +### Integration with Entropy Data -If you use [Data Mesh Manager](https://datamesh-manager.com/) or [Data Contract Manager](https://datacontract-manager.com/), you can use the data contract URL and append the `--publish` option to send and display the test results. Set an environment variable for your API key. +If you use [Entropy Data](https://entropy-data.com/), you can use the data contract URL to reference to the contract and append the `--publish` option to send and display the test results. Set an environment variable for your API key. 
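The same publish step can also be triggered from the Python API — a minimal sketch, assuming `ENTROPY_DATA_API_KEY` is set in the environment and using only constructor arguments visible in `datacontract/data_contract.py` (`server`, `publish_url`):

```python
from datacontract.data_contract import DataContract

# Run the tests against the "production" server and push the results to the
# Entropy Data test-results endpoint. The API key is expected in the
# ENTROPY_DATA_API_KEY environment variable, as in the CLI example below.
run = DataContract(
    data_contract_file="datacontract.yaml",  # or a URL to the contract
    server="production",
    publish_url="https://api.entropy-data.com/api/test-results",
).test()

# Each executed check carries a name and a result.
for check in run.checks:
    print(check.name, check.result)
```

The equivalent CLI call: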
```bash -# Fetch current data contract, execute tests on production, and publish result to data mesh manager -$ EXPORT DATAMESH_MANAGER_API_KEY=xxx -$ datacontract test https://demo.datamesh-manager.com/demo279750347121/datacontracts/4df9d6ee-e55d-4088-9598-b635b2fdcbbc/datacontract.yaml \ +# Fetch current data contract, execute tests on production, and publish result to entropy data +$ EXPORT ENTROPY_DATA_API_KEY=xxx +$ datacontract test https://demo.entropy-data.com/demo279750347121/datacontracts/4df9d6ee-e55d-4088-9598-b635b2fdcbbc/datacontract.yaml \ --server production \ - --publish https://api.datamesh-manager.com/api/test-results + --publish https://api.entropy-data.com/api/test-results ``` ## Best Practices @@ -1772,8 +1840,7 @@ Create a data contract based on the actual data. This is the fastest way to get $ datacontract test ``` -3. Make sure that all the best practices for a `datacontract.yaml` are met using the linter. You - probably forgot to document some fields and add the terms and conditions. +3. Validate that the `datacontract.yaml` is correctly formatted and adheres to the Data Contract Specification. ```bash $ datacontract lint ``` @@ -1794,8 +1861,7 @@ Create a data contract based on the requirements from use cases. ``` 2. Create the model and quality guarantees based on your business requirements. Fill in the terms, - descriptions, etc. Make sure you follow all best practices for a `datacontract.yaml` using the - linter. + descriptions, etc. Validate that your `datacontract.yaml` is correctly formatted. ```bash $ datacontract lint ``` @@ -1815,34 +1881,6 @@ Create a data contract based on the requirements from use cases. $ datacontract test ``` -### Schema Evolution - -#### Non-breaking Changes -Examples: adding models or fields - -- Add the models or fields in the datacontract.yaml -- Increment the minor version of the datacontract.yaml on any change. Simply edit the datacontract.yaml for this. -- You need a policy that these changes are non-breaking. That means that one cannot use the star expression in SQL to query a table under contract. Make the consequences known. -- Fail the build in the Pull Request if a datacontract.yaml accidentally adds a breaking change even despite only a minor version change - ```bash - $ datacontract breaking datacontract-from-pr.yaml datacontract-from-main.yaml - ``` -- Create a changelog of this minor change. - ```bash - $ datacontract changelog datacontract-from-pr.yaml datacontract-from-main.yaml - ``` -#### Breaking Changes -Examples: Removing or renaming models and fields. - -- Remove or rename models and fields in the datacontract.yaml, and any other change that might be part of this new major version of this data contract. -- Increment the major version of the datacontract.yaml for this and create a new file for the major version. The reason being, that one needs to offer an upgrade path for the data consumers from the old to the new major version. -- As data consumers need to migrate, try to reduce the frequency of major versions by making multiple breaking changes together if possible. -- Be aware of the notice period in the data contract as this is the minimum amount of time you have to offer both the old and the new version for a migration path. -- Do not fear making breaking changes with data contracts. It's okay to do them in this controlled way. Really! -- Create a changelog of this major change. 
- ```bash - $ datacontract changelog datacontract-from-pr.yaml datacontract-from-main.yaml - ``` ## Customizing Exporters and Importers @@ -1989,7 +2027,7 @@ if __name__ == "__main__": Output ```yaml -dataContractSpecification: 1.2.0 +dataContractSpecification: 1.2.1 id: uuid-custom info: title: my_custom_imported_data @@ -2009,17 +2047,42 @@ models: ## Development Setup - Install [uv](https://docs.astral.sh/uv/) -- Python base interpreter should be 3.11.x . +- Python base interpreter should be 3.11.x. - Docker engine must be running to execute the tests. ```bash # make sure uv is installed uv python pin 3.11 +uv venv uv pip install -e '.[dev]' uv run ruff check uv run pytest ``` +### Troubleshooting + +#### Windows: Some tests fail + +Run in wsl. (We need to fix the paths in the tests so that normal Windows will work, contributions are appreciated) + +#### PyCharm does not pick up the `.venv` + +This [uv issue](https://github.com/astral-sh/uv/issues/12545) might be relevant. + +Try to sync all groups: + +``` +uv sync --all-groups --all-extras +``` + +#### Errors in tests that use PySpark (e.g. test_test_kafka.py) + +Ensure you have a JDK 17 or 21 installed. Java 25 causes issues. + +``` +java --version +``` + ### Docker Build @@ -2052,27 +2115,6 @@ docker compose run --rm datacontract --version This command runs the container momentarily to check the version of the `datacontract` CLI. The `--rm` flag ensures that the container is automatically removed after the command executes, keeping your environment clean. -## Use with pre-commit - -To run `datacontract-cli` as part of a [pre-commit](https://pre-commit.com/) workflow, add something like the below to the `repos` list in the project's `.pre-commit-config.yaml`: - -```yaml -repos: - - repo: https://github.com/datacontract/datacontract-cli - rev: "v0.10.9" - hooks: - - id: datacontract-lint - - id: datacontract-test - args: ["--server", "production"] -``` - -### Available Hook IDs - -| Hook ID | Description | Dependency | -| ----------------- | -------------------------------------------------- | ---------- | -| datacontract-lint | Runs the lint subcommand. | Python3 | -| datacontract-test | Runs the test subcommand. Please look at | Python3 | -| | [test](#test) section for all available arguments. | | ## Release Steps @@ -2097,8 +2139,7 @@ We are happy to receive your contributions. Propose your change in an issue or d ## Related Tools -- [Data Contract Manager](https://www.datacontract-manager.com/) is a commercial tool to manage data contracts. It contains a web UI, access management, and data governance for a full enterprise data marketplace. -- [Data Contract GPT](https://gpt.datacontract.com) is a custom GPT that can help you write data contracts. +- [Entropy Data](https://www.entropy-data.com/) is a commercial tool to manage data contracts. It contains a web UI, access management, and data governance for a data product marketplace based on data contracts. - [Data Contract Editor](https://editor.datacontract.com) is an editor for Data Contracts, including a live html preview. - [Data Contract Playground](https://data-catering.github.io/data-contract-playground/) allows you to validate and export your data contract to different formats within your browser. 
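For contributors who want a quick end-to-end check of the Python API while working on the code base, here is a minimal pytest-style sketch (file name, contract path, and assertion are illustrative, not part of the repository); `lint()` records its checks, including the JSON-schema check, on the returned `Run`:

```python
# tests/test_smoke_lint.py -- illustrative only
from datacontract.data_contract import DataContract


def test_lint_example_contract():
    # Lint an ODCS contract through the Python API.
    run = DataContract(data_contract_file="datacontract.yaml").lint()
    assert run.checks, "expected at least the schema check"
    for check in run.checks:
        print(check.name, check.result)
```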
diff --git a/datacontract/api.py b/datacontract/api.py index ff5ae82ad..09a940730 100644 --- a/datacontract/api.py +++ b/datacontract/api.py @@ -4,56 +4,199 @@ import typer from fastapi import Body, Depends, FastAPI, HTTPException, Query, status +from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import PlainTextResponse from fastapi.security.api_key import APIKeyHeader from datacontract.data_contract import DataContract, ExportFormat from datacontract.model.run import Run -DATA_CONTRACT_EXAMPLE_PAYLOAD = """dataContractSpecification: 1.2.0 -id: urn:datacontract:checkout:orders-latest -info: - title: Orders Latest - version: 2.0.0 - owner: Sales Team -servers: - production: - type: s3 - location: s3://datacontract-example-orders-latest/v2/{model}/*.json - format: json - delimiter: new_line -models: - orders: - description: One record per order. Includes cancelled and deleted orders. - type: table - fields: - order_id: - type: string +DATA_CONTRACT_EXAMPLE_PAYLOAD = """apiVersion: v3.1.0 +kind: DataContract +id: orders +name: Orders +version: 1.0.0 +status: active +description: + purpose: "Provides order and line item data for analytics and reporting" + usage: "Used by analytics team for sales analysis and business intelligence" + limitations: "Contains only the last 2 years of data" + customProperties: + - property: "sensitivity" + value: "secret" + description: "Data contains personally identifiable information" + authoritativeDefinitions: + - url: "https://entropy-data.com/policies/gdpr-compliance" + type: "businessDefinition" + description: "GDPR compliance policy for handling customer data" +schema: + - name: orders + physicalType: TABLE + description: All historic web shop orders since 2020-01-01. Includes successful and cancelled orders. + properties: + - name: order_id + logicalType: string + description: The internal order id for every orders. Do not show this to a customer. + businessName: Internal Order ID + physicalType: UUID + examples: + - 99e8bb10-3785-4634-9664-8dc79eb69d43 primaryKey: true - order_timestamp: - description: The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful. - type: timestamp + classification: internal required: true + unique: true + - name: customer_id + logicalType: string + description: A reference to the customer number + businessName: Customer Number + physicalType: TEXT examples: - - "2024-09-09T08:30:00Z" - order_total: - description: Total amount the smallest monetary unit (e.g., cents). - type: long + - c123456789 required: true + unique: false + logicalTypeOptions: + minLength: 10 + maxLength: 10 + authoritativeDefinitions: + - type: definition + url: https://example.com/definitions/sales/customer/customer_id + tags: + - pii:true + classification: internal + criticalDataElement: true + - name: order_total + logicalType: integer + description: The order total amount in cents, including tax, after discounts. + Includes shipping costs. + physicalType: INTEGER examples: - - 9999 + - "9999" quality: - - type: sql - description: 95% of all order total values are expected to be between 10 and 499 EUR. - query: | - SELECT quantile_cont(order_total, 0.95) AS percentile_95 - FROM orders - mustBeBetween: [1000, 99900] - customer_id: - description: Unique identifier for the customer. - type: text - minLength: 10 - maxLength: 20 + - type: text + description: The order_total equals the sum of all related line items. 
+ required: true + businessName: Order Amount + - name: order_timestamp + logicalType: timestamp + description: The time including timezone when the order payment was successfully + confirmed. + physicalType: TIMESTAMPTZ + businessName: Order Date + examples: + - "2025-03-01 14:30:00+01" + - name: order_status + businessName: Status + description: The business status of the order + logicalType: string + physicalType: TEXT + examples: + - shipped + quality: + - type: library + description: Ensure that there are no other status values. + metric: invalidValues + arguments: + validValues: + - pending + - paid + - processing + - shipped + - delivered + - cancelled + - refunded + mustBe: 0 + quality: + - type: library + metric: rowCount + mustBeGreaterThan: 100000 + description: If there are less than 100k rows, something is wrong. + - name: line_items + physicalType: table + description: Details for each item in an order + properties: + - name: line_item_id + logicalType: string + description: Unique identifier for the line item + physicalType: UUID + examples: + - 12c9ba21-0c44-4e29-ba72-b8fd01c1be30 + logicalTypeOptions: + format: uuid + required: true + primaryKey: true + - name: sku + logicalType: string + businessName: Stock Keeping Unit + description: Identifier for the purchased product + physicalType: TEXT + examples: + - 111222333 + required: true + - name: price + logicalType: integer + description: Price in cents for this line item including tax + physicalType: INTEGER + examples: + - 9999 + required: true + - name: order_id + required: false + primaryKey: false + logicalType: string + physicalType: UUID + relationships: + - type: foreignKey + to: orders.order_id +servers: + - server: production + environment: prod + type: postgres + host: aws-1-eu-central-2.pooler.supabase.com + port: 6543 + database: postgres + schema: dp_orders_v1 +team: + name: sales + description: This data product is owned by the "Sales" team + members: + - username: john@example.com + name: John Doe + role: Owner + authoritativeDefinitions: + - type: slack + url: https://slack.example.com/teams/sales +roles: + - role: analyst_us + description: Read access for analytics to US orders + - role: analyst_eu + description: Read access for analytics to EU orders +slaProperties: + - property: availability + value: 99.9% + description: Data platform uptime guarantee + - property: retention + value: "1" + unit: year + description: Data will be deleted after 1 year + - property: freshness + value: "24" + unit: hours + description: Within 24 hours of order placement + - property: support + value: business hours + description: Support only during business hours +price: + priceAmount: 0 + priceCurrency: USD + priceUnit: monthly +tags: + - e-commerce + - transactions + - pii +customProperties: + - property: dataPlatformRole + value: role_orders_v1 +contractCreatedTs: "2025-01-15T10:00:00Z" """ app = FastAPI( @@ -90,6 +233,13 @@ ], ) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + api_key_header = APIKeyHeader( name="x-api-key", auto_error=False, # this makes authentication optional diff --git a/datacontract/breaking/breaking.py b/datacontract/breaking/breaking.py deleted file mode 100644 index 4263cf67b..000000000 --- a/datacontract/breaking/breaking.py +++ /dev/null @@ -1,546 +0,0 @@ -from datacontract.breaking.breaking_change import BreakingChange, Location, Severity -from datacontract.breaking.breaking_rules import BreakingRules -from 
datacontract.model.data_contract_specification import Contact, DeprecatedQuality, Field, Info, Model, Terms - - -def info_breaking_changes( - old_info: Info, - new_info: Info, - new_path: str, - include_severities: [Severity], -) -> list[BreakingChange]: - results = list[BreakingChange]() - - composition = ["info"] - - if old_info and new_info: - info_definition_fields = vars(new_info) | new_info.model_extra | old_info.model_extra - - for info_definition_field in info_definition_fields.keys(): - if info_definition_field == "contact": - continue - - old_value = getattr(old_info, info_definition_field, None) - new_value = getattr(new_info, info_definition_field, None) - - rule_name = None - description = None - - if old_value is None and new_value is not None: - rule_name = f"info_{_camel_to_snake(info_definition_field)}_added" - description = f"added with value: `{new_value}`" - - elif old_value is not None and new_value is None: - rule_name = f"info_{_camel_to_snake(info_definition_field)}_removed" - description = "removed info property" - - elif old_value != new_value: - rule_name = f"info_{_camel_to_snake(info_definition_field)}_updated" - description = f"changed from `{old_value}` to `{new_value}`" - - if rule_name is not None: - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [info_definition_field]), - ) - ) - - results.extend( - contact_breaking_changes( - old_contact=getattr(old_info, "contact", None), - new_contact=getattr(new_info, "contact", None), - composition=composition + ["contact"], - new_path=new_path, - include_severities=include_severities, - ) - ) - - return results - - -def contact_breaking_changes( - old_contact: Contact, - new_contact: Contact, - composition: list[str], - new_path: str, - include_severities: [Severity], -) -> list[BreakingChange]: - results = list[BreakingChange]() - - if not old_contact and new_contact: - rule_name = "contact_added" - severity = _get_rule(rule_name) - description = "added contact" - - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition), - ) - ) - - elif old_contact and not new_contact: - rule_name = "contact_removed" - severity = _get_rule(rule_name) - description = "removed contact" - - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition), - ) - ) - - elif old_contact and new_contact: - contact_definition_fields = vars(new_contact) | new_contact.model_extra | old_contact.model_extra - - for contact_definition_field in contact_definition_fields.keys(): - old_value = getattr(old_contact, contact_definition_field, None) - new_value = getattr(new_contact, contact_definition_field, None) - - rule_name = None - description = None - - if old_value is None and new_value is not None: - rule_name = f"contact_{_camel_to_snake(contact_definition_field)}_added" - description = f"added with value: `{new_value}`" - - elif old_value is not None and new_value is None: - rule_name = f"contact_{_camel_to_snake(contact_definition_field)}_removed" - description = "removed contact property" - - elif old_value != new_value: - rule_name = 
f"contact_{_camel_to_snake(contact_definition_field)}_updated" - description = f"changed from `{old_value}` to `{new_value}`" - - if rule_name is not None: - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [contact_definition_field]), - ) - ) - - return results - - -def terms_breaking_changes( - old_terms: Terms, - new_terms: Terms, - new_path: str, - include_severities: [Severity], -) -> list[BreakingChange]: - results = list[BreakingChange]() - - composition = ["terms"] - - if not old_terms and new_terms: - rule_name = "terms_added" - severity = _get_rule(rule_name) - description = "added terms" - - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition), - ) - ) - elif old_terms and not new_terms: - rule_name = "terms_removed" - severity = _get_rule(rule_name) - description = "removed terms" - - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition), - ) - ) - - if old_terms and new_terms: - terms_definition_fields = vars(new_terms) | new_terms.model_extra | old_terms.model_extra - - for terms_definition_field in terms_definition_fields.keys(): - old_value = getattr(old_terms, terms_definition_field, None) - new_value = getattr(new_terms, terms_definition_field, None) - - rule_name = None - description = None - - if old_value is None and new_value is not None: - rule_name = f"terms_{_camel_to_snake(terms_definition_field)}_added" - description = f"added with value: `{new_value}`" - - elif old_value is not None and new_value is None: - rule_name = f"terms_{_camel_to_snake(terms_definition_field)}_removed" - description = "removed info property" - - elif old_value != new_value: - rule_name = f"terms_{_camel_to_snake(terms_definition_field)}_updated" - description = f"changed from `{old_value}` to `{new_value}`" - - if rule_name is not None: - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [terms_definition_field]), - ) - ) - - return results - - -def quality_breaking_changes( - old_quality: DeprecatedQuality, - new_quality: DeprecatedQuality, - new_path: str, - include_severities: [Severity], -) -> list[BreakingChange]: - results = list[BreakingChange]() - - if not old_quality and new_quality: - rule_name = "quality_added" - severity = _get_rule(rule_name) - description = "added quality" - - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=["quality"]), - ) - ) - elif old_quality and not new_quality: - rule_name = "quality_removed" - severity = _get_rule(rule_name) - description = "removed quality" - - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=["quality"]), - ) - ) - - elif old_quality and new_quality: - if old_quality.type != 
new_quality.type: - rule_name = "quality_type_updated" - severity = _get_rule(rule_name) - description = f"changed from `{old_quality.type}` to `{new_quality.type}`" - - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=["quality", "type"]), - ) - ) - - if old_quality.specification != new_quality.specification: - rule_name = "quality_specification_updated" - severity = _get_rule(rule_name) - description = f"changed from `{old_quality.specification}` to `{new_quality.specification}`" - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=["quality", "specification"]), - ) - ) - - return results - - -def models_breaking_changes( - old_models: dict[str, Model], - new_models: dict[str, Model], - new_path: str, - include_severities: [Severity], -) -> list[BreakingChange]: - composition = ["models"] - results = list[BreakingChange]() - - for model_name, new_model in new_models.items(): - if model_name not in old_models.keys(): - rule_name = "model_added" - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description="added the model", - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [model_name]), - ) - ) - - for model_name, old_model in old_models.items(): - if model_name not in new_models.keys(): - rule_name = "model_removed" - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description="removed the model", - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [model_name]), - ) - ) - continue - - results.extend( - model_breaking_changes( - old_model=old_model, - new_model=new_models[model_name], - new_path=new_path, - composition=composition + [model_name], - include_severities=include_severities, - ) - ) - - return results - - -def model_breaking_changes( - old_model: Model, new_model: Model, new_path: str, composition: list[str], include_severities: [Severity] -) -> list[BreakingChange]: - results = list[BreakingChange]() - - model_definition_fields = vars(new_model) | new_model.model_extra | old_model.model_extra - - for model_definition_field in model_definition_fields.keys(): - if model_definition_field == "fields": - continue - - old_value = getattr(old_model, model_definition_field, None) - new_value = getattr(new_model, model_definition_field, None) - - rule_name = None - description = None - - if old_value is None and new_value is not None: - rule_name = f"model_{model_definition_field}_added" - description = f"added with value: `{new_value}`" - - elif old_value is not None and new_value is None: - rule_name = f"model_{model_definition_field}_removed" - description = "removed model property" - - elif old_value != new_value: - rule_name = f"model_{model_definition_field}_updated" - description = f"changed from `{old_value}` to `{new_value}`" - - if rule_name is not None: - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [model_definition_field]), - ) - ) - - results.extend( - fields_breaking_changes( - 
old_fields=old_model.fields, - new_fields=new_model.fields, - new_path=new_path, - composition=composition + ["fields"], - include_severities=include_severities, - ) - ) - - return results - - -def fields_breaking_changes( - old_fields: dict[str, Field], - new_fields: dict[str, Field], - new_path: str, - composition: list[str], - include_severities: [Severity], -) -> list[BreakingChange]: - results = list[BreakingChange]() - - for field_name, new_field in new_fields.items(): - if field_name not in old_fields.keys(): - rule_name = "field_added" - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description="added the field", - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [field_name]), - ) - ) - - for field_name, old_field in old_fields.items(): - if field_name not in new_fields.keys(): - rule_name = "field_removed" - severity = _get_rule(rule_name) - if severity in include_severities: - results.append( - BreakingChange( - description="removed the field", - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [field_name]), - ) - ) - continue - - results.extend( - field_breaking_changes( - old_field=old_field, - new_field=new_fields[field_name], - composition=composition + [field_name], - new_path=new_path, - include_severities=include_severities, - ) - ) - return results - - -def field_breaking_changes( - old_field: Field, - new_field: Field, - composition: list[str], - new_path: str, - include_severities: [Severity], -) -> list[BreakingChange]: - results = list[BreakingChange]() - - field_definition_fields = vars(new_field) | new_field.model_extra | old_field.model_extra - for field_definition_field in field_definition_fields.keys(): - if field_definition_field == "ref_obj": - continue - - old_value = getattr(old_field, field_definition_field, None) - new_value = getattr(new_field, field_definition_field, None) - - if field_definition_field == "fields": - results.extend( - fields_breaking_changes( - old_fields=old_field.fields, - new_fields=new_field.fields, - new_path=new_path, - composition=composition + [field_definition_field], - include_severities=include_severities, - ) - ) - continue - - if field_definition_field == "items" and old_field.type == "array" and new_field.type == "array": - results.extend( - field_breaking_changes( - old_field=old_value, - new_field=new_value, - composition=composition + ["items"], - new_path=new_path, - include_severities=include_severities, - ) - ) - continue - - rule_name = None - description = None - - # logic for enum, tags and other arrays - if isinstance(old_value, list) and isinstance(new_value, list): - if not old_value and new_value: - rule_name = f"field_{_camel_to_snake(field_definition_field)}_added" - description = f"added with value: `{new_value}`" - elif old_value and not new_value: - rule_name = f"field_{_camel_to_snake(field_definition_field)}_removed" - description = "removed field property" - elif sorted(old_value) != sorted(new_value): - rule_name = f"field_{_camel_to_snake(field_definition_field)}_updated" - description = f"changed from `{old_value}` to `{new_value}`" - - # logic for normal fields - elif old_value is None and new_value is not None: - rule_name = f"field_{_camel_to_snake(field_definition_field)}_added" - description = f"added with value: `{str(new_value).lower() if isinstance(new_value, bool) else new_value}`" - - elif old_value is not None and 
new_value is None: - rule_name = f"field_{_camel_to_snake(field_definition_field)}_removed" - description = "removed field property" - - elif old_value != new_value: - rule_name = f"field_{_camel_to_snake(field_definition_field)}_updated" - description = ( - f"changed from `{str(old_value).lower() if isinstance(old_value, bool) else old_value}` " - f"to `{str(new_value).lower() if isinstance(new_value, bool) else new_value}`" - ) - - if rule_name is not None: - severity = _get_rule(rule_name) - field_schema_name = "$ref" if field_definition_field == "ref" else field_definition_field - if severity in include_severities: - results.append( - BreakingChange( - description=description, - check_name=rule_name, - severity=severity, - location=Location(path=new_path, composition=composition + [field_schema_name]), - ) - ) - - return results - - -def _get_rule(rule_name) -> Severity: - try: - return getattr(BreakingRules, rule_name) - except AttributeError: - try: - first, *_, last = rule_name.split("_") - short_rule = "__".join([first, last]) - return getattr(BreakingRules, short_rule) - except AttributeError: - print(f"WARNING: Breaking Rule not found for {rule_name}!") - return Severity.ERROR - - -def _camel_to_snake(s): - s = s.replace("-", "_") - return "".join(["_" + c.lower() if c.isupper() else c for c in s]).lstrip("_") diff --git a/datacontract/breaking/breaking_change.py b/datacontract/breaking/breaking_change.py deleted file mode 100644 index a9f76d9b8..000000000 --- a/datacontract/breaking/breaking_change.py +++ /dev/null @@ -1,73 +0,0 @@ -from enum import Enum -from typing import List - -from pydantic import BaseModel - - -class Severity(Enum): - ERROR = "error" - WARNING = "warning" - INFO = "info" - - def __str__(self) -> str: - return self.value - - -class Location(BaseModel): - path: str - composition: List[str] - - -class BreakingChange(BaseModel): - description: str - severity: Severity - check_name: str - location: Location - - def __str__(self) -> str: - return f"""{self.severity}\t\[{self.check_name}] at {self.location.path} - in {str.join(".", self.location.composition)} - {self.description}""" - - -class BreakingChanges(BaseModel): - breaking_changes: List[BreakingChange] - - def passed_checks(self) -> bool: - errors = len(list(filter(lambda x: x.severity == Severity.ERROR, self.breaking_changes))) - return errors == 0 - - def breaking_str(self) -> str: - changes_amount = len(self.breaking_changes) - errors = len(list(filter(lambda x: x.severity == Severity.ERROR, self.breaking_changes))) - warnings = len(list(filter(lambda x: x.severity == Severity.WARNING, self.breaking_changes))) - - headline = f"{changes_amount} breaking changes: {errors} error, {warnings} warning\n" - content = str.join("\n\n", map(lambda x: str(x), self.breaking_changes)) - - return headline + content - - def changelog_str(self) -> str: - changes_amount = len(self.breaking_changes) - errors = len(list(filter(lambda x: x.severity == Severity.ERROR, self.breaking_changes))) - warnings = len(list(filter(lambda x: x.severity == Severity.WARNING, self.breaking_changes))) - infos = len(list(filter(lambda x: x.severity == Severity.INFO, self.breaking_changes))) - - headline = f"{changes_amount} changes: {errors} error, {warnings} warning, {infos} info\n" - content = str.join("\n\n", map(lambda x: str(x), self.breaking_changes)) - - return headline + content - - -# -# [ -# { -# "description": "removed the field updated_at", -# "check_name": "field-removed", -# "severity": "error", -# "location": { -# "path": 
"./examples/breaking/datacontract-v2.yaml", -# "model": "my_table", -# } -# } -# ] diff --git a/datacontract/breaking/breaking_rules.py b/datacontract/breaking/breaking_rules.py deleted file mode 100644 index 51b0b937c..000000000 --- a/datacontract/breaking/breaking_rules.py +++ /dev/null @@ -1,130 +0,0 @@ -from datacontract.breaking.breaking_change import Severity - - -class BreakingRules: - # model rules - model_added = Severity.INFO - model_removed = Severity.ERROR - - model_description_added = Severity.INFO - model_description_removed = Severity.INFO - model_description_updated = Severity.INFO - - model_type_updated = Severity.ERROR - - model__removed = Severity.INFO # To support model extension keys - model__added = Severity.INFO - model__updated = Severity.INFO - - # field rules - field_added = Severity.INFO - field_removed = Severity.ERROR - - field_ref_added = Severity.WARNING - field_ref_removed = Severity.WARNING - field_ref_updated = Severity.WARNING - - field_title_added = Severity.INFO - field_title_removed = Severity.INFO - field_title_updated = Severity.INFO - - field_type_added = Severity.WARNING - field_type_removed = Severity.WARNING - field_type_updated = Severity.ERROR - - field_format_added = Severity.WARNING - field_format_removed = Severity.WARNING - field_format_updated = Severity.ERROR - - field_required_updated = Severity.ERROR - - field_primary_added = Severity.WARNING - field_primary_removed = Severity.WARNING - field_primary_updated = Severity.WARNING - - field_primary_key_added = Severity.WARNING - field_primary_key_removed = Severity.WARNING - field_primary_key_updated = Severity.WARNING - - field_references_added = Severity.WARNING - field_references_removed = Severity.WARNING - field_references_updated = Severity.WARNING - - field_unique_updated = Severity.ERROR - - field_description_added = Severity.INFO - field_description_removed = Severity.INFO - field_description_updated = Severity.INFO - - field_pii_added = Severity.WARNING - field_pii_removed = Severity.ERROR - field_pii_updated = Severity.ERROR - - field_classification_added = Severity.WARNING - field_classification_removed = Severity.ERROR - field_classification_updated = Severity.ERROR - - field_pattern_added = Severity.WARNING - field_pattern_removed = Severity.ERROR - field_pattern_updated = Severity.ERROR - - field_min_length_added = Severity.WARNING - field_min_length_removed = Severity.WARNING - field_min_length_updated = Severity.ERROR - - field_max_length_added = Severity.WARNING - field_max_length_removed = Severity.WARNING - field_max_length_updated = Severity.ERROR - - field_minimum_added = Severity.WARNING - field_minimum_removed = Severity.WARNING - field_minimum_updated = Severity.ERROR - - field_exclusive_minimum_added = Severity.WARNING - field_exclusive_minimum_removed = Severity.WARNING - field_exclusive_minimum_updated = Severity.ERROR - - field_maximum_added = Severity.WARNING - field_maximum_removed = Severity.WARNING - field_maximum_updated = Severity.ERROR - - field_exclusive_maximum_added = Severity.WARNING - field_exclusive_maximum_removed = Severity.WARNING - field_exclusive_maximum_updated = Severity.ERROR - - field_enum_added = Severity.WARNING - field_enum_removed = Severity.INFO - field_enum_updated = Severity.ERROR - - field_tags_added = Severity.INFO - field_tags_removed = Severity.INFO - field_tags_updated = Severity.INFO - - field_example_added = Severity.INFO - field_example_updated = Severity.INFO - field_example_removed = Severity.INFO - - field__removed = 
Severity.INFO # To support field extension keys - field__added = Severity.INFO - field__updated = Severity.INFO - - # quality Rules - quality_added = Severity.INFO - quality_removed = Severity.WARNING - - quality_type_updated = Severity.WARNING - quality_specification_updated = Severity.WARNING - - # info rules - info__added = Severity.INFO # will match `info__added` etc - info__removed = Severity.INFO - info__updated = Severity.INFO - - contact__added = Severity.INFO - contact__removed = Severity.INFO - contact__updated = Severity.INFO - - # terms rules - terms__added = Severity.INFO - terms__removed = Severity.INFO - terms__updated = Severity.INFO diff --git a/datacontract/catalog/catalog.py b/datacontract/catalog/catalog.py index 50a92aecc..c06066f45 100644 --- a/datacontract/catalog/catalog.py +++ b/datacontract/catalog/catalog.py @@ -1,21 +1,35 @@ +import logging from dataclasses import dataclass from datetime import datetime from pathlib import Path +from typing import Optional import pytz from jinja2 import Environment, PackageLoader, select_autoescape +from open_data_contract_standard.model import OpenDataContractStandard from datacontract.data_contract import DataContract from datacontract.export.html_exporter import get_version -from datacontract.model.data_contract_specification import DataContractSpecification + + +def _get_owner(odcs: OpenDataContractStandard) -> Optional[str]: + """Get the owner from ODCS customProperties or team.""" + if odcs.team and hasattr(odcs.team, "name") and odcs.team.name: + return odcs.team.name + if odcs.customProperties: + for prop in odcs.customProperties: + if prop.property == "owner": + return prop.value + return None def create_data_contract_html(contracts, file: Path, path: Path, schema: str): + logging.debug(f"Creating data contract html for file {file} and schema {schema}") data_contract = DataContract( - data_contract_file=f"{file.absolute()}", inline_definitions=True, inline_quality=True, schema_location=schema + data_contract_file=f"{file.absolute()}", inline_definitions=True, schema_location=schema ) html = data_contract.export(export_format="html") - spec = data_contract.get_data_contract_specification() + odcs = data_contract.get_data_contract() file_without_suffix = file.with_suffix(".html") html_filepath = path / file_without_suffix html_filepath.parent.mkdir(parents=True, exist_ok=True) @@ -25,19 +39,71 @@ def create_data_contract_html(contracts, file: Path, path: Path, schema: str): DataContractView( html_filepath=html_filepath, html_link=file_without_suffix, - spec=spec, + odcs=odcs, ) ) print(f"Created {html_filepath}") +@dataclass +class _InfoView: + """Unified info view for templates.""" + title: str + version: str + owner: Optional[str] + description: Optional[str] + + +@dataclass +class _SpecView: + """Unified spec view for templates, compatible with DCS template structure.""" + info: _InfoView + models: dict + + @dataclass class DataContractView: """Class for keeping track of an item in inventory.""" html_filepath: Path html_link: Path - spec: DataContractSpecification + odcs: OpenDataContractStandard + + @property + def spec(self) -> _SpecView: + """Provide a DCS-compatible view for templates.""" + # Build models dict from ODCS schema + models = {} + if self.odcs.schema_: + for schema in self.odcs.schema_: + fields = {} + if schema.properties: + for prop in schema.properties: + fields[prop.name] = { + "description": prop.description, + } + models[schema.name] = { + "description": schema.description, + "fields": fields, + } 
+ + # Get description + description = None + if self.odcs.description: + if isinstance(self.odcs.description, str): + description = self.odcs.description + elif hasattr(self.odcs.description, "purpose"): + description = self.odcs.description.purpose + + return _SpecView( + info=_InfoView( + title=self.odcs.name or self.odcs.id or "", + version=self.odcs.version or "", + owner=_get_owner(self.odcs), + description=description, + ), + models=models, + ) def create_index_html(contracts, path): @@ -72,7 +138,7 @@ def create_index_html(contracts, path): datacontract_cli_version=datacontract_cli_version, contracts=contracts, contracts_size=len(contracts), - owners=sorted(set(dc.spec.info.owner for dc in contracts if dc.spec.info.owner)), + owners=sorted(set(_get_owner(dc.odcs) for dc in contracts if _get_owner(dc.odcs))), ) f.write(html_string) print(f"Created {index_filepath}") diff --git a/datacontract/cli.py b/datacontract/cli.py index 475493bb0..21872ab01 100644 --- a/datacontract/cli.py +++ b/datacontract/cli.py @@ -1,4 +1,6 @@ +import logging import os +import sys from importlib import metadata from pathlib import Path from typing import Iterable, List, Optional @@ -11,17 +13,20 @@ from datacontract.catalog.catalog import create_data_contract_html, create_index_html from datacontract.data_contract import DataContract, ExportFormat -from datacontract.imports.importer import ImportFormat, Spec +from datacontract.imports.importer import ImportFormat from datacontract.init.init_template import get_init_template -from datacontract.integration.datamesh_manager import ( - publish_data_contract_to_datamesh_manager, +from datacontract.integration.entropy_data import ( + publish_data_contract_to_entropy_data, ) -from datacontract.lint.resolve import resolve_data_contract_dict +from datacontract.lint.resolve import resolve_data_contract, resolve_data_contract_dict +from datacontract.model.exceptions import DataContractException from datacontract.output.output_format import OutputFormat from datacontract.output.test_results_writer import write_test_result console = Console() +debug_option = Annotated[bool, typer.Option(help="Enable debug logging")] + class OrderedCommands(TyperGroup): def list_commands(self, ctx: Context) -> Iterable[str]: @@ -62,17 +67,20 @@ def common( pass -@app.command() +@app.command(name="init") def init( location: Annotated[ str, typer.Argument(help="The location of the data contract file to create.") ] = "datacontract.yaml", template: Annotated[str, typer.Option(help="URL of a template or data contract")] = None, overwrite: Annotated[bool, typer.Option(help="Replace the existing datacontract.yaml")] = False, + debug: debug_option = None, ): """ Create an empty data contract. 
""" + enable_debug_logging(debug) + if not overwrite and os.path.exists(location): console.print("File already exists, use --overwrite to overwrite") raise typer.Exit(code=1) @@ -82,7 +90,7 @@ def init( console.print("📄 data contract written to " + location) -@app.command() +@app.command(name="lint") def lint( location: Annotated[ str, @@ -90,7 +98,7 @@ def lint( ] = "datacontract.yaml", schema: Annotated[ str, - typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"), + typer.Option(help="The location (url or path) of the ODCS JSON Schema"), ] = None, output: Annotated[ Path, @@ -99,15 +107,25 @@ def lint( ), ] = None, output_format: Annotated[OutputFormat, typer.Option(help="The target format for the test results.")] = None, + debug: debug_option = None, ): """ Validate that the datacontract.yaml is correctly formatted. """ + enable_debug_logging(debug) + run = DataContract(data_contract_file=location, schema_location=schema).lint() write_test_result(run, console, output_format, output) -@app.command() +def enable_debug_logging(debug: bool): + if debug: + logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", stream=sys.stderr + ) + + +@app.command(name="test") def test( location: Annotated[ str, @@ -115,7 +133,7 @@ def test( ] = "datacontract.yaml", schema: Annotated[ str, - typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"), + typer.Option(help="The location (url or path) of the ODCS JSON Schema"), ] = None, server: Annotated[ str, @@ -142,10 +160,13 @@ def test( bool, typer.Option(help="SSL verification when publishing the data contract."), ] = True, + debug: debug_option = None, ): """ Run schema and quality tests on configured servers. """ + enable_debug_logging(debug) + console.print(f"Testing {location}") if server == "all": server = None @@ -159,10 +180,14 @@ def test( ).test() if logs: _print_logs(run) - write_test_result(run, console, output_format, output) + try: + data_contract = resolve_data_contract(location, schema_location=schema) + except Exception: + data_contract = None + write_test_result(run, console, output_format, output, data_contract) -@app.command() +@app.command(name="export") def export( format: Annotated[ExportFormat, typer.Option(help="The export format.")], output: Annotated[ @@ -172,12 +197,11 @@ def export( ), ] = None, server: Annotated[str, typer.Option(help="The server name to export.")] = None, - model: Annotated[ + schema_name: Annotated[ str, typer.Option( - help="Use the key of the model in the data contract yaml file " - "to refer to a model, e.g., `orders`, or `all` for all " - "models (default)." + help="The name of the schema to export, e.g., `orders`, or `all` for all " + "schemas (default)." ), ] = "all", # TODO: this should be a subcommand @@ -202,7 +226,7 @@ def export( ] = "datacontract.yaml", schema: Annotated[ str, - typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"), + typer.Option(help="The location (url or path) of the ODCS JSON Schema"), ] = None, # TODO: this should be a subcommand engine: Annotated[ @@ -216,10 +240,13 @@ def export( help="The file path or URL of a template. For Excel format: path/URL to custom Excel template. For custom format: path to Jinja template." ), ] = None, + debug: debug_option = None, ): """ Convert data contract to a specific format. Saves to file specified by `output` option if present, otherwise prints to stdout. 
""" + enable_debug_logging(debug) + # Validate that Excel format requires an output file path if format == ExportFormat.excel and output is None: console.print("❌ Error: Excel export requires an output file path.") @@ -230,7 +257,7 @@ def export( # TODO exception handling result = DataContract(data_contract_file=location, schema_location=schema, server=server).export( export_format=format, - model=model, + schema_name=schema_name, server=server, rdf_base=rdf_base, sql_server_type=sql_server_type, @@ -264,10 +291,6 @@ def import_( Optional[str], typer.Option(help="The path to the file that should be imported."), ] = None, - spec: Annotated[ - Spec, - typer.Option(help="The format of the data contract to import. "), - ] = Spec.datacontract_specification, dialect: Annotated[ Optional[str], typer.Option(help="The SQL dialect to use when importing SQL files, e.g., postgres, tsql, bigquery."), @@ -313,11 +336,11 @@ def import_( ] = None, template: Annotated[ Optional[str], - typer.Option(help="The location (url or path) of the Data Contract Specification Template"), + typer.Option(help="The location (url or path) of the ODCS template"), ] = None, schema: Annotated[ str, - typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"), + typer.Option(help="The location (url or path) of the ODCS JSON Schema"), ] = None, owner: Annotated[ Optional[str], @@ -327,14 +350,16 @@ def import_( Optional[str], typer.Option(help="The identifier for the the data contract."), ] = None, + debug: debug_option = None, ): """ Create a data contract from the given source location. Saves to file specified by `output` option if present, otherwise prints to stdout. """ - result = DataContract().import_from_source( + enable_debug_logging(debug) + + result = DataContract.import_from_source( format=format, source=source, - spec=spec, template=template, schema=schema, dialect=dialect, @@ -366,17 +391,20 @@ def publish( ] = "datacontract.yaml", schema: Annotated[ str, - typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"), + typer.Option(help="The location (url or path) of the ODCS JSON Schema"), ] = None, ssl_verification: Annotated[ bool, typer.Option(help="SSL verification when publishing the data contract."), ] = True, + debug: debug_option = None, ): """ - Publish the data contract to the Data Mesh Manager. + Publish the data contract to the Entropy Data. """ - publish_data_contract_to_datamesh_manager( + enable_debug_logging(debug) + + publish_data_contract_to_entropy_data( data_contract_dict=resolve_data_contract_dict(location), ssl_verification=ssl_verification, ) @@ -393,12 +421,15 @@ def catalog( output: Annotated[Optional[str], typer.Option(help="Output directory for the catalog html files.")] = "catalog/", schema: Annotated[ str, - typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"), + typer.Option(help="The location (url or path) of the ODCS JSON Schema"), ] = None, + debug: debug_option = None, ): """ Create a html catalog of data contracts. 
""" + enable_debug_logging(debug) + path = Path(output) path.mkdir(parents=True, exist_ok=True) console.print(f"Created {output}") @@ -407,84 +438,17 @@ def catalog( for file in Path().rglob(files): try: create_data_contract_html(contracts, file, path, schema) + except DataContractException as e: + if e.reason == "Cannot parse ODPS product": + console.print(f"Skipped {file} due to error: {e.reason}") + else: + console.print(f"Skipped {file} due to error: {e}") except Exception as e: console.print(f"Skipped {file} due to error: {e}") create_index_html(contracts, path) -@app.command() -def breaking( - location_old: Annotated[ - str, - typer.Argument(help="The location (url or path) of the old data contract yaml."), - ], - location_new: Annotated[ - str, - typer.Argument(help="The location (url or path) of the new data contract yaml."), - ], -): - """ - Identifies breaking changes between data contracts. Prints to stdout. - """ - - # TODO exception handling - result = DataContract(data_contract_file=location_old, inline_definitions=True).breaking( - DataContract(data_contract_file=location_new, inline_definitions=True) - ) - - console.print(result.breaking_str()) - - if not result.passed_checks(): - raise typer.Exit(code=1) - - -@app.command() -def changelog( - location_old: Annotated[ - str, - typer.Argument(help="The location (url or path) of the old data contract yaml."), - ], - location_new: Annotated[ - str, - typer.Argument(help="The location (url or path) of the new data contract yaml."), - ], -): - """ - Generate a changelog between data contracts. Prints to stdout. - """ - - # TODO exception handling - result = DataContract(data_contract_file=location_old, inline_definitions=True).changelog( - DataContract(data_contract_file=location_new, inline_definitions=True) - ) - - console.print(result.changelog_str()) - - -@app.command() -def diff( - location_old: Annotated[ - str, - typer.Argument(help="The location (url or path) of the old data contract yaml."), - ], - location_new: Annotated[ - str, - typer.Argument(help="The location (url or path) of the new data contract yaml."), - ], -): - """ - PLACEHOLDER. Currently works as 'changelog' does. - """ - - # TODO change to diff output, not the changelog entries - result = DataContract(data_contract_file=location_old, inline_definitions=True).changelog( - DataContract(data_contract_file=location_new, inline_definitions=True) - ) - - console.print(result.changelog_str()) - - def _get_uvicorn_arguments(port: int, host: str, context: typer.Context) -> dict: """ Take the default datacontract uvicorn arguments and merge them with the @@ -503,13 +467,14 @@ def _get_uvicorn_arguments(port: int, host: str, context: typer.Context) -> dict return default_args | dict(zip(trimmed_keys, context.args[1::2])) -@app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True}) +@app.command(name="api", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}) def api( ctx: Annotated[typer.Context, typer.Option(help="Extra arguments to pass to uvicorn.run().")], port: Annotated[int, typer.Option(help="Bind socket to this port.")] = 4242, host: Annotated[ str, typer.Option(help="Bind socket to this host. Hint: For running in docker, set it to 0.0.0.0") ] = "127.0.0.1", + debug: debug_option = None, ): """ Start the datacontract CLI as server application with REST API. 
@@ -527,6 +492,8 @@ def api( It is possible to run the API with extra arguments for `uvicorn.run()` as keyword arguments, e.g.: `datacontract api --port 1234 --root_path /datacontract`. """ + enable_debug_logging(debug) + import uvicorn from uvicorn.config import LOGGING_CONFIG diff --git a/datacontract/data_contract.py b/datacontract/data_contract.py index 43f11ff0c..b7da9dd1e 100644 --- a/datacontract/data_contract.py +++ b/datacontract/data_contract.py @@ -1,37 +1,19 @@ import logging import typing -from open_data_contract_standard.model import CustomProperty, OpenDataContractStandard - -from datacontract.export.odcs_v3_exporter import to_odcs_v3 -from datacontract.imports.importer import ImportFormat, Spec -from datacontract.imports.odcs_v3_importer import import_from_odcs +from open_data_contract_standard.model import OpenDataContractStandard, Team if typing.TYPE_CHECKING: + from duckdb.duckdb import DuckDBPyConnection from pyspark.sql import SparkSession -from duckdb.duckdb import DuckDBPyConnection - -from datacontract.breaking.breaking import ( - info_breaking_changes, - models_breaking_changes, - quality_breaking_changes, - terms_breaking_changes, -) -from datacontract.breaking.breaking_change import BreakingChange, BreakingChanges, Severity from datacontract.engines.data_contract_test import execute_data_contract_test from datacontract.export.exporter import ExportFormat from datacontract.export.exporter_factory import exporter_factory from datacontract.imports.importer_factory import importer_factory from datacontract.init.init_template import get_init_template -from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager +from datacontract.integration.entropy_data import publish_test_results_to_entropy_data from datacontract.lint import resolve -from datacontract.lint.linters.description_linter import DescriptionLinter -from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter -from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter -from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter -from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter -from datacontract.model.data_contract_specification import DataContractSpecification, Info from datacontract.model.exceptions import DataContractException from datacontract.model.run import Check, ResultEnum, Run @@ -41,14 +23,13 @@ def __init__( self, data_contract_file: str = None, data_contract_str: str = None, - data_contract: DataContractSpecification = None, + data_contract: OpenDataContractStandard = None, schema_location: str = None, server: str = None, publish_url: str = None, spark: "SparkSession" = None, - duckdb_connection: DuckDBPyConnection = None, + duckdb_connection: "DuckDBPyConnection" = None, inline_definitions: bool = True, - inline_quality: bool = True, ssl_verification: bool = True, publish_test_results: bool = False, ): @@ -62,26 +43,15 @@ def __init__( self._spark = spark self._duckdb_connection = duckdb_connection self._inline_definitions = inline_definitions - self._inline_quality = inline_quality self._ssl_verification = ssl_verification - self.all_linters = { - FieldPatternLinter(), - FieldReferenceLinter(), - NoticePeriodLinter(), - ValidFieldConstraintsLinter(), - DescriptionLinter(), - } @classmethod - def init(cls, template: typing.Optional[str], schema: typing.Optional[str] = None) -> DataContractSpecification: + def init(cls, template: typing.Optional[str], schema: 
typing.Optional[str] = None) -> OpenDataContractStandard: template_str = get_init_template(template) return resolve.resolve_data_contract(data_contract_str=template_str, schema_location=schema) - def lint(self, enabled_linters: typing.Union[str, set[str]] = "all") -> Run: - """Lint the data contract by deserializing the contract and checking the schema, as well as calling the configured linters. - - enabled_linters can be either "all" or "none", or a set of linter IDs. The "schema" linter is always enabled, even with enabled_linters="none". - """ + def lint(self) -> Run: + """Lint the data contract by validating it against the JSON schema.""" run = Run.create_run() try: run.log_info("Linting data contract") @@ -91,7 +61,6 @@ def lint(self, enabled_linters: typing.Union[str, set[str]] = "all") -> Run: self._data_contract, self._schema_location, inline_definitions=self._inline_definitions, - inline_quality=self._inline_quality, ) run.checks.append( Check( @@ -101,29 +70,8 @@ def lint(self, enabled_linters: typing.Union[str, set[str]] = "all") -> Run: engine="datacontract", ) ) - if enabled_linters == "none": - linters_to_check = set() - elif enabled_linters == "all": - linters_to_check = self.all_linters - elif isinstance(enabled_linters, set): - linters_to_check = {linter for linter in self.all_linters if linter.id in enabled_linters} - else: - raise RuntimeError(f"Unknown argument enabled_linters={enabled_linters} for lint()") - for linter in linters_to_check: - try: - run.checks.extend(linter.lint(data_contract)) - except Exception as e: - run.checks.append( - Check( - type="general", - result=ResultEnum.error, - name=f"Linter '{linter.name}'", - reason=str(e), - engine="datacontract", - ) - ) run.dataContractId = data_contract.id - run.dataContractVersion = data_contract.info.version + run.dataContractVersion = data_contract.version except DataContractException as e: run.checks.append( Check(type=e.type, result=e.result, name=e.name, reason=e.reason, engine=e.engine, details="") @@ -153,7 +101,6 @@ def test(self) -> Run: self._data_contract, self._schema_location, inline_definitions=self._inline_definitions, - inline_quality=self._inline_quality, ) execute_data_contract_test(data_contract, run, self._server, self._spark, self._duckdb_connection) @@ -187,89 +134,38 @@ def test(self) -> Run: run.finish() if self._publish_url is not None or self._publish_test_results: - publish_test_results_to_datamesh_manager(run, self._publish_url, self._ssl_verification) + publish_test_results_to_entropy_data(run, self._publish_url, self._ssl_verification) return run - def breaking(self, other: "DataContract") -> BreakingChanges: - return self.changelog(other, include_severities=[Severity.ERROR, Severity.WARNING]) - - def changelog( - self, other: "DataContract", include_severities: [Severity] = (Severity.ERROR, Severity.WARNING, Severity.INFO) - ) -> BreakingChanges: - old = self.get_data_contract_specification() - new = other.get_data_contract_specification() - - breaking_changes = list[BreakingChange]() - - breaking_changes.extend( - info_breaking_changes( - old_info=old.info, - new_info=new.info, - new_path=other._data_contract_file, - include_severities=include_severities, - ) - ) - - breaking_changes.extend( - terms_breaking_changes( - old_terms=old.terms, - new_terms=new.terms, - new_path=other._data_contract_file, - include_severities=include_severities, - ) - ) - - breaking_changes.extend( - quality_breaking_changes( - old_quality=old.quality, - new_quality=new.quality, - 
new_path=other._data_contract_file, - include_severities=include_severities, - ) - ) - - breaking_changes.extend( - models_breaking_changes( - old_models=old.models, - new_models=new.models, - new_path=other._data_contract_file, - include_severities=include_severities, - ) - ) - - return BreakingChanges(breaking_changes=breaking_changes) - - def get_data_contract_specification(self) -> DataContractSpecification: + def get_data_contract(self) -> OpenDataContractStandard: return resolve.resolve_data_contract( data_contract_location=self._data_contract_file, data_contract_str=self._data_contract_str, data_contract=self._data_contract, schema_location=self._schema_location, inline_definitions=self._inline_definitions, - inline_quality=self._inline_quality, ) def export( - self, export_format: ExportFormat, model: str = "all", sql_server_type: str = "auto", **kwargs + self, export_format: ExportFormat, schema_name: str = "all", sql_server_type: str = "auto", **kwargs ) -> str | bytes: if ( export_format == ExportFormat.html or export_format == ExportFormat.mermaid or export_format == ExportFormat.excel ): - data_contract = resolve.resolve_data_contract_v2( + data_contract = resolve.resolve_data_contract( self._data_contract_file, self._data_contract_str, self._data_contract, schema_location=self._schema_location, inline_definitions=self._inline_definitions, - inline_quality=self._inline_quality, ) return exporter_factory.create(export_format).export( data_contract=data_contract, - model=model, + schema_name=schema_name, server=self._server, sql_server_type=sql_server_type, export_args=kwargs, @@ -281,103 +177,52 @@ def export( self._data_contract, schema_location=self._schema_location, inline_definitions=self._inline_definitions, - inline_quality=self._inline_quality, ) return exporter_factory.create(export_format).export( data_contract=data_contract, - model=model, + schema_name=schema_name, server=self._server, sql_server_type=sql_server_type, export_args=kwargs, ) - # REFACTOR THIS - # could be a class method, not using anything from the instance + @classmethod def import_from_source( - self, + cls, format: str, source: typing.Optional[str] = None, template: typing.Optional[str] = None, - schema: typing.Optional[str] = None, - spec: Spec = Spec.datacontract_specification, **kwargs, - ) -> DataContractSpecification | OpenDataContractStandard: + ) -> OpenDataContractStandard: + """Import a data contract from a source in a given format. + + All imports now return OpenDataContractStandard (ODCS) format. 
+ """ id = kwargs.get("id") owner = kwargs.get("owner") - if spec == Spec.odcs or format == ImportFormat.excel: - data_contract_specification_initial = DataContract.init(template=template, schema=schema) - - odcs_imported = importer_factory.create(format).import_source( - data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs - ) - - if isinstance(odcs_imported, DataContractSpecification): - # convert automatically - odcs_imported = to_odcs_v3(odcs_imported) - - self._overwrite_id_in_odcs(odcs_imported, id) - self._overwrite_owner_in_odcs(odcs_imported, owner) - - return odcs_imported - elif spec == Spec.datacontract_specification: - data_contract_specification_initial = DataContract.init(template=template, schema=schema) - - data_contract_specification_imported = importer_factory.create(format).import_source( - data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs - ) - - if isinstance(data_contract_specification_imported, OpenDataContractStandard): - # convert automatically - data_contract_specification_imported = import_from_odcs( - data_contract_specification_initial, data_contract_specification_imported - ) - - self._overwrite_id_in_data_contract_specification(data_contract_specification_imported, id) - self._overwrite_owner_in_data_contract_specification(data_contract_specification_imported, owner) - - return data_contract_specification_imported - else: - raise DataContractException( - type="general", - result=ResultEnum.error, - name="Import Data Contract", - reason=f"Unsupported data contract format: {spec}", - engine="datacontract", - ) - - def _overwrite_id_in_data_contract_specification( - self, data_contract_specification: DataContractSpecification, id: str | None - ): - if not id: - return - - data_contract_specification.id = id + odcs_imported = importer_factory.create(format).import_source( + source=source, import_args=kwargs + ) - def _overwrite_owner_in_data_contract_specification( - self, data_contract_specification: DataContractSpecification, owner: str | None - ): - if not owner: - return + cls._overwrite_id_in_odcs(odcs_imported, id) + cls._overwrite_owner_in_odcs(odcs_imported, owner) - if data_contract_specification.info is None: - data_contract_specification.info = Info() - data_contract_specification.info.owner = owner + return odcs_imported - def _overwrite_owner_in_odcs(self, odcs: OpenDataContractStandard, owner: str | None): + @staticmethod + def _overwrite_owner_in_odcs(odcs: OpenDataContractStandard, owner: str | None): if not owner: return - if odcs.customProperties is None: - odcs.customProperties = [] - for customProperty in odcs.customProperties: - if customProperty.name == "owner": - customProperty.value = owner - return - odcs.customProperties.append(CustomProperty(property="owner", value=owner)) + if odcs.team is None: + odcs.team = Team(name=owner) + else: + odcs.team.name = owner - def _overwrite_id_in_odcs(self, odcs: OpenDataContractStandard, id: str | None): + @staticmethod + def _overwrite_id_in_odcs(odcs: OpenDataContractStandard, id: str | None): if not id: return diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index c98dcc1b7..b6e6df63e 100644 --- a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -1,13 +1,19 @@ import re import uuid from dataclasses import dataclass -from typing import List +from typing import List, Optional from venv import logger 
 import yaml
+from open_data_contract_standard.model import (
+    DataQuality,
+    OpenDataContractStandard,
+    SchemaObject,
+    SchemaProperty,
+    Server,
+)

 from datacontract.export.sql_type_converter import convert_to_sql_type
-from datacontract.model.data_contract_specification import DataContractSpecification, Quality, Server
 from datacontract.model.run import Check


@@ -15,76 +21,128 @@
 class QuotingConfig:
     quote_field_name: bool = False
     quote_model_name: bool = False
+    quote_model_name_with_backticks: bool = False


-def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
+def _get_logical_type_option(prop: SchemaProperty, key: str):
+    """Get a logical type option value."""
+    if prop.logicalTypeOptions is None:
+        return None
+    return prop.logicalTypeOptions.get(key)
+
+
+def _get_custom_property_value(prop: SchemaProperty, key: str) -> Optional[str]:
+    """Get a custom property value."""
+    if prop.customProperties is None:
+        return None
+    for cp in prop.customProperties:
+        if cp.property == key:
+            return cp.value
+    return None
+
+
+def _get_schema_custom_property_value(schema: SchemaObject, key: str) -> Optional[str]:
+    """Get a custom property value from schema."""
+    if schema.customProperties is None:
+        return None
+    for cp in schema.customProperties:
+        if cp.property == key:
+            return cp.value
+    return None
+
+
+def create_checks(data_contract: OpenDataContractStandard, server: Server) -> List[Check]:
     checks: List[Check] = []
-    for model_key, model_value in data_contract_spec.models.items():
-        model_checks = to_model_checks(model_key, model_value, server)
-        checks.extend(model_checks)
-    checks.extend(to_servicelevel_checks(data_contract_spec))
-    checks.append(to_quality_check(data_contract_spec))
+    if data_contract.schema_ is None:
+        return checks
+    for schema_obj in data_contract.schema_:
+        schema_checks = to_schema_checks(schema_obj, server)
+        checks.extend(schema_checks)
+    checks.extend(to_servicelevel_checks(data_contract))
     return [check for check in checks if check is not None]


-def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
+def to_schema_checks(schema_object: SchemaObject, server: Server) -> List[Check]:
     checks: List[Check] = []
     server_type = server.type if server and server.type else None
-    model_name = to_model_name(model_key, model_value, server_type)
-    fields = model_value.fields
+    schema_name = to_schema_name(schema_object, server_type)
+    properties = schema_object.properties or []
     check_types = is_check_types(server)
-    quoting_config = QuotingConfig(
-        quote_field_name=server_type in ["postgres", "sqlserver"],
-        quote_model_name=server_type in ["postgres", "sqlserver"],
+    type1 = server.type if server and server.type else None
+    config = QuotingConfig(
+        quote_field_name=type1 in ["postgres", "sqlserver", "azure"],
+        quote_model_name=type1 in ["postgres", "sqlserver"],
+        quote_model_name_with_backticks=type1 == "bigquery",
     )
-
-    for field_name, field in fields.items():
-        checks.append(check_field_is_present(model_name, field_name, quoting_config))
-        if check_types and field.type is not None:
-            sql_type = convert_to_sql_type(field, server_type)
-            checks.append(check_field_type(model_name, field_name, sql_type, quoting_config))
-        if field.required:
-            checks.append(check_field_required(model_name, field_name, quoting_config))
-        if field.unique:
-            checks.append(check_field_unique(model_name, field_name, quoting_config))
-        if field.minLength is not None:
-            checks.append(check_field_min_length(model_name, field_name,
field.minLength, quoting_config)) - if field.maxLength is not None: - checks.append(check_field_max_length(model_name, field_name, field.maxLength, quoting_config)) - if field.minimum is not None: - checks.append(check_field_minimum(model_name, field_name, field.minimum, quoting_config)) - if field.maximum is not None: - checks.append(check_field_maximum(model_name, field_name, field.maximum, quoting_config)) - if field.exclusiveMinimum is not None: - checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum, quoting_config)) - checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum, quoting_config)) - if field.exclusiveMaximum is not None: - checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum, quoting_config)) - checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum, quoting_config)) - if field.pattern is not None: - checks.append(check_field_regex(model_name, field_name, field.pattern, quoting_config)) - if field.enum is not None and len(field.enum) > 0: - checks.append(check_field_enum(model_name, field_name, field.enum, quoting_config)) - if field.quality is not None and len(field.quality) > 0: - quality_list = check_quality_list(model_name, field_name, field.quality, quoting_config) + quoting_config = config + + for prop in properties: + property_name = prop.name + logical_type = prop.logicalType + + checks.append(check_property_is_present(schema_name, property_name, quoting_config)) + if check_types and logical_type is not None: + sql_type: str = convert_to_sql_type(prop, server_type) + checks.append(check_property_type(schema_name, property_name, sql_type, quoting_config)) + if prop.required: + checks.append(check_property_required(schema_name, property_name, quoting_config)) + if prop.unique: + checks.append(check_property_unique(schema_name, property_name, quoting_config)) + + min_length = _get_logical_type_option(prop, "minLength") + if min_length is not None: + checks.append(check_property_min_length(schema_name, property_name, min_length, quoting_config)) + + max_length = _get_logical_type_option(prop, "maxLength") + if max_length is not None: + checks.append(check_property_max_length(schema_name, property_name, max_length, quoting_config)) + + minimum = _get_logical_type_option(prop, "minimum") + if minimum is not None: + checks.append(check_property_minimum(schema_name, property_name, minimum, quoting_config)) + + maximum = _get_logical_type_option(prop, "maximum") + if maximum is not None: + checks.append(check_property_maximum(schema_name, property_name, maximum, quoting_config)) + + exclusive_minimum = _get_logical_type_option(prop, "exclusiveMinimum") + if exclusive_minimum is not None: + checks.append(check_property_minimum(schema_name, property_name, exclusive_minimum, quoting_config)) + checks.append(check_property_not_equal(schema_name, property_name, exclusive_minimum, quoting_config)) + + exclusive_maximum = _get_logical_type_option(prop, "exclusiveMaximum") + if exclusive_maximum is not None: + checks.append(check_property_maximum(schema_name, property_name, exclusive_maximum, quoting_config)) + checks.append(check_property_not_equal(schema_name, property_name, exclusive_maximum, quoting_config)) + + pattern = _get_logical_type_option(prop, "pattern") + if pattern is not None: + checks.append(check_property_regex(schema_name, property_name, pattern, quoting_config)) + + enum_values = _get_logical_type_option(prop, "enum") + if enum_values is not None and 
len(enum_values) > 0: + checks.append(check_property_enum(schema_name, property_name, enum_values, quoting_config)) + + if prop.quality is not None and len(prop.quality) > 0: + quality_list = check_quality_list(schema_name, property_name, prop.quality, quoting_config, server) if (quality_list is not None) and len(quality_list) > 0: checks.extend(quality_list) - # TODO references: str = None - # TODO format - if model_value.quality is not None and len(model_value.quality) > 0: - quality_list = check_quality_list(model_name, None, model_value.quality) + if schema_object.quality is not None and len(schema_object.quality) > 0: + quality_list = check_quality_list(schema_name, None, schema_object.quality, quoting_config, server) if (quality_list is not None) and len(quality_list) > 0: checks.extend(quality_list) return checks -def checks_for(model_name, quote_model_name: bool): - if quote_model_name: +def checks_for(model_name: str, quoting_config: QuotingConfig, check_type: str) -> str: + if quoting_config.quote_model_name: return f'checks for "{model_name}"' + elif quoting_config.quote_model_name_with_backticks and check_type not in ["field_is_present", "field_type"]: + return f"checks for `{model_name}`" return f"checks for {model_name}" @@ -94,27 +152,25 @@ def is_check_types(server: Server) -> bool: return server.format != "json" and server.format != "csv" and server.format != "avro" -def to_model_name(model_key, model_value, server_type): - if server_type == "databricks": - if model_value.config is not None and "databricksTable" in model_value.config: - return model_value.config["databricksTable"] - if server_type == "snowflake": - if model_value.config is not None and "snowflakeTable" in model_value.config: - return model_value.config["snowflakeTable"] - if server_type == "sqlserver": - if model_value.config is not None and "sqlserverTable" in model_value.config: - return model_value.config["sqlserverTable"] - if server_type == "postgres" or server_type == "postgresql": - if model_value.config is not None and "postgresTable" in model_value.config: - return model_value.config["postgresTable"] - return model_key +def to_schema_name(schema_object: SchemaObject, server_type: str) -> str: + # For Kafka, use name (not physicalName) since the Spark SQL view uses schema name + # physicalName in Kafka represents the topic name, not the SQL view name + if server_type == "kafka": + return schema_object.name + + # Use physicalName if set (ODCS standard way to specify actual table name) + if schema_object.physicalName: + return schema_object.physicalName + return schema_object.name -def check_field_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check: + + +def check_property_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check: check_type = "field_is_present" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { "schema": { "name": check_key, @@ -139,13 +195,13 @@ def check_field_is_present(model_name, field_name, quoting_config: QuotingConfig ) -def check_field_type( +def check_property_type( model_name: str, field_name: str, expected_type: str, quoting_config: QuotingConfig = QuotingConfig() ): check_type = "field_type" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, 
quoting_config, check_type): [ { "schema": { "name": check_key, @@ -172,7 +228,7 @@ def check_field_type( ) -def check_field_required(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()): +def check_property_required(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()): if quoting_config.quote_field_name: field_name_for_soda = f'"{field_name}"' else: @@ -181,7 +237,7 @@ def check_field_required(model_name: str, field_name: str, quoting_config: Quoti check_type = "field_required" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"missing_count({field_name_for_soda}) = 0": { "name": check_key, @@ -203,7 +259,7 @@ def check_field_required(model_name: str, field_name: str, quoting_config: Quoti ) -def check_field_unique(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()): +def check_property_unique(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()): if quoting_config.quote_field_name: field_name_for_soda = f'"{field_name}"' else: @@ -212,7 +268,7 @@ def check_field_unique(model_name: str, field_name: str, quoting_config: Quoting check_type = "field_unique" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"duplicate_count({field_name_for_soda}) = 0": { "name": check_key, @@ -234,7 +290,7 @@ def check_field_unique(model_name: str, field_name: str, quoting_config: Quoting ) -def check_field_min_length( +def check_property_min_length( model_name: str, field_name: str, min_length: int, quoting_config: QuotingConfig = QuotingConfig() ): if quoting_config.quote_field_name: @@ -245,7 +301,7 @@ def check_field_min_length( check_type = "field_min_length" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"invalid_count({field_name_for_soda}) = 0": { "name": check_key, @@ -268,7 +324,7 @@ def check_field_min_length( ) -def check_field_max_length( +def check_property_max_length( model_name: str, field_name: str, max_length: int, quoting_config: QuotingConfig = QuotingConfig() ): if quoting_config.quote_field_name: @@ -279,7 +335,7 @@ def check_field_max_length( check_type = "field_max_length" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"invalid_count({field_name_for_soda}) = 0": { "name": check_key, @@ -302,7 +358,7 @@ def check_field_max_length( ) -def check_field_minimum( +def check_property_minimum( model_name: str, field_name: str, minimum: int, quoting_config: QuotingConfig = QuotingConfig() ): if quoting_config.quote_field_name: @@ -313,7 +369,7 @@ def check_field_minimum( check_type = "field_minimum" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"invalid_count({field_name_for_soda}) = 0": { "name": check_key, @@ -336,7 +392,7 @@ def check_field_minimum( ) -def check_field_maximum( +def check_property_maximum( model_name: str, field_name: str, 
maximum: int, quoting_config: QuotingConfig = QuotingConfig() ): if quoting_config.quote_field_name: @@ -347,7 +403,7 @@ def check_field_maximum( check_type = "field_maximum" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"invalid_count({field_name_for_soda}) = 0": { "name": check_key, @@ -370,7 +426,7 @@ def check_field_maximum( ) -def check_field_not_equal( +def check_property_not_equal( model_name: str, field_name: str, value: int, quoting_config: QuotingConfig = QuotingConfig() ): if quoting_config.quote_field_name: @@ -381,7 +437,7 @@ def check_field_not_equal( check_type = "field_not_equal" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"invalid_count({field_name_for_soda}) = 0": { "name": check_key, @@ -404,7 +460,7 @@ def check_field_not_equal( ) -def check_field_enum(model_name: str, field_name: str, enum: list, quoting_config: QuotingConfig = QuotingConfig()): +def check_property_enum(model_name: str, field_name: str, enum: list, quoting_config: QuotingConfig = QuotingConfig()): if quoting_config.quote_field_name: field_name_for_soda = f'"{field_name}"' else: @@ -413,7 +469,7 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quoting_confi check_type = "field_enum" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"invalid_count({field_name_for_soda}) = 0": { "name": check_key, @@ -436,7 +492,7 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quoting_confi ) -def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()): +def check_property_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()): if quoting_config.quote_field_name: field_name_for_soda = f'"{field_name}"' else: @@ -445,7 +501,7 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_co check_type = "field_regex" check_key = f"{model_name}__{field_name}__{check_type}" sodacl_check_dict = { - checks_for(model_name, quoting_config.quote_model_name): [ + checks_for(model_name, quoting_config, check_type): [ { f"invalid_count({field_name_for_soda}) = 0": { "name": check_key, @@ -468,22 +524,249 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_co ) +def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()): + check_type = "row_count" + check_key = f"{model_name}__{check_type}" + sodacl_check_dict = { + checks_for(model_name, quoting_config, check_type): [ + { + f"row_count {threshold}": {"name": check_key}, + } + ], + } + return Check( + id=str(uuid.uuid4()), + key=check_key, + category="schema", + type=check_type, + name=f"Check that model {model_name} has row_count {threshold}", + model=model_name, + field=None, + engine="soda", + language="sodacl", + implementation=yaml.dump(sodacl_check_dict), + ) + + +def check_model_duplicate_values( + model_name: str, cols: list[str], threshold: str, quoting_config: QuotingConfig = QuotingConfig() +): + check_type = "model_duplicate_values" + check_key = f"{model_name}__{check_type}" + col_joined = 
", ".join(cols) + sodacl_check_dict = { + checks_for(model_name, quoting_config, check_type): [ + { + f"duplicate_count({col_joined}) {threshold}": {"name": check_key}, + } + ], + } + return Check( + id=str(uuid.uuid4()), + key=check_key, + category="quality", + type=check_type, + name=f"Check that model {model_name} has duplicate_count {threshold} for columns {col_joined}", + model=model_name, + field=None, + engine="soda", + language="sodacl", + implementation=yaml.dump(sodacl_check_dict), + ) + + +def check_property_duplicate_values( + model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig() +): + if quoting_config.quote_field_name: + field_name_for_soda = f'"{field_name}"' + else: + field_name_for_soda = field_name + + check_type = "field_duplicate_values" + check_key = f"{model_name}__{field_name}__{check_type}" + sodacl_check_dict = { + checks_for(model_name, quoting_config, check_type): [ + { + f"duplicate_count({field_name_for_soda}) {threshold}": { + "name": check_key, + }, + } + ], + } + return Check( + id=str(uuid.uuid4()), + key=check_key, + category="quality", + type=check_type, + name=f"Check that field {field_name} has duplicate_count {threshold}", + model=model_name, + field=field_name, + engine="soda", + language="sodacl", + implementation=yaml.dump(sodacl_check_dict), + ) + + +def check_property_null_values( + model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig() +): + if quoting_config.quote_field_name: + field_name_for_soda = f'"{field_name}"' + else: + field_name_for_soda = field_name + + check_type = "field_null_values" + check_key = f"{model_name}__{field_name}__{check_type}" + sodacl_check_dict = { + checks_for(model_name, quoting_config, check_type): [ + { + f"missing_count({field_name_for_soda}) {threshold}": { + "name": check_key, + }, + } + ], + } + return Check( + id=str(uuid.uuid4()), + key=check_key, + category="quality", + type=check_type, + name=f"Check that field {field_name} has missing_count {threshold}", + model=model_name, + field=field_name, + engine="soda", + language="sodacl", + implementation=yaml.dump(sodacl_check_dict), + ) + + +def check_property_invalid_values( + model_name: str, + field_name: str, + threshold: str, + valid_values: list = None, + quoting_config: QuotingConfig = QuotingConfig(), +): + if quoting_config.quote_field_name: + field_name_for_soda = f'"{field_name}"' + else: + field_name_for_soda = field_name + + check_type = "field_invalid_values" + check_key = f"{model_name}__{field_name}__{check_type}" + + sodacl_check_config = { + "name": check_key, + } + + if valid_values is not None: + sodacl_check_config["valid values"] = valid_values + + sodacl_check_dict = { + checks_for(model_name, quoting_config, check_type): [ + { + f"invalid_count({field_name_for_soda}) {threshold}": sodacl_check_config, + } + ], + } + return Check( + id=str(uuid.uuid4()), + key=check_key, + category="quality", + type=check_type, + name=f"Check that field {field_name} has invalid_count {threshold}", + model=model_name, + field=field_name, + engine="soda", + language="sodacl", + implementation=yaml.dump(sodacl_check_dict), + ) + + +def check_property_missing_values( + model_name: str, + field_name: str, + threshold: str, + missing_values: list = None, + quoting_config: QuotingConfig = QuotingConfig(), +): + if quoting_config.quote_field_name: + field_name_for_soda = f'"{field_name}"' + else: + field_name_for_soda = field_name + + check_type = "field_missing_values" + 
check_key = f"{model_name}__{field_name}__{check_type}" + + sodacl_check_config = { + "name": check_key, + } + + if missing_values is not None: + filtered_missing_values = [v for v in missing_values if v is not None] + if filtered_missing_values: + sodacl_check_config["missing values"] = filtered_missing_values + + sodacl_check_dict = { + checks_for(model_name, quoting_config, check_type): [ + { + f"missing_count({field_name_for_soda}) {threshold}": sodacl_check_config, + } + ], + } + return Check( + id=str(uuid.uuid4()), + key=check_key, + category="quality", + type=check_type, + name=f"Check that field {field_name} has missing_count {threshold}", + model=model_name, + field=field_name, + engine="soda", + language="sodacl", + implementation=yaml.dump(sodacl_check_dict), + ) + + def check_quality_list( - model_name, field_name, quality_list: List[Quality], quoting_config: QuotingConfig = QuotingConfig() + schema_name, + property_name, + quality_list: List[DataQuality], + quoting_config: QuotingConfig = QuotingConfig(), + server: Server = None, ) -> List[Check]: checks: List[Check] = [] count = 0 for quality in quality_list: - if quality.type == "sql": - if field_name is None: - check_key = f"{model_name}__quality_sql_{count}" + if quality.type == "custom" and quality.engine == "soda" and quality.implementation: + # Custom SodaCL quality check with raw implementation + check_key = f"{schema_name}__quality_custom_{count}" + check_type = "quality_custom_soda" + checks.append( + Check( + id=str(uuid.uuid4()), + key=check_key, + category="quality", + type=check_type, + name=quality.description if quality.description is not None else "Custom SodaCL Check", + model=schema_name, + field=property_name, + engine="soda", + language="sodacl", + implementation=quality.implementation, + ) + ) + elif quality.type == "sql": + if property_name is None: + check_key = f"{schema_name}__quality_sql_{count}" check_type = "field_quality_sql" else: - check_key = f"{model_name}__{field_name}__quality_sql_{count}" + check_key = f"{schema_name}__{property_name}__quality_sql_{count}" check_type = "model_quality_sql" threshold = to_sodacl_threshold(quality) - query = prepare_query(quality, model_name, field_name, quoting_config) + query = prepare_query(quality, schema_name, property_name, quoting_config, server) if query is None: logger.warning(f"Quality check {check_key} has no query") continue @@ -492,9 +775,9 @@ def check_quality_list( continue if quoting_config.quote_model_name: - model_name_for_soda = f'"{model_name}"' + model_name_for_soda = f'"{schema_name}"' else: - model_name_for_soda = model_name + model_name_for_soda = schema_name sodacl_check_dict = { f"checks for {model_name_for_soda}": [ { @@ -512,20 +795,66 @@ def check_quality_list( category="quality", type=check_type, name=quality.description if quality.description is not None else "Quality Check", - model=model_name, - field=field_name, + model=schema_name, + field=property_name, engine="soda", language="sodacl", implementation=yaml.dump(sodacl_check_dict), ) ) + elif quality.metric is not None: + threshold = to_sodacl_threshold(quality) + + if threshold is None: + logger.warning(f"Quality metric {quality.metric} has no valid threshold") + continue + + if quality.metric == "rowCount": + checks.append(check_row_count(schema_name, threshold, quoting_config)) + elif quality.metric == "duplicateValues": + if property_name is None: + checks.append( + check_model_duplicate_values( + schema_name, quality.arguments.get("properties"), threshold, 
quoting_config + ) + ) + else: + checks.append(check_property_duplicate_values(schema_name, property_name, threshold, quoting_config)) + elif quality.metric == "nullValues": + if property_name is not None: + checks.append(check_property_null_values(schema_name, property_name, threshold, quoting_config)) + else: + logger.warning("Quality check nullValues is only supported at field level") + elif quality.metric == "invalidValues": + if property_name is not None: + valid_values = quality.arguments.get("validValues") if quality.arguments else None + checks.append( + check_property_invalid_values(schema_name, property_name, threshold, valid_values, quoting_config) + ) + else: + logger.warning("Quality check invalidValues is only supported at field level") + elif quality.metric == "missingValues": + if property_name is not None: + missing_values = quality.arguments.get("missingValues") if quality.arguments else None + checks.append( + check_property_missing_values(schema_name, property_name, threshold, missing_values, quoting_config) + ) + else: + logger.warning("Quality check missingValues is only supported at field level") + else: + logger.warning(f"Quality check {quality.metric} is not yet supported") + count += 1 return checks def prepare_query( - quality: Quality, model_name: str, field_name: str = None, quoting_config: QuotingConfig = QuotingConfig() + quality: DataQuality, + model_name: str, + field_name: str = None, + quoting_config: QuotingConfig = QuotingConfig(), + server: Server = None, ) -> str | None: if quality.query is None: return None @@ -541,34 +870,46 @@ def prepare_query( if quoting_config.quote_model_name: model_name_for_soda = f'"{model_name}"' + elif quoting_config.quote_model_name_with_backticks: + model_name_for_soda = f"`{model_name}`" else: model_name_for_soda = model_name - query = re.sub(r'["\']?\{model}["\']?', model_name_for_soda, query) - query = re.sub(r'["\']?{schema}["\']?', model_name_for_soda, query) - query = re.sub(r'["\']?{table}["\']?', model_name_for_soda, query) + query = re.sub(r'["\']?\$?\{model}["\']?', model_name_for_soda, query) + query = re.sub(r'["\']?\$?\{table}["\']?', model_name_for_soda, query) + + if server and server.schema_: + if quoting_config.quote_model_name: + schema_name_for_soda = f'"{server.schema_}"' + elif quoting_config.quote_model_name_with_backticks: + schema_name_for_soda = f"`{server.schema_}`" + else: + schema_name_for_soda = server.schema_ + query = re.sub(r'["\']?\$?\{schema}["\']?', schema_name_for_soda, query) + else: + query = re.sub(r'["\']?\$?\{schema}["\']?', model_name_for_soda, query) if field_name is not None: - query = re.sub(r'["\']?{field}["\']?', field_name_for_soda, query) - query = re.sub(r'["\']?{column}["\']?', field_name_for_soda, query) - query = re.sub(r'["\']?{property}["\']?', field_name_for_soda, query) + query = re.sub(r'["\']?\$?\{field}["\']?', field_name_for_soda, query) + query = re.sub(r'["\']?\$?\{column}["\']?', field_name_for_soda, query) + query = re.sub(r'["\']?\$?\{property}["\']?', field_name_for_soda, query) return query -def to_sodacl_threshold(quality: Quality) -> str | None: +def to_sodacl_threshold(quality: DataQuality) -> str | None: if quality.mustBe is not None: return f"= {quality.mustBe}" if quality.mustNotBe is not None: return f"!= {quality.mustNotBe}" if quality.mustBeGreaterThan is not None: return f"> {quality.mustBeGreaterThan}" - if quality.mustBeGreaterThanOrEqualTo is not None: - return f">= {quality.mustBeGreaterThanOrEqualTo}" + if quality.mustBeGreaterOrEqualTo is 
not None: + return f">= {quality.mustBeGreaterOrEqualTo}" if quality.mustBeLessThan is not None: return f"< {quality.mustBeLessThan}" - if quality.mustBeLessThanOrEqualTo is not None: - return f"<= {quality.mustBeLessThanOrEqualTo}" + if quality.mustBeLessOrEqualTo is not None: + return f"<= {quality.mustBeLessOrEqualTo}" if quality.mustBeBetween is not None: if len(quality.mustBeBetween) != 2: logger.warning( @@ -586,195 +927,202 @@ def to_sodacl_threshold(quality: Quality) -> str | None: return None -def to_servicelevel_checks(data_contract_spec: DataContractSpecification) -> List[Check]: +def _get_schema_by_name(data_contract: OpenDataContractStandard, name: str) -> Optional[SchemaObject]: + """Get a schema object by name from the data contract.""" + if data_contract.schema_ is None: + return None + return next((s for s in data_contract.schema_ if s.name == name), None) + + +def to_servicelevel_checks(data_contract: OpenDataContractStandard) -> List[Check]: checks: List[Check] = [] - if data_contract_spec.servicelevels is None: + if data_contract.slaProperties is None: return checks - if data_contract_spec.servicelevels.freshness is not None: - checks.append(to_servicelevel_freshness_check(data_contract_spec)) - if data_contract_spec.servicelevels.retention is not None: - checks.append(to_servicelevel_retention_check(data_contract_spec)) - # only return checks that are not None - return [check for check in checks if check is not None] + + for sla in data_contract.slaProperties: + if sla.property == "freshness": + check = to_sla_freshness_check(data_contract, sla) + if check is not None: + checks.append(check) + elif sla.property == "retention": + check = to_servicelevel_retention_check(data_contract, sla) + if check is not None: + checks.append(check) + + return checks -def to_servicelevel_freshness_check(data_contract_spec: DataContractSpecification) -> Check | None: - if data_contract_spec.servicelevels.freshness.timestampField is None: +def to_sla_freshness_check(data_contract: OpenDataContractStandard, sla) -> Check | None: + """Create a freshness check from an ODCS latency SLA property.""" + if sla.element is None: + logger.info("slaProperties.latency.element is not defined, skipping freshness check") return None - freshness_threshold = data_contract_spec.servicelevels.freshness.threshold - if freshness_threshold is None: - logger.info("servicelevel.freshness.threshold is not defined") + + if sla.value is None: + logger.info("slaProperties.latency.value is not defined, skipping freshness check") return None - if not ( - "d" in freshness_threshold - or "D" in freshness_threshold - or "h" in freshness_threshold - or "H" in freshness_threshold - or "m" in freshness_threshold - or "M" in freshness_threshold - ): - logger.info("servicelevel.freshness.threshold must be in days, hours, or minutes (e.g., PT1H, or 1h)") + # Parse element to get model and field (e.g., "my_table.field_three") + element = sla.element + if "." not in element: + logger.info("slaProperties.latency.element is not fully qualified (model.field), skipping freshness check") return None - timestamp_field_fully_qualified = data_contract_spec.servicelevels.freshness.timestampField - if "." 
not in timestamp_field_fully_qualified: - logger.info("servicelevel.freshness.timestampField is not fully qualified, skipping freshness check") + + if element.count(".") > 1: + logger.info("slaProperties.latency.element contains multiple dots, which is currently not supported") return None - if timestamp_field_fully_qualified.count(".") > 1: - logger.info( - "servicelevel.freshness.timestampField contains multiple dots, which is currently not supported, skipping freshness check" - ) + + model_name = element.split(".")[0] + field_name = element.split(".")[1] + + # Verify the model exists + schema = _get_schema_by_name(data_contract, model_name) + if schema is None: + logger.info(f"Model {model_name} not found in schema, skipping freshness check") return None - model_name = timestamp_field_fully_qualified.split(".")[0] - field_name = timestamp_field_fully_qualified.split(".")[1] - threshold = freshness_threshold - threshold = threshold.replace("P", "") - threshold = threshold.replace("T", "") - threshold = threshold.lower() - if model_name not in data_contract_spec.models: - logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping freshness check") + + # Build threshold from value and unit + unit = sla.unit.lower() if sla.unit else "d" + value = sla.value + + # Normalize unit to soda format (d, h, m) + if unit in ["d", "day", "days"]: + threshold = f"{value}d" + elif unit in ["h", "hr", "hour", "hours"]: + threshold = f"{value}h" + elif unit in ["m", "min", "minute", "minutes"]: + threshold = f"{value}m" + else: + logger.info(f"Unsupported unit {unit} for freshness check, must be days, hours, or minutes") return None check_type = "servicelevel_freshness" check_key = "servicelevel_freshness" - sodacl_check_dict = { - checks_for(model_name, False): [ + f"checks for {model_name}": [ { f"freshness({field_name}) < {threshold}": { "name": check_key, }, } - ] + ], } return Check( id=str(uuid.uuid4()), key=check_key, category="servicelevel", type=check_type, - name="Freshness", + name=f"Freshness of {model_name}.{field_name} < {threshold}", model=model_name, + field=field_name, engine="soda", language="sodacl", implementation=yaml.dump(sodacl_check_dict), ) -def to_servicelevel_retention_check(data_contract_spec) -> Check | None: - if data_contract_spec.servicelevels.retention is None: - return None - if data_contract_spec.servicelevels.retention.unlimited is True: +def to_servicelevel_retention_check(data_contract: OpenDataContractStandard, sla) -> Check | None: + """Create a retention check from an ODCS retention SLA property.""" + if sla.element is None: + logger.info("slaProperties.retention.element is not defined, skipping retention check") return None - if data_contract_spec.servicelevels.retention.timestampField is None: - logger.info("servicelevel.retention.timestampField is not defined") + + if sla.value is None: + logger.info("slaProperties.retention.value is not defined, skipping retention check") return None - if data_contract_spec.servicelevels.retention.period is None: - logger.info("servicelevel.retention.period is not defined") + + # Parse element to get model and field (e.g., "orders.processed_timestamp") + element = sla.element + if "." not in element: + logger.info("slaProperties.retention.element is not fully qualified (model.field), skipping retention check") return None - timestamp_field_fully_qualified = data_contract_spec.servicelevels.retention.timestampField - if "." 
not in timestamp_field_fully_qualified:
-        logger.info("servicelevel.retention.timestampField is not fully qualified, skipping retention check")
+
+    if element.count(".") > 1:
+        logger.info("slaProperties.retention.element contains multiple dots, which is currently not supported")
         return None
-    if timestamp_field_fully_qualified.count(".") > 1:
-        logger.info(
-            "servicelevel.retention.timestampField contains multiple dots, which is currently not supported, skipping retention check"
-        )
+
+    model_name = element.split(".")[0]
+    field_name = element.split(".")[1]
+
+    # Verify the model exists
+    schema = _get_schema_by_name(data_contract, model_name)
+    if schema is None:
+        logger.info(f"Model {model_name} not found in schema, skipping retention check")
         return None
-    model_name = timestamp_field_fully_qualified.split(".")[0]
-    field_name = timestamp_field_fully_qualified.split(".")[1]
-    period = data_contract_spec.servicelevels.retention.period
-    period_in_seconds = period_to_seconds(period)
-    if model_name not in data_contract_spec.models:
-        logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping retention check")
+    # Parse ISO 8601 duration to seconds
+    retention_period = sla.value
+    seconds = _parse_iso8601_to_seconds(retention_period)
+    if seconds is None:
+        logger.info(f"Could not parse retention period {retention_period}, skipping retention check")
         return None
+
     check_type = "servicelevel_retention"
     check_key = "servicelevel_retention"
     sodacl_check_dict = {
-        checks_for(model_name, False): [
+        f"checks for {model_name}": [
             {
-                f"orders_servicelevel_retention < {period_in_seconds}": {
-                    "orders_servicelevel_retention expression": f"TIMESTAMPDIFF(SECOND, MIN({field_name}), CURRENT_TIMESTAMP)",
-                    "name": check_key,
-                }
-            },
-        ]
+                f"{model_name}_servicelevel_retention < {seconds}": {
+                    "name": check_key,
+                    f"{model_name}_servicelevel_retention expression": f"TIMESTAMPDIFF(SECOND, MIN({field_name}), CURRENT_TIMESTAMP)",
+                },
+            }
+        ],
     }
     return Check(
         id=str(uuid.uuid4()),
         key=check_key,
         category="servicelevel",
         type=check_type,
-        name=f"Retention: Oldest entry has a max age of {period}",
+        name=f"Retention of {model_name}.{field_name} < {seconds}s",
         model=model_name,
+        field=field_name,
         engine="soda",
         language="sodacl",
         implementation=yaml.dump(sodacl_check_dict),
     )


-def period_to_seconds(period: str) -> int | None:
+def _parse_iso8601_to_seconds(duration: str) -> int | None:
+    """Parse ISO 8601 duration to seconds."""
+    if not duration:
+        return None
+
+    duration = duration.upper()
+
+    # Simple patterns: P1Y, P1M, P1D, PT1H, PT1M, PT1S
+    # For simplicity, we support only single unit durations
     import re

-    # if period is None:
-    #     return None
-    # if period is in form "30d" or "24h" or "60m"
-    if re.match(r"^\d+[dhm]$", period):
-        if period[-1] == "d":
-            return int(period[:-1]) * 86400
-        if period[-1] == "h":
-            return int(period[:-1]) * 3600
-        if period[-1] == "m":
-            return int(period[:-1]) * 60
-    # if it is in iso period format (do not use isodate, can also be years)
-    iso_period_regex = re.compile(
-        r"P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?"
-        r"(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?"
- ) - match = iso_period_regex.match(period) + # Year + match = re.match(r"P(\d+)Y", duration) if match: - years = int(match.group("years") or 0) - months = int(match.group("months") or 0) - days = int(match.group("days") or 0) - hours = int(match.group("hours") or 0) - minutes = int(match.group("minutes") or 0) - seconds = int(match.group("seconds") or 0) - - # Convert everything to seconds - total_seconds = ( - years * 365 * 86400 # Approximate conversion of years to seconds - + months * 30 * 86400 # Approximate conversion of months to seconds - + days * 86400 - + hours * 3600 - + minutes * 60 - + seconds - ) - return total_seconds + return int(match.group(1)) * 365 * 24 * 60 * 60 - return None + # Month (approximate as 30 days) + match = re.match(r"P(\d+)M", duration) + if match: + return int(match.group(1)) * 30 * 24 * 60 * 60 + # Day + match = re.match(r"P(\d+)D", duration) + if match: + return int(match.group(1)) * 24 * 60 * 60 -# These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead -def to_quality_check(data_contract_spec) -> Check | None: - if data_contract_spec.quality is None: - return None - if data_contract_spec.quality.type is None: - return None - if data_contract_spec.quality.type.lower() != "sodacl": - return None - if isinstance(data_contract_spec.quality.specification, str): - quality_specification = yaml.safe_load(data_contract_spec.quality.specification) - else: - quality_specification = data_contract_spec.quality.specification + # Hour + match = re.match(r"PT(\d+)H", duration) + if match: + return int(match.group(1)) * 60 * 60 + + # Minute + match = re.match(r"PT(\d+)M", duration) + if match: + return int(match.group(1)) * 60 + + # Second + match = re.match(r"PT(\d+)S", duration) + if match: + return int(match.group(1)) + + return None - return Check( - id=str(uuid.uuid4()), - key="quality__sodacl", - category="quality", - type="quality", - name="Quality Check", - model=None, - engine="soda", - language="sodacl", - implementation=yaml.dump(quality_specification), - ) diff --git a/datacontract/engines/data_contract_test.py b/datacontract/engines/data_contract_test.py index 2b65e8d44..c35949550 100644 --- a/datacontract/engines/data_contract_test.py +++ b/datacontract/engines/data_contract_test.py @@ -4,11 +4,12 @@ import typing import requests -from duckdb.duckdb import DuckDBPyConnection +from open_data_contract_standard.model import OpenDataContractStandard, Server from datacontract.engines.data_contract_checks import create_checks if typing.TYPE_CHECKING: + from duckdb.duckdb import DuckDBPyConnection from pyspark.sql import SparkSession from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import ( @@ -16,70 +17,71 @@ ) from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema from datacontract.engines.soda.check_soda_execute import check_soda_execute -from datacontract.model.data_contract_specification import DataContractSpecification, Server from datacontract.model.exceptions import DataContractException from datacontract.model.run import ResultEnum, Run def execute_data_contract_test( - data_contract_specification: DataContractSpecification, + data_contract: OpenDataContractStandard, run: Run, server_name: str = None, spark: "SparkSession" = None, - duckdb_connection: DuckDBPyConnection = None, + duckdb_connection: "DuckDBPyConnection" = None, ): - if data_contract_specification.models is None or 
len(data_contract_specification.models) == 0: + if data_contract.schema_ is None or len(data_contract.schema_) == 0: raise DataContractException( type="lint", name="Check that data contract contains models", result=ResultEnum.warning, - reason="Models block is missing. Skip executing tests.", + reason="Schema block is missing. Skip executing tests.", engine="datacontract", ) if ( server_name is None - and data_contract_specification.servers is not None - and len(data_contract_specification.servers) > 0 + and data_contract.servers is not None + and len(data_contract.servers) > 0 ): - server_name = list(data_contract_specification.servers.keys())[0] - server = get_server(data_contract_specification, server_name) - run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}") - run.dataContractId = data_contract_specification.id - run.dataContractVersion = data_contract_specification.info.version - run.dataProductId = server.dataProductId - run.outputPortId = server.outputPortId + server_name = data_contract.servers[0].server + server = get_server(data_contract, server_name) + run.log_info(f"Running tests for data contract {data_contract.id} with server {server_name}") + run.dataContractId = data_contract.id + run.dataContractVersion = data_contract.version + run.dataProductId = data_contract.dataProduct + run.outputPortId = None # ODCS doesn't have outputPortId run.server = server_name if server.type == "api": server = process_api_response(run, server) - run.checks.extend(create_checks(data_contract_specification, server)) + run.checks.extend(create_checks(data_contract, server)) # TODO check server is supported type for nicer error messages # TODO check server credentials are complete for nicer error messages if server.format == "json" and server.type != "kafka": - check_jsonschema(run, data_contract_specification, server) - check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection) + check_jsonschema(run, data_contract, server) + check_soda_execute(run, data_contract, server, spark, duckdb_connection) -def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None: - """Get the server configuration from the data contract specification. +def get_server(data_contract: OpenDataContractStandard, server_name: str = None) -> Server | None: + """Get the server configuration from the data contract. Args: - data_contract_specification: The data contract specification + data_contract: The data contract server_name: Optional name of the server to use. If not provided, uses the first server. 
Returns: The selected server configuration """ - check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name) + check_that_datacontract_contains_valid_server_configuration(data_contract, server_name) + + if data_contract.servers is None: + return None if server_name is not None: - server = data_contract_specification.servers.get(server_name) + server = next((s for s in data_contract.servers if s.server == server_name), None) else: - server_name = list(data_contract_specification.servers.keys())[0] - server = data_contract_specification.servers.get(server_name) + server = data_contract.servers[0] if data_contract.servers else None return server @@ -103,11 +105,10 @@ def process_api_response(run, server): with open(f"{tmp_dir.name}/api_response.json", "w") as f: f.write(response.text) run.log_info(f"Saved API response to {tmp_dir.name}/api_response.json") - server = Server( + new_server = Server( + server="api_local", type="local", format="json", path=f"{tmp_dir.name}/api_response.json", - dataProductId=server.dataProductId, - outputPortId=server.outputPortId, ) - return server + return new_server diff --git a/datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py b/datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py index 780f402be..082938b57 100644 --- a/datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +++ b/datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py @@ -1,9 +1,10 @@ -from datacontract.model.data_contract_specification import DataContractSpecification +from open_data_contract_standard.model import OpenDataContractStandard + from datacontract.model.exceptions import DataContractException def check_that_datacontract_contains_valid_server_configuration( - data_contract: DataContractSpecification, server_name: str | None + data_contract: OpenDataContractStandard, server_name: str | None ): if data_contract.servers is None or len(data_contract.servers) == 0: raise DataContractException( @@ -21,14 +22,17 @@ def check_that_datacontract_contains_valid_server_configuration( reason="Data contract contains multiple server configurations. Specify the server you want to test. Skip executing tests.", engine="datacontract", ) - if server_name is not None and server_name not in data_contract.servers: - raise DataContractException( - type="lint", - name="Check that data contract contains valid servers configuration", - result="warning", - reason=f"Cannot find server '{server_name}' in the data contract servers configuration. Skip executing tests.", - engine="datacontract", - ) + if server_name is not None: + # Check if server_name exists in the servers list + server_names = [s.server for s in data_contract.servers if s.server] + if server_name not in server_names: + raise DataContractException( + type="lint", + name="Check that data contract contains valid servers configuration", + result="warning", + reason=f"Cannot find server '{server_name}' in the data contract servers configuration. 
Skip executing tests.", + engine="datacontract", + ) # TODO check for server.type, if all required fields are present diff --git a/datacontract/engines/fastjsonschema/check_jsonschema.py b/datacontract/engines/fastjsonschema/check_jsonschema.py index 087171aea..d2971ce2f 100644 --- a/datacontract/engines/fastjsonschema/check_jsonschema.py +++ b/datacontract/engines/fastjsonschema/check_jsonschema.py @@ -7,10 +7,10 @@ import fastjsonschema from fastjsonschema import JsonSchemaValueException +from open_data_contract_standard.model import OpenDataContractStandard, Server from datacontract.engines.fastjsonschema.s3.s3_read_files import yield_s3_files -from datacontract.export.jsonschema_converter import to_jsonschema -from datacontract.model.data_contract_specification import DataContractSpecification, Server +from datacontract.export.jsonschema_exporter import to_jsonschema from datacontract.model.exceptions import DataContractException from datacontract.model.run import Check, ResultEnum, Run @@ -231,7 +231,7 @@ def process_s3_file(run, server, schema, model_name, validate): process_exceptions(run, exceptions) -def check_jsonschema(run: Run, data_contract: DataContractSpecification, server: Server): +def check_jsonschema(run: Run, data_contract: OpenDataContractStandard, server: Server): run.log_info("Running engine jsonschema") # Early exit conditions @@ -248,14 +248,15 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server: run.log_warn("jsonschema: Server format is not 'json'. Skip jsonschema checks.") return - if not data_contract.models: - run.log_warn("jsonschema: No models found. Skip jsonschema checks.") + if not data_contract.schema_: + run.log_warn("jsonschema: No schema found. Skip jsonschema checks.") return - for model_name, model in iter(data_contract.models.items()): + for schema_obj in data_contract.schema_: + model_name = schema_obj.name # Process the model run.log_info(f"jsonschema: Converting model {model_name} to JSON Schema") - schema = to_jsonschema(model_name, model) + schema = to_jsonschema(model_name, schema_obj) run.log_info(f"jsonschema: {schema}") validate = fastjsonschema.compile( diff --git a/datacontract/engines/soda/check_soda_execute.py b/datacontract/engines/soda/check_soda_execute.py index 4f4f02a5f..d71c77d89 100644 --- a/datacontract/engines/soda/check_soda_execute.py +++ b/datacontract/engines/soda/check_soda_execute.py @@ -3,31 +3,33 @@ import uuid from datacontract.engines.soda.connections.athena import to_athena_soda_configuration +from datacontract.engines.soda.connections.oracle import initialize_client_and_create_soda_configuration if typing.TYPE_CHECKING: + from duckdb.duckdb import DuckDBPyConnection from pyspark.sql import SparkSession -from duckdb.duckdb import DuckDBPyConnection +from open_data_contract_standard.model import OpenDataContractStandard, Server from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection +from datacontract.engines.soda.connections.impala import to_impala_soda_configuration from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration from 
datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_configuration from datacontract.engines.soda.connections.trino import to_trino_soda_configuration -from datacontract.export.sodacl_converter import to_sodacl_yaml -from datacontract.model.data_contract_specification import DataContractSpecification, Server +from datacontract.export.sodacl_exporter import to_sodacl_yaml from datacontract.model.run import Check, Log, ResultEnum, Run def check_soda_execute( run: Run, - data_contract: DataContractSpecification, + data_contract: OpenDataContractStandard, server: Server, spark: "SparkSession" = None, - duckdb_connection: DuckDBPyConnection = None, + duckdb_connection: "DuckDBPyConnection" = None, ): from soda.common.config_helper import ConfigHelper @@ -94,6 +96,18 @@ def check_soda_execute( logging.info("Use Spark to connect to data source") scan.add_spark_session(spark, data_source_name="datacontract-cli") scan.set_data_source_name("datacontract-cli") + + # ------------------------------------------------------------------ + # NEW: native Impala server type + # ------------------------------------------------------------------ + elif server.type == "impala": + run.log_info("Connecting to Impala via Soda engine") + soda_configuration_str = to_impala_soda_configuration(server) + scan.add_configuration_yaml_str(soda_configuration_str) + # data source name must match what we configure in to_impala_soda_configuration + scan.set_data_source_name("impala") + + elif server.type == "kafka": if spark is None: spark = create_spark_session() @@ -104,6 +118,10 @@ def check_soda_execute( soda_configuration_str = to_sqlserver_soda_configuration(server) scan.add_configuration_yaml_str(soda_configuration_str) scan.set_data_source_name(server.type) + elif server.type == "oracle": + soda_configuration_str = initialize_client_and_create_soda_configuration(server) + scan.add_configuration_yaml_str(soda_configuration_str) + scan.set_data_source_name(server.type) elif server.type == "trino": soda_configuration_str = to_trino_soda_configuration(server) scan.add_configuration_yaml_str(soda_configuration_str) @@ -213,4 +231,4 @@ def update_reason(check, c): # print(check.reason) break # Exit the loop once the desired block is found if "fail" in c["diagnostics"]: - check.reason = f"Value: {c['diagnostics']['value']} Fail: {c['diagnostics']['fail']}" + check.reason = f"Value: {c['diagnostics']['value']} Fail: {c['diagnostics']['fail']}" \ No newline at end of file diff --git a/datacontract/engines/soda/connections/athena.py b/datacontract/engines/soda/connections/athena.py index 8272e3e30..0de865dbf 100644 --- a/datacontract/engines/soda/connections/athena.py +++ b/datacontract/engines/soda/connections/athena.py @@ -71,7 +71,7 @@ def to_athena_soda_configuration(server): data_source["catalog"] = server.catalog if s3_session_token: - data_source["aws_session_token"] = s3_session_token + data_source["session_token"] = s3_session_token soda_configuration = {f"data_source {server.type}": data_source} diff --git a/datacontract/engines/soda/connections/duckdb_connection.py b/datacontract/engines/soda/connections/duckdb_connection.py index d7c6a9795..5e1365754 100644 --- a/datacontract/engines/soda/connections/duckdb_connection.py +++ b/datacontract/engines/soda/connections/duckdb_connection.py @@ -1,19 +1,32 @@ import os -from typing import Any, Dict +import re +from typing import TYPE_CHECKING, Any, List, Optional -import duckdb +from open_data_contract_standard.model import OpenDataContractStandard, 
SchemaObject, SchemaProperty, Server from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server +from datacontract.export.sql_type_converter import convert_to_duckdb from datacontract.model.run import Run +if TYPE_CHECKING: + import duckdb + + +def _import_duckdb(): + try: + import duckdb + return duckdb + except ImportError: + raise ImportError("duckdb is required for this server type. Install with: pip install datacontract-cli[duckdb]") + def get_duckdb_connection( - data_contract: DataContractSpecification, + data_contract: OpenDataContractStandard, server: Server, run: Run, - duckdb_connection: duckdb.DuckDBPyConnection | None = None, -) -> duckdb.DuckDBPyConnection: + duckdb_connection: "duckdb.DuckDBPyConnection | None" = None, +) -> "duckdb.DuckDBPyConnection": + duckdb = _import_duckdb() if duckdb_connection is None: con = duckdb.connect(database=":memory:") else: @@ -31,102 +44,142 @@ def get_duckdb_connection( if server.type == "azure": path = server.location setup_azure_connection(con, server) - for model_name, model in data_contract.models.items(): - model_path = path - if "{model}" in model_path: - model_path = model_path.format(model=model_name) - run.log_info(f"Creating table {model_name} for {model_path}") - - if server.format == "json": - json_format = "auto" - if server.delimiter == "new_line": - json_format = "newline_delimited" - elif server.delimiter == "array": - json_format = "array" - columns = to_json_types(model) - if columns is None: - con.sql(f""" - CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1); - """) - else: - con.sql( - f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);""" - ) - add_nested_views(con, model_name, model.fields) - elif server.format == "parquet": - con.sql(f""" - CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1); - """) - elif server.format == "csv": - columns = to_csv_types(model) - run.log_info("Using columns: " + str(columns)) - if columns is None: - con.sql( - f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);""" - ) - else: - con.sql( - f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});""" - ) - elif server.format == "delta": - con.sql("update extensions;") # Make sure we have the latest delta extension - con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""") - table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchdf() - if table_info is not None and not table_info.empty: - run.log_info(f"DuckDB Table Info: {table_info.to_string(index=False)}") + + if data_contract.schema_: + for schema_obj in data_contract.schema_: + model_name = schema_obj.name + model_path = path + if "{model}" in model_path: + model_path = model_path.format(model=model_name) + run.log_info(f"Creating table {model_name} for {model_path}") + + if server.format == "json": + json_format = "auto" + if server.delimiter == "new_line": + json_format = "newline_delimited" + elif server.delimiter == "array": + json_format = "array" + columns = to_json_types(schema_obj) + if columns is None: + con.sql(f""" + CREATE VIEW "{model_name}" AS SELECT * FROM 
read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1); + """) + else: + con.sql( + f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);""" + ) + add_nested_views(con, model_name, schema_obj.properties) + elif server.format == "parquet": + create_view_with_schema_union(con, schema_obj, model_path, "read_parquet", to_parquet_types) + elif server.format == "csv": + create_view_with_schema_union(con, schema_obj, model_path, "read_csv", to_csv_types) + elif server.format == "delta": + con.sql("update extensions;") # Make sure we have the latest delta extension + con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""") + table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchall() + if table_info: + run.log_info(f"DuckDB Table Info: {table_info}") return con -def to_csv_types(model) -> dict[Any, str | None] | None: - if model is None: +def create_view_with_schema_union(con, schema_obj: SchemaObject, model_path: str, read_function: str, type_converter): + """Create a view by unioning empty schema table with data files using union_by_name""" + converted_types = type_converter(schema_obj) + model_name = schema_obj.name + if converted_types: + # Create empty table with contract schema + columns_def = [f'"{col_name}" {col_type}' for col_name, col_type in converted_types.items()] + create_empty_table = f"""CREATE TABLE "{model_name}" ({', '.join(columns_def)});""" + con.sql(create_empty_table) + + # Read columns existing in both current data contract and data + intersecting_columns = con.sql(f"""SELECT column_name + FROM (DESCRIBE SELECT * FROM {read_function}('{model_path}', union_by_name=true, hive_partitioning=1)) + INTERSECT SELECT column_name + FROM information_schema.columns + WHERE table_name = '{model_name}'""").fetchall() + selected_columns = ', '.join([column[0] for column in intersecting_columns]) + + # Insert data into table by name, but only columns existing in contract and data + insert_data_sql = f"""INSERT INTO {model_name} BY NAME + (SELECT {selected_columns} FROM {read_function}('{model_path}', union_by_name=true, hive_partitioning=1));""" + con.sql(insert_data_sql) + else: + # Fallback + con.sql( + f"""CREATE VIEW "{model_name}" AS SELECT * FROM {read_function}('{model_path}', union_by_name=true, hive_partitioning=1);""" + ) + +def to_csv_types(schema_obj: SchemaObject) -> dict[Any, str | None] | None: + if schema_obj is None: return None columns = {} - # ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR'] - for field_name, field in model.fields.items(): - columns[field_name] = convert_to_duckdb_csv_type(field) + if schema_obj.properties: + for prop in schema_obj.properties: + columns[prop.name] = convert_to_duckdb_csv_type(prop) return columns +def to_parquet_types(schema_obj: SchemaObject) -> dict[Any, str | None] | None: + """Get proper SQL types for Parquet (preserves decimals, etc.)""" + if schema_obj is None: + return None + columns = {} + if schema_obj.properties: + for prop in schema_obj.properties: + columns[prop.name] = convert_to_duckdb(prop) + return columns -def to_json_types(model: Model) -> dict[Any, str | None] | None: - if model is None: +def to_json_types(schema_obj: SchemaObject) -> dict[Any, str | None] | None: + if schema_obj is None: return None columns = {} - for field_name, field in model.fields.items(): - columns[field_name] = convert_to_duckdb_json_type(field) + if 
schema_obj.properties: + for prop in schema_obj.properties: + columns[prop.name] = convert_to_duckdb_json_type(prop) return columns -def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None): +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property. Prefers physicalType for accurate type checking.""" + if prop.physicalType: + return prop.physicalType + if prop.logicalType: + return prop.logicalType + return None + + +def add_nested_views(con: "duckdb.DuckDBPyConnection", model_name: str, properties: List[SchemaProperty] | None): model_name = model_name.strip('"') - if fields is None: + if properties is None: return - for field_name, field in fields.items(): - if field.type is None or field.type.lower() not in ["array", "object"]: + for prop in properties: + prop_type = _get_type(prop) + if prop_type is None or prop_type.lower() not in ["array", "object"]: continue - field_type = field.type.lower() - if field_type == "array" and field.items is None: + field_type = prop_type.lower() + if field_type == "array" and prop.items is None: continue - elif field_type == "object" and field.fields is None: + elif field_type == "object" and (prop.properties is None or len(prop.properties) == 0): continue - nested_model_name = f"{model_name}__{field_name}" + nested_model_name = f"{model_name}__{prop.name}" max_depth = 2 if field_type == "array" else 1 - ## if parent field is not required, the nested objects may respolve + ## if parent field is not required, the nested objects may resolve ## to a row of NULLs -- but if the objects themselves have required ## fields, this will fail the check. - where = "" if field.required else f" WHERE {field_name} IS NOT NULL" + where = "" if prop.required else f" WHERE {prop.name} IS NOT NULL" con.sql(f""" CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS - SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where} + SELECT unnest({prop.name}, max_depth := {max_depth}) as {prop.name} FROM "{model_name}" {where} """) if field_type == "array": - add_nested_views(con, nested_model_name, field.items.fields) + add_nested_views(con, nested_model_name, prop.items.properties if prop.items else None) elif field_type == "object": - add_nested_views(con, nested_model_name, field.fields) + add_nested_views(con, nested_model_name, prop.properties) -def setup_s3_connection(con, server): +def setup_s3_connection(con, server: Server): s3_region = os.getenv("DATACONTRACT_S3_REGION") s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID") s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY") @@ -169,22 +222,8 @@ def setup_s3_connection(con, server): ); """) - # con.sql(f""" - # SET s3_region = '{s3_region}'; - # SET s3_access_key_id = '{s3_access_key_id}'; - # SET s3_secret_access_key = '{s3_secret_access_key}'; - # """) - # else: - # con.sql(""" - # RESET s3_region; - # RESET s3_access_key_id; - # RESET s3_secret_access_key; - # """) - # con.sql("RESET s3_session_token") - # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'")) - - -def setup_gcs_connection(con, server): + +def setup_gcs_connection(con, server: Server): key_id = os.getenv("DATACONTRACT_GCS_KEY_ID") secret = os.getenv("DATACONTRACT_GCS_SECRET") @@ -202,11 +241,14 @@ def setup_gcs_connection(con, server): """) -def setup_azure_connection(con, server): +def setup_azure_connection(con, server: Server): tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID") 
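
Aside for reviewers (not part of the patch): a minimal, standalone sketch of the unnest-based flattening that the rewritten add_nested_views generates for array/object properties. The table and column names are hypothetical; the snippet only assumes the duckdb package is installed and mirrors the generated CREATE VIEW ... unnest(..., max_depth := ...) statements.

    import duckdb

    # Hypothetical model: an "orders" table with an array-of-struct column,
    # flattened into a companion "<model>__<property>" view the same way
    # add_nested_views does for non-required nested properties.
    con = duckdb.connect(database=":memory:")
    con.sql("""
        CREATE TABLE orders AS
        SELECT 1 AS order_id, [{'sku': 'A-1', 'qty': 2}] AS line_items
    """)
    con.sql("""
        CREATE VIEW IF NOT EXISTS "orders__line_items" AS
        SELECT unnest(line_items, max_depth := 2) AS line_items
        FROM "orders" WHERE line_items IS NOT NULL
    """)
    print(con.sql('SELECT * FROM "orders__line_items"').fetchall())
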
client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID") client_secret = os.getenv("DATACONTRACT_AZURE_CLIENT_SECRET") - storage_account = server.storageAccount + storage_account = ( + to_azure_storage_account(server.location) if server.type == "azure" and "://" in server.location + else None + ) if tenant_id is None: raise ValueError("Error: Environment variable DATACONTRACT_AZURE_TENANT_ID is not set") @@ -239,3 +281,25 @@ def setup_azure_connection(con, server): CLIENT_SECRET '{client_secret}' ); """) + +def to_azure_storage_account(location: str) -> str | None: + """ + Converts a storage location string to extract the storage account name. + ODCS v3.0 has no explicit field for the storage account. It uses the location field, which is a URI. + This function parses a storage location string to identify and return the + storage account name. It handles two primary patterns: + 1. Protocol://containerName@storageAccountName + 2. Protocol://storageAccountName + :param location: The storage location string to parse, typically following + the format protocol://containerName@storageAccountName. or + protocol://storageAccountName. + :return: The extracted storage account name if found, otherwise None + """ + # to catch protocol://containerName@storageAccountName. pattern from location + match = re.search(r"(?<=@)([^.]*)", location, re.IGNORECASE) + if match: + return match.group() + else: + # to catch protocol://storageAccountName. pattern from location + match = re.search(r"(?<=//)(?!@)([^.]*)", location, re.IGNORECASE) + return match.group() if match else None diff --git a/datacontract/engines/soda/connections/impala.py b/datacontract/engines/soda/connections/impala.py new file mode 100644 index 000000000..72c21b2c6 --- /dev/null +++ b/datacontract/engines/soda/connections/impala.py @@ -0,0 +1,74 @@ +import os + +import yaml + + +def _get_bool_env(name: str, default: bool) -> bool: + """ + Helper to read a boolean from an environment variable. + + Accepts: 1/0, true/false, yes/no, on/off (case-insensitive). + """ + value = os.getenv(name) + if value is None: + return default + return value.strip().lower() in ("1", "true", "yes", "y", "on") + + +def to_impala_soda_configuration(server): + """ + Build a Soda configuration for an Impala data source. + + Expects the datacontract `server` block to have at least: + - type (e.g. "impala") + - host + - port (optional; defaults to 443 if not set) + + Credentials are taken from environment variables: + - DATACONTRACT_IMPALA_USERNAME + - DATACONTRACT_IMPALA_PASSWORD + + Connection behaviour can be overridden via: + - DATACONTRACT_IMPALA_USE_SSL (default: true) + - DATACONTRACT_IMPALA_AUTH_MECHANISM (default: "LDAP") + - DATACONTRACT_IMPALA_USE_HTTP_TRANSPORT (default: true) + - DATACONTRACT_IMPALA_HTTP_PATH (default: "cliservice") + """ + + port = getattr(server, "port", None) + if port is None: + port = 443 + + # Optional database / schema default, e.g. 
"edpdevs_scratch" + database = getattr(server, "database", None) + + use_ssl = _get_bool_env("DATACONTRACT_IMPALA_USE_SSL", True) + auth_mechanism = os.getenv("DATACONTRACT_IMPALA_AUTH_MECHANISM", "LDAP") + use_http_transport = _get_bool_env( + "DATACONTRACT_IMPALA_USE_HTTP_TRANSPORT", True + ) + http_path = os.getenv("DATACONTRACT_IMPALA_HTTP_PATH", "cliservice") + + connection = { + "host": server.host, + "port": str(port), + "username": os.getenv("DATACONTRACT_IMPALA_USERNAME"), + "password": os.getenv("DATACONTRACT_IMPALA_PASSWORD"), + "use_ssl": use_ssl, + "auth_mechanism": auth_mechanism, + "use_http_transport": use_http_transport, + "http_path": http_path, + } + + if database: + connection["database"] = database + + soda_configuration = { + f"data_source {server.type}": { + "type": "impala", + "connection": connection, + } + } + + soda_configuration_str = yaml.dump(soda_configuration) + return soda_configuration_str diff --git a/datacontract/engines/soda/connections/kafka.py b/datacontract/engines/soda/connections/kafka.py index 94c1ce24d..3f5eff673 100644 --- a/datacontract/engines/soda/connections/kafka.py +++ b/datacontract/engines/soda/connections/kafka.py @@ -2,9 +2,11 @@ import logging import os import tempfile +from typing import List, Optional -from datacontract.export.avro_converter import to_avro_schema_json -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty, Server + +from datacontract.export.avro_exporter import to_avro_schema_json from datacontract.model.exceptions import DataContractException from datacontract.model.run import ResultEnum @@ -33,6 +35,8 @@ def create_spark_session(): .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse") .config("spark.streaming.stopGracefullyOnShutdown", "true") .config("spark.ui.enabled", "false") + .config("spark.driver.bindAddress", "127.0.0.1") + .config("spark.driver.host", "127.0.0.1") .config( "spark.jars.packages", f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}", @@ -44,26 +48,37 @@ def create_spark_session(): return spark -def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server): +def read_kafka_topic(spark, data_contract: OpenDataContractStandard, server: Server): """Read and process data from a Kafka topic based on the server configuration.""" - logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic) + if not data_contract.schema_ or len(data_contract.schema_) == 0: + raise DataContractException( + type="test", + name="Configuring Kafka checks", + result="warning", + reason="No schema defined in data contract. 
Skip executing tests.", + engine="datacontract", + ) + + schema_obj = data_contract.schema_[0] + model_name = schema_obj.name + topic = schema_obj.physicalName or schema_obj.name + + logging.info("Reading data from Kafka server %s topic %s", server.host, topic) df = ( spark.read.format("kafka") .options(**get_auth_options()) .option("kafka.bootstrap.servers", server.host) - .option("subscribe", server.topic) + .option("subscribe", topic) .option("startingOffsets", "earliest") .load() ) - model_name, model = next(iter(data_contract.models.items())) - match server.format: case "avro": - process_avro_format(df, model_name, model) + process_avro_format(df, model_name, schema_obj) case "json": - process_json_format(df, model_name, model) + process_json_format(df, model_name, schema_obj) case _: raise DataContractException( type="test", @@ -74,7 +89,7 @@ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Se ) -def process_avro_format(df, model_name, model): +def process_avro_format(df, model_name: str, schema_obj: SchemaObject): try: from pyspark.sql.avro.functions import from_avro from pyspark.sql.functions import col, expr @@ -88,7 +103,7 @@ def process_avro_format(df, model_name, model): original_exception=e, ) - avro_schema = to_avro_schema_json(model_name, model) + avro_schema = to_avro_schema_json(model_name, schema_obj) df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)")) options = {"mode": "PERMISSIVE"} df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select( @@ -96,7 +111,7 @@ def process_avro_format(df, model_name, model): ).createOrReplaceTempView(model_name) -def process_json_format(df, model_name, model): +def process_json_format(df, model_name: str, schema_obj: SchemaObject): try: from pyspark.sql.functions import col, from_json except ImportError as e: @@ -109,7 +124,7 @@ def process_json_format(df, model_name, model): original_exception=e, ) - struct_type = to_struct_type(model.fields) + struct_type = to_struct_type(schema_obj.properties or []) df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select( from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json") ).select(col("json.*")).createOrReplaceTempView(model_name) @@ -154,7 +169,16 @@ def get_auth_options(): } -def to_struct_type(fields): +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property. 
Prefers physicalType for accurate type checking.""" + if prop.physicalType: + return prop.physicalType + if prop.logicalType: + return prop.logicalType + return None + + +def to_struct_type(properties: List[SchemaProperty]): try: from pyspark.sql.types import StructType except ImportError as e: @@ -168,10 +192,10 @@ def to_struct_type(fields): ) """Convert field definitions to Spark StructType.""" - return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()]) + return StructType([to_struct_field(prop.name, prop) for prop in properties]) -def to_struct_field(field_name: str, field: Field): +def to_struct_field(field_name: str, prop: SchemaProperty): try: from pyspark.sql.types import ( ArrayType, @@ -201,7 +225,8 @@ def to_struct_field(field_name: str, field: Field): ) """Map field definitions to Spark StructField using match-case.""" - match field.type: + field_type = _get_type(prop) + match field_type: case "string" | "varchar" | "text": data_type = StringType() case "number" | "decimal" | "numeric": @@ -223,23 +248,19 @@ def to_struct_field(field_name: str, field: Field): case "time": data_type = DataType() # Specific handling for time type case "object" | "record" | "struct": - data_type = StructType( - [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()] - ) + nested_props = prop.properties or [] + data_type = StructType([to_struct_field(p.name, p) for p in nested_props]) case "binary": data_type = BinaryType() case "array": - element_type = ( - StructType( - [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()] - ) - if field.fields - else DataType() - ) + if prop.items and prop.items.properties: + element_type = StructType([to_struct_field(p.name, p) for p in prop.items.properties]) + else: + element_type = DataType() data_type = ArrayType(element_type) case "null": data_type = NullType() case _: data_type = DataType() # Fallback generic DataType - return StructField(field_name, data_type, nullable=not field.required) + return StructField(field_name, data_type, nullable=not prop.required) diff --git a/datacontract/engines/soda/connections/oracle.py b/datacontract/engines/soda/connections/oracle.py new file mode 100644 index 000000000..81d085682 --- /dev/null +++ b/datacontract/engines/soda/connections/oracle.py @@ -0,0 +1,50 @@ +import os + +import yaml +from open_data_contract_standard.model import Server + + +def initialize_client_and_create_soda_configuration(server: Server) -> str: + import oracledb + + soda_config = to_oracle_soda_configuration(server) + + oracle_client_dir = os.getenv("DATACONTRACT_ORACLE_CLIENT_DIR") + if oracle_client_dir is not None: + # Soda Core currently does not support thick mode natively, see https://github.com/sodadata/soda-core/issues/2036 + # but the oracledb client can be configured accordingly before Soda initializes as a work-around + oracledb.init_oracle_client(lib_dir=oracle_client_dir) + + return soda_config + + +def to_oracle_soda_configuration(server: Server) -> str: + """Serialize server config to soda configuration. 
+ + + ### Example: + type: oracle + host: database-1.us-east-1.rds.amazonaws.com + port: '1521' + username: simple + password: simple_pass + connectstring: database-1.us-east-1.rds.amazonaws.com:1521/ORCL (database is equal to service name at oracle) + schema: SYSTEM + """ + + service_name = server.serviceName or server.database + # with service account key, using an external json file + soda_configuration = { + f"data_source {server.type}": { + "type": "oracle", + "host": server.host, + "port": str(server.port), + "username": os.getenv("DATACONTRACT_ORACLE_USERNAME", ""), + "password": os.getenv("DATACONTRACT_ORACLE_PASSWORD", ""), + "connectstring": f"{server.host}:{server.port}/{service_name}", + "schema": server.schema_, + } + } + + soda_configuration_str = yaml.dump(soda_configuration) + return soda_configuration_str diff --git a/datacontract/engines/soda/connections/sqlserver.py b/datacontract/engines/soda/connections/sqlserver.py index 8272bd322..72a3d9c58 100644 --- a/datacontract/engines/soda/connections/sqlserver.py +++ b/datacontract/engines/soda/connections/sqlserver.py @@ -1,8 +1,16 @@ import os import yaml +from open_data_contract_standard.model import Server -from datacontract.model.data_contract_specification import Server + +def _get_custom_property(server: Server, name: str) -> str | None: + """Get a custom property value from server.customProperties.""" + if server.customProperties: + for prop in server.customProperties: + if prop.property == name: + return prop.value + return None def to_sqlserver_soda_configuration(server: Server) -> str: @@ -35,7 +43,7 @@ def to_sqlserver_soda_configuration(server: Server) -> str: "trusted_connection": os.getenv("DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION", False), "trust_server_certificate": os.getenv("DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE", False), "encrypt": os.getenv("DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION", True), - "driver": server.driver, + "driver": _get_custom_property(server, "driver") or os.getenv("DATACONTRACT_SQLSERVER_DRIVER"), } } diff --git a/datacontract/export/avro_converter.py b/datacontract/export/avro_converter.py deleted file mode 100644 index f584216f8..000000000 --- a/datacontract/export/avro_converter.py +++ /dev/null @@ -1,138 +0,0 @@ -import json - -from datacontract.export.exporter import Exporter, _check_models_for_export -from datacontract.model.data_contract_specification import Field - - -class AvroExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) - return to_avro_schema_json(model_name, model_value) - - -def to_avro_schema(model_name, model) -> dict: - return to_avro_record(model_name, model.fields, model.description, model.namespace) - - -def to_avro_schema_json(model_name, model) -> str: - schema = to_avro_schema(model_name, model) - return json.dumps(schema, indent=2, sort_keys=False) - - -def to_avro_record(name, fields, description, namespace) -> dict: - schema = {"type": "record", "name": name} - if description is not None: - schema["doc"] = description - if namespace is not None: - schema["namespace"] = namespace - schema["fields"] = to_avro_fields(fields) - return schema - - -def to_avro_fields(fields): - result = [] - for field_name, field in fields.items(): - result.append(to_avro_field(field, field_name)) - return result - - -def to_avro_field(field, field_name): - avro_field = {"name": field_name} - if field.description is not 
None: - avro_field["doc"] = field.description - is_required_avro = field.required if field.required is not None else True - avro_type = to_avro_type(field, field_name) - avro_field["type"] = avro_type if is_required_avro else ["null", avro_type] - - # Handle enum types - both required and optional - if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]): - enum_def = { - "type": "enum", - "name": field.title, - "symbols": field.enum, - } - if is_required_avro: - avro_field["type"] = enum_def - else: - # Replace "enum" with the full enum definition in the union - avro_field["type"] = ["null", enum_def] - - if field.config: - if "avroDefault" in field.config: - if field.config.get("avroType") != "enum": - avro_field["default"] = field.config["avroDefault"] - - return avro_field - - -def to_avro_type(field: Field, field_name: str) -> str | dict: - if field.config: - if "avroLogicalType" in field.config and "avroType" in field.config: - return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]} - if "avroLogicalType" in field.config: - if field.config["avroLogicalType"] in [ - "timestamp-millis", - "timestamp-micros", - "local-timestamp-millis", - "local-timestamp-micros", - "time-micros", - ]: - return {"type": "long", "logicalType": field.config["avroLogicalType"]} - if field.config["avroLogicalType"] in ["time-millis", "date"]: - return {"type": "int", "logicalType": field.config["avroLogicalType"]} - if "avroType" in field.config: - return field.config["avroType"] - - # Check for enum fields based on presence of enum list and avroType config - if field.enum and field.config and field.config.get("avroType") == "enum": - return "enum" - - if field.type is None: - return "null" - if field.type in ["string", "varchar", "text"]: - return "string" - elif field.type in ["number", "numeric"]: - # https://avro.apache.org/docs/1.11.1/specification/#decimal - return "bytes" - elif field.type in ["decimal"]: - typeVal = {"type": "bytes", "logicalType": "decimal"} - if field.scale is not None: - typeVal["scale"] = field.scale - if field.precision is not None: - typeVal["precision"] = field.precision - return typeVal - elif field.type in ["float"]: - return "float" - elif field.type in ["double"]: - return "double" - elif field.type in ["integer", "int"]: - return "int" - elif field.type in ["long", "bigint"]: - return "long" - elif field.type in ["boolean"]: - return "boolean" - elif field.type in ["timestamp", "timestamp_tz"]: - return {"type": "long", "logicalType": "timestamp-millis"} - elif field.type in ["timestamp_ntz"]: - return {"type": "long", "logicalType": "local-timestamp-millis"} - elif field.type in ["date"]: - return {"type": "int", "logicalType": "date"} - elif field.type in ["time"]: - return "long" - elif field.type in ["map"]: - if field.config is not None and "values" in field.config: - return {"type": "map", "values": field.config["values"]} - else: - return "bytes" - elif field.type in ["object", "record", "struct"]: - if field.config is not None and "namespace" in field.config: - return to_avro_record(field_name, field.fields, field.description, field.config["namespace"]) - return to_avro_record(field_name, field.fields, field.description, None) - elif field.type in ["binary"]: - return "bytes" - elif field.type in ["array"]: - return {"type": "array", "items": to_avro_type(field.items, field_name)} - elif field.type in ["null"]: - return "null" - else: - return "bytes" diff --git 
a/datacontract/export/avro_exporter.py b/datacontract/export/avro_exporter.py new file mode 100644 index 000000000..b5eb8f307 --- /dev/null +++ b/datacontract/export/avro_exporter.py @@ -0,0 +1,253 @@ +import json +from typing import List, Optional, Union + +from open_data_contract_standard.model import SchemaObject, SchemaProperty + +from datacontract.export.exporter import Exporter, _check_schema_name_for_export + + +class AvroExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + model_name, model_value = _check_schema_name_for_export(data_contract, schema_name, self.export_format) + return to_avro_schema_json(model_name, model_value) + + +def to_avro_schema(model_name: str, model: SchemaObject) -> dict: + namespace = _get_config_value(model, "namespace") + return to_avro_record(model_name, model.properties or [], model.description, namespace) + + +def to_avro_schema_json(model_name: str, model: SchemaObject) -> str: + schema = to_avro_schema(model_name, model) + return json.dumps(schema, indent=2, sort_keys=False) + + +def to_avro_record(name: str, properties: List[SchemaProperty], description: Optional[str], namespace: Optional[str]) -> dict: + schema = {"type": "record", "name": name} + if description is not None: + schema["doc"] = description + if namespace is not None: + schema["namespace"] = namespace + schema["fields"] = to_avro_fields(properties) + return schema + + +def to_avro_fields(properties: List[SchemaProperty]) -> list: + result = [] + for prop in properties: + result.append(to_avro_field(prop)) + return result + + +def _get_config_value(obj: Union[SchemaObject, SchemaProperty], key: str) -> Optional[str]: + """Get a custom property value.""" + if obj.customProperties is None: + return None + for cp in obj.customProperties: + if cp.property == key: + return cp.value + return None + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def _get_enum_values(prop: SchemaProperty): + """Get enum values from logicalTypeOptions, customProperties, or quality rules.""" + import json + # First check logicalTypeOptions (legacy/direct ODCS) + enum_values = _get_logical_type_option(prop, "enum") + if enum_values: + return enum_values + # Then check customProperties (converted from DCS) + enum_str = _get_config_value(prop, "enum") + if enum_str: + try: + return json.loads(enum_str) + except (json.JSONDecodeError, TypeError): + pass + # Finally check quality rules for invalidValues with validValues + if prop.quality: + for q in prop.quality: + if q.metric == "invalidValues" and q.arguments: + valid_values = q.arguments.get("validValues") + if valid_values: + return valid_values + return None + + +def _parse_default_value(value: str): + """Parse a default value string to its proper type (bool, int, float, or string).""" + if value.lower() == "true": + return True + if value.lower() == "false": + return False + if value.lower() == "null": + return None + # Try parsing as int + try: + return int(value) + except ValueError: + pass + # Try parsing as float + try: + return float(value) + except ValueError: + pass + # Return as string + return value + + +def to_avro_field(prop: SchemaProperty) -> dict: + avro_field = {"name": prop.name} + if prop.description is not None: + avro_field["doc"] = prop.description + is_required_avro = prop.required if prop.required is not None else True + 
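
Aside for reviewers (not part of the patch): a minimal sketch of how the new exporter reads Avro hints from ODCS customProperties via _get_config_value and _parse_default_value. It assumes the patched datacontract.export.avro_exporter module is importable and that the pydantic models coerce plain dicts into customProperties entries; the property shown is hypothetical.

    from open_data_contract_standard.model import SchemaProperty

    from datacontract.export.avro_exporter import to_avro_field

    prop = SchemaProperty(
        name="is_active",
        logicalType="boolean",
        required=False,
        customProperties=[{"property": "avroDefault", "value": "true"}],
    )

    # Expected shape: {"name": "is_active", "type": ["null", "boolean"], "default": True}
    print(to_avro_field(prop))
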
avro_type = to_avro_type(prop) + avro_field["type"] = avro_type if is_required_avro else ["null", avro_type] + + # Handle enum types - both required and optional + enum_values = _get_enum_values(prop) + avro_config_type = _get_config_value(prop, "avroType") + + if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]): + title = prop.businessName or prop.name + enum_def = { + "type": "enum", + "name": title, + "symbols": enum_values or [], + } + if is_required_avro: + avro_field["type"] = enum_def + else: + # Replace "enum" with the full enum definition in the union + avro_field["type"] = ["null", enum_def] + + avro_default = _get_config_value(prop, "avroDefault") + if avro_default is not None: + if avro_config_type != "enum": + # Parse the default value to its proper type + avro_field["default"] = _parse_default_value(avro_default) + + return avro_field + + +def to_avro_type(prop: SchemaProperty) -> Union[str, dict]: + avro_logical_type = _get_config_value(prop, "avroLogicalType") + avro_type = _get_config_value(prop, "avroType") + + if avro_logical_type and avro_type: + return {"type": avro_type, "logicalType": avro_logical_type} + if avro_logical_type: + if avro_logical_type in [ + "timestamp-millis", + "timestamp-micros", + "local-timestamp-millis", + "local-timestamp-micros", + "time-micros", + ]: + return {"type": "long", "logicalType": avro_logical_type} + if avro_logical_type in ["time-millis", "date"]: + return {"type": "int", "logicalType": avro_logical_type} + if avro_type: + return avro_type + + # Check for enum fields based on presence of enum list and avroType config + enum_values = _get_enum_values(prop) + if enum_values and avro_type == "enum": + return "enum" + + # Use physicalType for more specific type mappings, fall back to logicalType + physical_type = prop.physicalType.lower() if prop.physicalType else None + field_type = prop.logicalType + + # Handle specific physical types that need special treatment + if physical_type in ["float"]: + return "float" + elif physical_type in ["double"]: + return "double" + elif physical_type in ["long", "bigint"]: + return "long" + elif physical_type in ["decimal"]: + typeVal = {"type": "bytes", "logicalType": "decimal"} + # Read precision/scale from customProperties + scale = _get_config_value(prop, "scale") + precision = _get_config_value(prop, "precision") + if scale is not None: + typeVal["scale"] = int(scale) + if precision is not None: + typeVal["precision"] = int(precision) + return typeVal + elif physical_type in ["map"]: + values_type = _get_config_value(prop, "values") + if values_type: + # Parse JSON array if values is a string like '["string", "long"]' + import json + try: + parsed_values = json.loads(values_type) + return {"type": "map", "values": parsed_values} + except (json.JSONDecodeError, TypeError): + return {"type": "map", "values": values_type} + else: + return "bytes" + elif physical_type in ["timestamp_ntz"]: + return {"type": "long", "logicalType": "local-timestamp-millis"} + + if field_type is None: + return "null" + if field_type.lower() in ["string", "varchar", "text"]: + return "string" + elif field_type.lower() in ["number", "numeric"]: + # https://avro.apache.org/docs/1.11.1/specification/#decimal + return "bytes" + elif field_type.lower() in ["decimal"]: + typeVal = {"type": "bytes", "logicalType": "decimal"} + # Read precision/scale from customProperties + scale = _get_config_value(prop, "scale") + precision = _get_config_value(prop, "precision") + if scale is not 
None: + typeVal["scale"] = int(scale) + if precision is not None: + typeVal["precision"] = int(precision) + return typeVal + elif field_type.lower() in ["float"]: + return "float" + elif field_type.lower() in ["double"]: + return "double" + elif field_type.lower() in ["integer", "int"]: + return "int" + elif field_type.lower() in ["long", "bigint"]: + return "long" + elif field_type.lower() in ["boolean"]: + return "boolean" + elif field_type.lower() in ["timestamp", "timestamp_tz"]: + return {"type": "long", "logicalType": "timestamp-millis"} + elif field_type.lower() in ["timestamp_ntz"]: + return {"type": "long", "logicalType": "local-timestamp-millis"} + elif field_type.lower() in ["date"]: + return {"type": "int", "logicalType": "date"} + elif field_type.lower() in ["time"]: + return "long" + elif field_type.lower() in ["map"]: + values_type = _get_config_value(prop, "values") + if values_type: + return {"type": "map", "values": values_type} + else: + return "bytes" + elif field_type.lower() in ["object", "record", "struct"]: + namespace = _get_config_value(prop, "namespace") + return to_avro_record(prop.name, prop.properties or [], prop.description, namespace) + elif field_type.lower() in ["binary"]: + return "bytes" + elif field_type.lower() in ["array"]: + if prop.items: + return {"type": "array", "items": to_avro_type(prop.items)} + return {"type": "array", "items": "string"} + elif field_type.lower() in ["null"]: + return "null" + else: + return "bytes" diff --git a/datacontract/export/avro_idl_converter.py b/datacontract/export/avro_idl_exporter.py similarity index 52% rename from datacontract/export/avro_idl_converter.py rename to datacontract/export/avro_idl_exporter.py index e3cfa5b9a..72fd71a99 100644 --- a/datacontract/export/avro_idl_converter.py +++ b/datacontract/export/avro_idl_exporter.py @@ -2,10 +2,12 @@ from dataclasses import dataclass from enum import Enum from io import StringIO +from typing import Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty from datacontract.export.exporter import Exporter from datacontract.lint.resolve import inline_definitions_into_data_contract -from datacontract.model.data_contract_specification import DataContractSpecification, Field from datacontract.model.exceptions import DataContractException @@ -63,35 +65,34 @@ class AvroIDLProtocol: model_types: list[AvroModelType] -# TODO use DATACONTRACT_TYPES from datacontract/model/data_contract_specification.py -avro_primitive_types = set( - [ - "string", - "text", - "varchar", - "float", - "double", - "int", - "integer", - "long", - "bigint", - "boolean", - "timestamp_ntz", - "timestamp", - "timestamp_tz", - "date", - "bytes", - "null", - ] -) +# ODCS logical types and physical types that map to Avro primitive types +avro_primitive_logical_types = {"string", "integer", "number", "boolean", "date"} +avro_primitive_physical_types = { + "string", + "text", + "varchar", + "float", + "double", + "int", + "integer", + "long", + "bigint", + "boolean", + "timestamp_ntz", + "timestamp", + "timestamp_tz", + "date", + "bytes", + "null", +} class AvroIdlExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: return to_avro_idl(data_contract) -def to_avro_idl(contract: DataContractSpecification) -> str: +def to_avro_idl(contract: OpenDataContractStandard) -> str: """Serialize the provided data 
contract specification into an Avro IDL string. The data contract will be serialized as a protocol, with one record type @@ -103,95 +104,165 @@ def to_avro_idl(contract: DataContractSpecification) -> str: return stream.getvalue() -def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO): +def to_avro_idl_stream(contract: OpenDataContractStandard, stream: typing.TextIO): """Serialize the provided data contract specification into Avro IDL.""" ir = _contract_to_avro_idl_ir(contract) if ir.description: - stream.write(f"/** {contract.info.description} */\n") + stream.write(f"/** {ir.description} */\n") stream.write(f"protocol {ir.name or 'Unnamed'} {{\n") for model_type in ir.model_types: _write_model_type(model_type, stream) stream.write("}\n") -def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField: - result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string) - match field.type: - case "string" | "text" | "varchar": +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the logical type from a schema property.""" + return prop.logicalType + + +def _is_primitive_type(prop: SchemaProperty) -> bool: + """Check if a property is a primitive type.""" + logical_type = _get_type(prop) + physical_type = prop.physicalType + + if logical_type in avro_primitive_logical_types: + return True + if physical_type and physical_type.lower() in avro_primitive_physical_types: + return True + return False + + +def _to_avro_primitive_logical_type(field_name: str, prop: SchemaProperty) -> AvroPrimitiveField: + result = AvroPrimitiveField(field_name, prop.required or False, prop.description, AvroPrimitiveType.string) + logical_type = _get_type(prop) + physical_type = prop.physicalType.lower() if prop.physicalType else None + + # Check physical type first for more specific mapping + if physical_type: + if physical_type in ["string", "text", "varchar"]: result.type = AvroPrimitiveType.string - case "float": + return result + elif physical_type == "float": result.type = AvroPrimitiveType.float - case "double": + return result + elif physical_type == "double": result.type = AvroPrimitiveType.double - case "int" | "integer": + return result + elif physical_type in ["int", "integer"]: result.type = AvroPrimitiveType.int - case "long" | "bigint": + return result + elif physical_type in ["long", "bigint"]: result.type = AvroPrimitiveType.long - case "boolean": + return result + elif physical_type == "boolean": result.type = AvroPrimitiveType.boolean - case "timestamp" | "timestamp_tz": + return result + elif physical_type in ["timestamp", "timestamp_tz"]: result.type = AvroPrimitiveType.string - case "timestamp_ntz": + return result + elif physical_type == "timestamp_ntz": result.type = AvroLogicalType.timestamp_ms - case "date": + return result + elif physical_type == "date": result.type = AvroLogicalType.date - case "bytes": + return result + elif physical_type == "bytes": result.type = AvroPrimitiveType.bytes - case "null": + return result + elif physical_type == "null": result.type = AvroPrimitiveType.null + return result + + # Fall back to logical type + match logical_type: + case "string": + result.type = AvroPrimitiveType.string + case "number": + result.type = AvroPrimitiveType.double + case "integer": + result.type = AvroPrimitiveType.long + case "boolean": + result.type = AvroPrimitiveType.boolean + case "date": + result.type = AvroLogicalType.date case _: raise DataContractException( type="general", 
name="avro-idl-export", - model=field, - reason="Unknown field type {field.type}", + model=prop, + reason=f"Unknown field type {logical_type}", result="failed", message="Avro IDL type conversion failed.", ) return result -def _to_avro_idl_type(field_name: str, field: Field) -> AvroField: - if field.type in avro_primitive_types: - return _to_avro_primitive_logical_type(field_name, field) +def _to_avro_idl_type(field_name: str, prop: SchemaProperty) -> AvroField: + if _is_primitive_type(prop): + return _to_avro_primitive_logical_type(field_name, prop) else: - match field.type: - case "array": + logical_type = _get_type(prop) + physical_type = prop.physicalType.lower() if prop.physicalType else None + + if logical_type == "array": + if prop.items: return AvroArrayField( - field_name, field.required, field.description, _to_avro_idl_type(field_name, field.items) + field_name, prop.required or False, prop.description, _to_avro_idl_type(field_name, prop.items) ) - case "object" | "record" | "struct": + else: + raise DataContractException( + type="general", + name="avro-idl-export", + model=prop, + reason="Array type requires items", + result="failed", + message="Avro IDL type conversion failed.", + ) + elif logical_type == "object" or physical_type in ["record", "struct"]: + if prop.properties: return AvroComplexField( field_name, - field.required, - field.description, - [_to_avro_idl_type(field_name, field) for (field_name, field) in field.fields.items()], + prop.required or False, + prop.description, + [_to_avro_idl_type(p.name, p) for p in prop.properties], ) - case _: + else: raise DataContractException( type="general", name="avro-idl-export", - model=type, - reason="Unknown Data Contract field type", + model=prop, + reason="Object type requires properties", result="failed", message="Avro IDL type conversion failed.", ) + else: + raise DataContractException( + type="general", + name="avro-idl-export", + model=prop, + reason=f"Unknown Data Contract field type: {logical_type}", + result="failed", + message="Avro IDL type conversion failed.", + ) -def _generate_field_types(contract: DataContractSpecification) -> list[AvroField]: +def _generate_field_types(schema_obj: SchemaObject) -> list[AvroField]: result = [] - for _, model in contract.models.items(): - for field_name, field in model.fields.items(): - result.append(_to_avro_idl_type(field_name, field)) + if schema_obj.properties: + for prop in schema_obj.properties: + result.append(_to_avro_idl_type(prop.name, prop)) return result -def generate_model_types(contract: DataContractSpecification) -> list[AvroModelType]: +def generate_model_types(contract: OpenDataContractStandard) -> list[AvroModelType]: result = [] - for model_name, model in contract.models.items(): - result.append( - AvroModelType(name=model_name, description=model.description, fields=_generate_field_types(contract)) - ) + if contract.schema_: + for schema_obj in contract.schema_: + result.append( + AvroModelType( + name=schema_obj.name, description=schema_obj.description, fields=_generate_field_types(schema_obj) + ) + ) return result @@ -199,18 +270,16 @@ def _model_name_to_identifier(model_name: str): return "".join([word.title() for word in model_name.split()]) -def _contract_to_avro_idl_ir(contract: DataContractSpecification) -> AvroIDLProtocol: +def _contract_to_avro_idl_ir(contract: OpenDataContractStandard) -> AvroIDLProtocol: """Convert models into an intermediate representation for later serialization into Avro IDL. 
Each model is converted to a record containing a field for each model field. """ inlined_contract = contract.model_copy() inline_definitions_into_data_contract(inlined_contract) - protocol_name = _model_name_to_identifier(contract.info.title) if contract.info and contract.info.title else None - description = contract.info.description if contract.info and contract.info.description else None - return AvroIDLProtocol( - name=protocol_name, description=description, model_types=generate_model_types(inlined_contract) - ) + protocol_name = _model_name_to_identifier(contract.name) if contract.name else None + description = contract.description.purpose if contract.description and contract.description.purpose else None + return AvroIDLProtocol(name=protocol_name, description=description, model_types=generate_model_types(inlined_contract)) def _write_indent(indent: int, stream: typing.TextIO): @@ -244,10 +313,10 @@ def _write_field_type_definition(field: AvroField, indent: int, stream: typing.T for subfield in subfields: subfield_types.append(_write_field_type_definition(subfield, indent + 1, stream)) # Reference all defined record types. - for field, subfield_type in zip(field.subfields, subfield_types): - _write_field_description(field, indent + 1, stream) + for subfield, subfield_type in zip(field.subfields, subfield_types): + _write_field_description(subfield, indent + 1, stream) _write_indent(indent + 1, stream) - stream.write(f"{subfield_type} {field.name};\n") + stream.write(f"{subfield_type} {subfield.name};\n") _write_indent(indent, stream) stream.write("}\n") if required is True: @@ -261,7 +330,7 @@ def _write_field_type_definition(field: AvroField, indent: int, stream: typing.T else: return f"array<{subfield_type}>?" case _: - raise RuntimeError("Unknown Avro field type {field}") + raise RuntimeError(f"Unknown Avro field type {field}") def _write_field(field: AvroField, indent, stream: typing.TextIO): diff --git a/datacontract/export/bigquery_converter.py b/datacontract/export/bigquery_converter.py deleted file mode 100644 index 71222d46c..000000000 --- a/datacontract/export/bigquery_converter.py +++ /dev/null @@ -1,130 +0,0 @@ -import json -import logging -from typing import Dict, List - -from datacontract.export.exporter import Exporter, _check_models_for_export -from datacontract.model.data_contract_specification import Field, Model, Server -from datacontract.model.exceptions import DataContractException - - -class BigQueryExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - self.dict_args = export_args - model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) - found_server = data_contract.servers.get(server) - if found_server is None: - raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.") - if found_server.type != "bigquery": - raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.") - - return to_bigquery_json(model_name, model_value, found_server) - - -def to_bigquery_json(model_name: str, model_value: Model, server: Server) -> str: - bigquery_table = to_bigquery_schema(model_name, model_value, server) - return json.dumps(bigquery_table, indent=2) - - -def to_bigquery_schema(model_name: str, model_value: Model, server: Server) -> dict: - return { - "kind": "bigquery#table", - "tableReference": {"datasetId": server.dataset, "projectId": server.project, "tableId": model_name}, - "description": 
model_value.description, - "schema": {"fields": to_fields_array(model_value.fields)}, - } - - -def to_fields_array(fields: Dict[str, Field]) -> List[Dict[str, Field]]: - bq_fields = [] - for field_name, field in fields.items(): - bq_fields.append(to_field(field_name, field)) - - return bq_fields - - -def to_field(field_name: str, field: Field) -> dict: - bq_type = map_type_to_bigquery(field) - bq_field = { - "name": field_name, - "type": bq_type, - "mode": "REQUIRED" if field.required else "NULLABLE", - "description": field.description, - } - - # handle arrays - if field.type == "array": - bq_field["mode"] = "REPEATED" - if field.items.type == "object": - # in case the array type is a complex object, we want to copy all its fields - bq_field["fields"] = to_fields_array(field.items.fields) - else: - bq_field["type"] = map_type_to_bigquery(field.items) - - # all of these can carry other fields - elif bq_type.lower() in ["record", "struct"]: - bq_field["fields"] = to_fields_array(field.fields) - - # strings can have a maxlength - if bq_type.lower() == "string": - bq_field["maxLength"] = field.maxLength - - # number types have precision and scale - if bq_type.lower() in ["numeric", "bignumeric"]: - bq_field["precision"] = field.precision - bq_field["scale"] = field.scale - - return bq_field - - -def map_type_to_bigquery(field: Field) -> str: - logger = logging.getLogger(__name__) - - field_type = field.type - if not field_type: - return None - - if field.config and "bigqueryType" in field.config: - return field.config["bigqueryType"] - - if field_type.lower() in ["string", "varchar", "text"]: - return "STRING" - elif field_type.lower() == "bytes": - return "BYTES" - elif field_type.lower() in ["int", "integer"]: - return "INTEGER" - elif field_type.lower() in ["long", "bigint"]: - return "INT64" - elif field_type.lower() == "float": - return "FLOAT64" - elif field_type.lower() == "boolean": - return "BOOL" - elif field_type.lower() in ["timestamp", "timestamp_tz"]: - return "TIMESTAMP" - elif field_type.lower() == "date": - return "DATE" - elif field_type.lower() == "timestamp_ntz": - return "DATETIME" - elif field_type.lower() in ["number", "decimal", "numeric"]: - return "NUMERIC" - elif field_type.lower() == "double": - return "BIGNUMERIC" - elif field_type.lower() in ["object", "record"] and not field.fields: - return "JSON" - elif field_type.lower() in ["object", "record", "array"]: - return "RECORD" - elif field_type.lower() == "struct": - return "STRUCT" - elif field_type.lower() == "null": - logger.info( - f"Can't properly map {field.title} to bigquery Schema, as 'null' \ - is not supported as a type. Mapping it to STRING." 
- ) - return "STRING" - else: - raise DataContractException( - type="schema", - result="failed", - name="Map datacontract type to bigquery data type", - reason=f"Unsupported type {field_type} in data contract definition.", - engine="datacontract", - ) diff --git a/datacontract/export/bigquery_exporter.py b/datacontract/export/bigquery_exporter.py new file mode 100644 index 000000000..ad91b2203 --- /dev/null +++ b/datacontract/export/bigquery_exporter.py @@ -0,0 +1,180 @@ +import json +import logging +from typing import Dict, List + +from open_data_contract_standard.model import SchemaObject, SchemaProperty, Server + +from datacontract.export.exporter import Exporter, _check_schema_name_for_export +from datacontract.model.exceptions import DataContractException + + +class BigQueryExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> str: + self.dict_args = export_args + schema_name, schema_object = _check_schema_name_for_export(data_contract, schema_name, self.export_format) + + # Find the server + found_server = None + if data_contract.servers: + for srv in data_contract.servers: + if srv.server == server: + found_server = srv + break + + if found_server is None: + raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.") + if found_server.type != "bigquery": + raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.") + + return to_bigquery_json(schema_name, schema_object, found_server) + + +def to_bigquery_json(schema_name: str, schema_object: SchemaObject, server: Server) -> str: + bigquery_table = to_bigquery_schema(schema_object, server) + return json.dumps(bigquery_table, indent=2) + + +def to_bigquery_schema(schema_object: SchemaObject, server: Server) -> dict: + return { + "kind": "bigquery#table", + "tableReference": {"datasetId": server.dataset, "projectId": server.project, "tableId": schema_object.physicalName or schema_object.name}, + "description": schema_object.description, + "schema": {"fields": to_bigquery_fields_array(schema_object.properties or [])}, + } + + +def to_bigquery_fields_array(properties: List[SchemaProperty]) -> List[Dict]: + bq_fields = [] + for prop in properties: + bq_fields.append(to_bigquery_field(prop)) + return bq_fields + + +def to_bigquery_field(prop: SchemaProperty) -> dict: + bq_type = map_type_to_bigquery(prop) + field_name = prop.physicalName or prop.name + bq_field = { + "name": field_name, + "type": bq_type, + "mode": "REQUIRED" if prop.required else "NULLABLE", + "description": prop.description, + } + + field_type = prop.logicalType or "" + + # handle arrays + if field_type.lower() == "array": + bq_field["mode"] = "REPEATED" + if prop.items: + items_type = prop.items.logicalType or "" + if items_type.lower() == "object": + # in case the array type is a complex object, we want to copy all its fields + bq_field["fields"] = to_bigquery_fields_array(prop.items.properties or []) + else: + bq_field["type"] = map_type_to_bigquery(prop.items) + + # all of these can carry other fields + elif bq_type.lower() in ["record", "struct"]: + bq_field["fields"] = to_bigquery_fields_array(prop.properties or []) + + # strings can have a maxlength + if bq_type.lower() == "string": + max_length = None + if prop.logicalTypeOptions: + max_length = prop.logicalTypeOptions.get("maxLength") + bq_field["maxLength"] = max_length + + # number types have precision and scale (from customProperties) + if bq_type.lower() in ["numeric", 
"bignumeric"]: + precision = _get_custom_property(prop, "precision") + scale = _get_custom_property(prop, "scale") + bq_field["precision"] = int(precision) if precision is not None else None + bq_field["scale"] = int(scale) if scale is not None else None + + return bq_field + + +def _get_custom_property(prop: SchemaProperty, key: str): + """Get a custom property value from a SchemaProperty.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def map_type_to_bigquery(prop: SchemaProperty) -> str: + """Map a schema property type to BigQuery type. + + Maps both physicalType and logicalType to their BigQuery equivalents. + PhysicalType is preferred if set. + """ + # If physicalType is already a BigQuery type, return it directly + if prop.physicalType: + bq_types = { + "STRING", "BYTES", "INT64", "INTEGER", "FLOAT64", "NUMERIC", + "BIGNUMERIC", "BOOL", "TIMESTAMP", "DATE", "TIME", "DATETIME", + "GEOGRAPHY", "JSON", "RECORD", "STRUCT", "ARRAY" + } + if prop.physicalType.upper() in bq_types or prop.physicalType.upper().startswith(("STRUCT<", "ARRAY<", "RANGE<")): + return prop.physicalType + + # Determine which type to map (prefer physicalType) + type_to_map = prop.physicalType or prop.logicalType + + # Map the type to BigQuery type + return _map_logical_type_to_bigquery(type_to_map, prop.properties) + + +def _map_logical_type_to_bigquery(logical_type: str, nested_fields) -> str: + """Map a logical type to the corresponding BigQuery type.""" + logger = logging.getLogger(__name__) + + if not logical_type: + return None + + if logical_type.lower() in ["string", "varchar", "text"]: + return "STRING" + elif logical_type.lower() == "json": + return "JSON" + elif logical_type.lower() == "bytes": + return "BYTES" + elif logical_type.lower() in ["int", "integer"]: + return "INTEGER" + elif logical_type.lower() in ["long", "bigint"]: + return "INT64" + elif logical_type.lower() == "float": + return "FLOAT64" + elif logical_type.lower() == "boolean": + return "BOOL" + elif logical_type.lower() in ["timestamp", "timestamp_tz"]: + return "TIMESTAMP" + elif logical_type.lower() == "date": + return "DATE" + elif logical_type.lower() == "timestamp_ntz": + return "DATETIME" + elif logical_type.lower() in ["number", "decimal", "numeric"]: + return "NUMERIC" + elif logical_type.lower() == "double": + return "BIGNUMERIC" + elif logical_type.lower() in ["object", "record"] and not nested_fields: + return "JSON" + elif logical_type.lower() in ["object", "record", "array"]: + return "RECORD" + elif logical_type.lower() == "struct": + return "STRUCT" + elif logical_type.lower() == "null": + logger.info( + "Can't properly map field to bigquery Schema, as 'null' " + "is not supported as a type. Mapping it to STRING." 
+ ) + return "STRING" + else: + raise DataContractException( + type="schema", + result="failed", + name="Map datacontract type to bigquery data type", + reason=f"Unsupported type {logical_type} in data contract definition.", + engine="datacontract", + ) diff --git a/datacontract/export/custom_converter.py b/datacontract/export/custom_exporter.py similarity index 77% rename from datacontract/export/custom_converter.py rename to datacontract/export/custom_exporter.py index fd31ae41c..bc8a03c25 100644 --- a/datacontract/export/custom_converter.py +++ b/datacontract/export/custom_exporter.py @@ -1,21 +1,18 @@ from pathlib import Path from jinja2 import Environment, FileSystemLoader +from open_data_contract_standard.model import OpenDataContractStandard from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Model, -) class CustomExporter(Exporter): - """Exporter implementation for converting data contracts to Markdown.""" + """Exporter implementation for converting data contracts to custom format with Jinja.""" def export( self, - data_contract: DataContractSpecification, - model: Model, + data_contract: OpenDataContractStandard, + schema_name: str, server: str, sql_server_type: str, export_args: dict, @@ -28,7 +25,7 @@ def export( return to_custom(data_contract, template) -def to_custom(data_contract: DataContractSpecification, template_path: Path) -> str: +def to_custom(data_contract: OpenDataContractStandard, template_path: Path) -> str: template = get_template(template_path) rendered_sql = template.render(data_contract=data_contract) return rendered_sql diff --git a/datacontract/export/data_caterer_converter.py b/datacontract/export/data_caterer_converter.py deleted file mode 100644 index 0a672411f..000000000 --- a/datacontract/export/data_caterer_converter.py +++ /dev/null @@ -1,161 +0,0 @@ -from typing import Dict - -import yaml - -from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server - - -class DataCatererExporter(Exporter): - """ - Exporter class for Data Caterer. - Creates a YAML file, based on the data contract, for Data Caterer to generate synthetic data. 
- """ - - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - return to_data_caterer_generate_yaml(data_contract, server) - - -def to_data_caterer_generate_yaml(data_contract_spec: DataContractSpecification, server): - generation_task = {"name": data_contract_spec.info.title, "steps": []} - server_info = _get_server_info(data_contract_spec, server) - - for model_key, model_value in data_contract_spec.models.items(): - odcs_table = _to_data_caterer_generate_step(model_key, model_value, server_info) - generation_task["steps"].append(odcs_table) - return yaml.dump(generation_task, indent=2, sort_keys=False, allow_unicode=True) - - -def _get_server_info(data_contract_spec: DataContractSpecification, server): - if server is not None and server in data_contract_spec.servers: - return data_contract_spec.servers.get(server) - elif server is not None: - raise Exception(f"Server name not found in servers list in data contract, server-name={server}") - elif len(data_contract_spec.servers.keys()) > 0: - return next(iter(data_contract_spec.servers.values())) - else: - return None - - -def _to_data_caterer_generate_step(model_key, model_value: Model, server: Server) -> dict: - step = { - "name": model_key, - "type": _to_step_type(server), - "options": _to_data_source_options(model_key, server), - "fields": [], - } - fields = _to_fields(model_value.fields) - if fields: - step["fields"] = fields - return step - - -def _to_step_type(server: Server): - if server is not None and server.type is not None: - if server.type in ["s3", "gcs", "azure", "local"]: - return server.format - else: - return server.type - else: - return "csv" - - -def _to_data_source_options(model_key, server: Server): - options = {} - if server is not None and server.type is not None: - if server.type in ["s3", "gcs", "azure", "local"]: - if server.path is not None: - options["path"] = server.path - elif server.location is not None: - options["path"] = server.location - else: - options["path"] = "/tmp/data_caterer_data" - elif server.type == "postgres": - options["schema"] = server.schema_ - options["table"] = model_key - elif server.type == "kafka": - options["topic"] = server.topic - - return options - - -def _to_fields(fields: Dict[str, Field]) -> list: - dc_fields = [] - for field_name, field in fields.items(): - column = _to_field(field_name, field) - dc_fields.append(column) - return dc_fields - - -def _to_field(field_name: str, field: Field) -> dict: - dc_field = {"name": field_name} - dc_generator_opts = {} - - if field.type is not None: - new_type = _to_data_type(field.type) - dc_field["type"] = _to_data_type(field.type) - if new_type == "object" or new_type == "record" or new_type == "struct": - # need to get nested field definitions - nested_fields = _to_fields(field.fields) - dc_field["fields"] = nested_fields - elif new_type == "array": - if field.items is not None and field.items.type is not None: - dc_generator_opts["arrayType"] = _to_data_type(field.items.type) - else: - dc_generator_opts["arrayType"] = "string" - - if field.enum is not None and len(field.enum) > 0: - dc_generator_opts["oneOf"] = field.enum - if field.unique is not None and field.unique: - dc_generator_opts["isUnique"] = field.unique - if field.primaryKey is not None and field.primaryKey: - dc_generator_opts["isPrimaryKey"] = field.primaryKey - if field.minLength is not None: - if field.type is not None and field.type == "array": - dc_generator_opts["arrayMinLen"] = field.minLength - else: - dc_generator_opts["minLen"] 
= field.minLength - if field.maxLength is not None: - if field.type is not None and field.type == "array": - dc_generator_opts["arrayMaxLen"] = field.maxLength - else: - dc_generator_opts["maxLen"] = field.maxLength - if field.pattern is not None: - dc_generator_opts["regex"] = field.pattern - if field.minimum is not None: - dc_generator_opts["min"] = field.minimum - if field.maximum is not None: - dc_generator_opts["max"] = field.maximum - - if len(dc_generator_opts.keys()) > 0: - dc_field["options"] = dc_generator_opts - return dc_field - - -def _to_data_type(data_type): - if data_type == "number" or data_type == "numeric" or data_type == "double": - return "double" - elif data_type == "decimal" or data_type == "bigint": - return "decimal" - elif data_type == "int" or data_type == "integer": - return "integer" - elif data_type == "long": - return "long" - elif data_type == "float": - return "float" - elif data_type == "string" or data_type == "text" or data_type == "varchar": - return "string" - if data_type == "boolean": - return "boolean" - if data_type == "timestamp" or data_type == "timestamp_tz" or data_type == "timestamp_ntz": - return "timestamp" - elif data_type == "date": - return "date" - elif data_type == "array": - return "array" - elif data_type == "map" or data_type == "object" or data_type == "record" or data_type == "struct": - return "struct" - elif data_type == "bytes": - return "binary" - else: - return "string" diff --git a/datacontract/export/data_caterer_exporter.py b/datacontract/export/data_caterer_exporter.py new file mode 100644 index 000000000..14c1ddfa2 --- /dev/null +++ b/datacontract/export/data_caterer_exporter.py @@ -0,0 +1,236 @@ +from typing import List, Optional + +import yaml +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty, Server + +from datacontract.export.exporter import Exporter + + +class DataCatererExporter(Exporter): + """ + Exporter class for Data Caterer. + Creates a YAML file, based on the data contract, for Data Caterer to generate synthetic data. 
+ """ + + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + return to_data_caterer_generate_yaml(data_contract, server) + + +def _get_server_by_name(data_contract: OpenDataContractStandard, name: str) -> Optional[Server]: + """Get a server by name.""" + if data_contract.servers is None: + return None + return next((s for s in data_contract.servers if s.server == name), None) + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property.""" + if prop.physicalType: + return prop.physicalType + if prop.logicalType: + return prop.logicalType + return None + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def _get_custom_property_value(prop: SchemaProperty, key: str): + """Get a custom property value.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def _get_enum_values(prop: SchemaProperty): + """Get enum values from logicalTypeOptions, customProperties, or quality rules.""" + import json + # First check logicalTypeOptions + enum_values = _get_logical_type_option(prop, "enum") + if enum_values: + return enum_values + # Then check customProperties + enum_str = _get_custom_property_value(prop, "enum") + if enum_str: + try: + if isinstance(enum_str, list): + return enum_str + return json.loads(enum_str) + except (json.JSONDecodeError, TypeError): + pass + # Finally check quality rules for invalidValues with validValues + if prop.quality: + for q in prop.quality: + if q.metric == "invalidValues" and q.arguments: + valid_values = q.arguments.get("validValues") + if valid_values: + return valid_values + return None + + +def to_data_caterer_generate_yaml(data_contract: OpenDataContractStandard, server): + generation_task = {"name": data_contract.name, "steps": []} + server_info = _get_server_info(data_contract, server) + + if data_contract.schema_: + for schema_obj in data_contract.schema_: + step = _to_data_caterer_generate_step(schema_obj.name, schema_obj, server_info) + generation_task["steps"].append(step) + return yaml.dump(generation_task, indent=2, sort_keys=False, allow_unicode=True) + + +def _get_server_info(data_contract: OpenDataContractStandard, server) -> Optional[Server]: + if server is not None: + found_server = _get_server_by_name(data_contract, server) + if found_server: + return found_server + raise Exception(f"Server name not found in servers list in data contract, server-name={server}") + elif data_contract.servers and len(data_contract.servers) > 0: + return data_contract.servers[0] + else: + return None + + +def _to_data_caterer_generate_step(model_key: str, schema_obj: SchemaObject, server: Optional[Server]) -> dict: + step = { + "name": model_key, + "type": _to_step_type(server), + "options": _to_data_source_options(model_key, server), + "fields": [], + } + fields = _to_fields(schema_obj.properties or []) + if fields: + step["fields"] = fields + return step + + +def _to_step_type(server: Optional[Server]): + if server is not None and server.type is not None: + if server.type in ["s3", "gcs", "azure", "local"]: + return server.format + else: + return server.type + else: + return "csv" + + +def _to_data_source_options(model_key: str, server: Optional[Server]): + options = {} + if server is not None and server.type is not None: + if server.type 
in ["s3", "gcs", "azure", "local"]: + if server.path is not None: + options["path"] = server.path + elif server.location is not None: + options["path"] = server.location + else: + options["path"] = "/tmp/data_caterer_data" + elif server.type == "postgres": + options["schema"] = server.schema_ + options["table"] = model_key + elif server.type == "kafka": + options["topic"] = server.topic + + return options + + +def _to_fields(properties: List[SchemaProperty]) -> list: + dc_fields = [] + for prop in properties: + column = _to_field(prop.name, prop) + dc_fields.append(column) + return dc_fields + + +def _to_field(field_name: str, prop: SchemaProperty) -> dict: + dc_field = {"name": field_name} + dc_generator_opts = {} + + prop_type = _get_type(prop) + if prop_type is not None: + new_type = _to_data_type(prop_type) + dc_field["type"] = new_type + if new_type in ["object", "record", "struct"]: + # need to get nested field definitions + nested_fields = _to_fields(prop.properties or []) + dc_field["fields"] = nested_fields + elif new_type == "array": + if prop.items is not None: + item_type = _get_type(prop.items) + if item_type is not None: + dc_generator_opts["arrayType"] = _to_data_type(item_type) + else: + dc_generator_opts["arrayType"] = "string" + else: + dc_generator_opts["arrayType"] = "string" + + enum_values = _get_enum_values(prop) + if enum_values is not None and len(enum_values) > 0: + dc_generator_opts["oneOf"] = enum_values + if prop.unique is not None and prop.unique: + dc_generator_opts["isUnique"] = prop.unique + if prop.primaryKey is not None and prop.primaryKey: + dc_generator_opts["isPrimaryKey"] = prop.primaryKey + + min_length = _get_logical_type_option(prop, "minLength") + max_length = _get_logical_type_option(prop, "maxLength") + pattern = _get_logical_type_option(prop, "pattern") + minimum = _get_logical_type_option(prop, "minimum") + maximum = _get_logical_type_option(prop, "maximum") + + if min_length is not None: + if prop_type is not None and prop_type.lower() == "array": + dc_generator_opts["arrayMinLen"] = min_length + else: + dc_generator_opts["minLen"] = min_length + if max_length is not None: + if prop_type is not None and prop_type.lower() == "array": + dc_generator_opts["arrayMaxLen"] = max_length + else: + dc_generator_opts["maxLen"] = max_length + if pattern is not None: + dc_generator_opts["regex"] = pattern + if minimum is not None: + dc_generator_opts["min"] = minimum + if maximum is not None: + dc_generator_opts["max"] = maximum + + if len(dc_generator_opts.keys()) > 0: + dc_field["options"] = dc_generator_opts + return dc_field + + +def _to_data_type(data_type): + data_type_lower = data_type.lower() if data_type else "" + if data_type_lower in ["number", "numeric", "double"]: + return "double" + elif data_type_lower in ["decimal", "bigint"]: + return "decimal" + elif data_type_lower in ["int", "integer"]: + return "integer" + elif data_type_lower == "long": + return "long" + elif data_type_lower == "float": + return "float" + elif data_type_lower in ["string", "text", "varchar"]: + return "string" + elif data_type_lower == "boolean": + return "boolean" + elif data_type_lower in ["timestamp", "timestamp_tz", "timestamp_ntz"]: + return "timestamp" + elif data_type_lower == "date": + return "date" + elif data_type_lower == "array": + return "array" + elif data_type_lower in ["map", "object", "record", "struct"]: + return "struct" + elif data_type_lower == "bytes": + return "binary" + else: + return "string" diff --git 
a/datacontract/export/dbml_converter.py b/datacontract/export/dbml_converter.py deleted file mode 100644 index 5c13ea319..000000000 --- a/datacontract/export/dbml_converter.py +++ /dev/null @@ -1,148 +0,0 @@ -from datetime import datetime -from importlib.metadata import version -from typing import Tuple - -import pytz - -import datacontract.model.data_contract_specification as spec -from datacontract.export.exporter import Exporter -from datacontract.export.sql_type_converter import convert_to_sql_type -from datacontract.model.exceptions import DataContractException - - -class DbmlExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - found_server = data_contract.servers.get(server) - return to_dbml_diagram(data_contract, found_server) - - -def to_dbml_diagram(contract: spec.DataContractSpecification, server: spec.Server) -> str: - result = "" - result += add_generated_info(contract, server) + "\n" - result += generate_project_info(contract) + "\n" - - for model_name, model in contract.models.items(): - table_description = generate_table(model_name, model, server) - result += f"\n{table_description}\n" - - return result - - -def add_generated_info(contract: spec.DataContractSpecification, server: spec.Server) -> str: - tz = pytz.timezone("UTC") - now = datetime.now(tz) - formatted_date = now.strftime("%b %d %Y") - datacontract_cli_version = get_version() - dialect = "Logical Datacontract" if server is None else server.type - - generated_info = """ -Generated at {0} by datacontract-cli version {1} -for datacontract {2} ({3}) version {4} -Using {5} Types for the field types - """.format( - formatted_date, datacontract_cli_version, contract.info.title, contract.id, contract.info.version, dialect - ) - - comment = """/* -{0} -*/ - """.format(generated_info) - return comment - - -def get_version() -> str: - try: - return version("datacontract_cli") - except Exception: - return "" - - -def generate_project_info(contract: spec.DataContractSpecification) -> str: - return """Project "{0}" {{ - Note: '''{1}''' -}}\n - """.format(contract.info.title, contract.info.description) - - -def generate_table(model_name: str, model: spec.Model, server: spec.Server) -> str: - result = """Table "{0}" {{ -Note: {1} - """.format(model_name, formatDescription(model.description)) - - references = [] - - for field_name, field in model.fields.items(): - ref, field_string = generate_field(field_name, field, model_name, server) - if ref is not None: - references.append(ref) - result += "{0}\n".format(field_string) - - result += "}\n" - - # and if any: add the references - if len(references) > 0: - for ref in references: - result += "Ref: {0}\n".format(ref) - - result += "\n" - - return result - - -def generate_field(field_name: str, field: spec.Field, model_name: str, server: spec.Server) -> Tuple[str, str]: - if field.primaryKey or field.primary: - if field.required is not None: - if not field.required: - raise DataContractException( - type="lint", - name="Primary key fields cannot have required == False.", - result="error", - reason="Primary key fields cannot have required == False.", - engine="datacontract", - ) - else: - field.required = True - if field.unique is not None: - if not field.unique: - raise DataContractException( - type="lint", - name="Primary key fields cannot have unique == False", - result="error", - reason="Primary key fields cannot have unique == False.", - engine="datacontract", - ) - else: - field.unique = True - - field_attrs = [] - if 
field.primaryKey or field.primary: - field_attrs.append("pk") - - if field.unique: - field_attrs.append("unique") - - if field.required: - field_attrs.append("not null") - else: - field_attrs.append("null") - - if field.description: - field_attrs.append("""Note: {0}""".format(formatDescription(field.description))) - - field_type = field.type if server is None else convert_to_sql_type(field, server.type) - - field_str = '"{0}" "{1}" [{2}]'.format(field_name, field_type, ",".join(field_attrs)) - ref_str = None - if (field.references) is not None: - if field.unique: - ref_str = "{0}.{1} - {2}".format(model_name, field_name, field.references) - else: - ref_str = "{0}.{1} > {2}".format(model_name, field_name, field.references) - return (ref_str, field_str) - - -def formatDescription(input: str) -> str: - if "\n" in input or "\r" in input or '"' in input: - return "'''{0}'''".format(input) - else: - return '"{0}"'.format(input) diff --git a/datacontract/export/dbml_exporter.py b/datacontract/export/dbml_exporter.py new file mode 100644 index 000000000..26dfb112c --- /dev/null +++ b/datacontract/export/dbml_exporter.py @@ -0,0 +1,173 @@ +from datetime import datetime +from importlib.metadata import version +from typing import Optional, Tuple + +import pytz +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty, Server + +from datacontract.export.exporter import Exporter +from datacontract.export.sql_type_converter import convert_to_sql_type +from datacontract.model.exceptions import DataContractException + + +class DbmlExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + found_server = _get_server_by_name(data_contract, server) if server else None + return to_dbml_diagram(data_contract, found_server) + + +def _get_server_by_name(data_contract: OpenDataContractStandard, name: str) -> Optional[Server]: + """Get a server by name.""" + if data_contract.servers is None: + return None + return next((s for s in data_contract.servers if s.server == name), None) + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property.""" + if prop.logicalType: + return prop.logicalType + if prop.physicalType: + return prop.physicalType + return None + + +def _get_references(prop: SchemaProperty) -> Optional[str]: + """Get references from a property's relationships.""" + if prop.relationships: + for rel in prop.relationships: + if hasattr(rel, 'to') and rel.to: + return rel.to + return None + + +def to_dbml_diagram(contract: OpenDataContractStandard, server: Optional[Server]) -> str: + result = "" + result += add_generated_info(contract, server) + "\n" + result += generate_project_info(contract) + "\n" + + if contract.schema_: + for schema_obj in contract.schema_: + table_description = generate_table(schema_obj.name, schema_obj, server) + result += f"\n{table_description}\n" + + return result + + +def add_generated_info(contract: OpenDataContractStandard, server: Optional[Server]) -> str: + tz = pytz.timezone("UTC") + now = datetime.now(tz) + formatted_date = now.strftime("%b %d %Y") + datacontract_cli_version = get_version() + dialect = "Logical Datacontract" if server is None else server.type + + return """/* +Generated at {0} by datacontract-cli version {1} +for datacontract {2} ({3}) version {4} +Using {5} Types for the field types +*/""".format( + formatted_date, datacontract_cli_version, contract.name, contract.id, contract.version, dialect + ) + + +def 
get_version() -> str: + try: + return version("datacontract_cli") + except Exception: + return "" + + +def generate_project_info(contract: OpenDataContractStandard) -> str: + description = "" + if contract.description: + if hasattr(contract.description, 'purpose') and contract.description.purpose: + description = contract.description.purpose + elif isinstance(contract.description, str): + description = contract.description + return """Project "{0}" {{ + note: '''{1}''' +}}""".format(contract.name or "", description) + + +def generate_table(model_name: str, schema_obj: SchemaObject, server: Optional[Server]) -> str: + result = "Table {0} {{\n note: {1}\n".format(model_name, formatDescription(schema_obj.description or "")) + + references = [] + + if schema_obj.properties: + for prop in schema_obj.properties: + ref, field_string = generate_field(prop.name, prop, model_name, server) + if ref is not None: + references.append(ref) + result += "{0}\n".format(field_string) + + result += "}" + + # and if any: add the references + if len(references) > 0: + result += "\n" + for ref in references: + result += "Ref: {0}\n".format(ref) + + return result + + +def generate_field(field_name: str, prop: SchemaProperty, model_name: str, server: Optional[Server]) -> Tuple[str, str]: + if prop.primaryKey: + if prop.required is not None: + if not prop.required: + raise DataContractException( + type="lint", + name="Primary key fields cannot have required == False.", + result="error", + reason="Primary key fields cannot have required == False.", + engine="datacontract", + ) + else: + prop.required = True + if prop.unique is not None: + if not prop.unique: + raise DataContractException( + type="lint", + name="Primary key fields cannot have unique == False", + result="error", + reason="Primary key fields cannot have unique == False.", + engine="datacontract", + ) + else: + prop.unique = True + + field_attrs = [] + if prop.primaryKey: + field_attrs.append("pk") + + if prop.unique: + field_attrs.append("unique") + + if prop.required: + field_attrs.append("not null") + else: + field_attrs.append("null") + + if prop.description: + field_attrs.append("""note: {0}""".format(formatDescription(prop.description))) + + prop_type = _get_type(prop) + field_type = prop_type if server is None else convert_to_sql_type(prop, server.type) + + field_str = ' {0} {1} [{2}]'.format(field_name, field_type, ", ".join(field_attrs)) + ref_str = None + references = _get_references(prop) + if references is not None: + if prop.unique: + ref_str = "{0}.{1} - {2}".format(model_name, field_name, references) + else: + ref_str = "{0}.{1} > {2}".format(model_name, field_name, references) + return (ref_str, field_str) + + +def formatDescription(input: str) -> str: + if "\n" in input or "\r" in input or '"' in input: + return "'''{0}'''".format(input) + else: + return '"{0}"'.format(input) diff --git a/datacontract/export/dbt_converter.py b/datacontract/export/dbt_converter.py deleted file mode 100644 index 3cf07afe3..000000000 --- a/datacontract/export/dbt_converter.py +++ /dev/null @@ -1,278 +0,0 @@ -from typing import Dict, Optional - -import yaml - -from datacontract.export.exporter import Exporter, _check_models_for_export -from datacontract.export.sql_type_converter import convert_to_sql_type -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model - - -class DbtExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - return 
to_dbt_models_yaml(data_contract, server) - - -class DbtSourceExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - return to_dbt_sources_yaml(data_contract, server) - - -class DbtStageExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) - return to_dbt_staging_sql( - data_contract, - model_name, - model_value, - ) - - -def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None) -> str: - dbt = { - "version": 2, - "models": [], - } - - for model_key, model_value in data_contract_spec.models.items(): - dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec, adapter_type=server) - dbt["models"].append(dbt_model) - return yaml.safe_dump(dbt, indent=2, sort_keys=False, allow_unicode=True) - - -def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str: - id = data_contract_spec.id - columns = [] - for field_name, field in model_value.fields.items(): - # TODO escape SQL reserved key words, probably dependent on server type - columns.append(field_name) - return f""" - select - {", ".join(columns)} - from {{{{ source('{id}', '{model_name}') }}}} -""" - - -def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: str = None): - source = {"name": data_contract_spec.id} - dbt = { - "version": 2, - "sources": [source], - } - if data_contract_spec.info.owner is not None: - source["meta"] = {"owner": data_contract_spec.info.owner} - if data_contract_spec.info.description is not None: - source["description"] = data_contract_spec.info.description.strip().replace("\n", " ") - found_server = data_contract_spec.servers.get(server) - adapter_type = None - if found_server is not None: - adapter_type = found_server.type - if adapter_type == "bigquery": - source["database"] = found_server.project - source["schema"] = found_server.dataset - else: - source["database"] = found_server.database - source["schema"] = found_server.schema_ - - source["tables"] = [] - for model_key, model_value in data_contract_spec.models.items(): - dbt_model = _to_dbt_source_table(data_contract_spec, model_key, model_value, adapter_type) - source["tables"].append(dbt_model) - return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True) - - -def _to_dbt_source_table( - data_contract_spec: DataContractSpecification, model_key, model_value: Model, adapter_type: Optional[str] -) -> dict: - dbt_model = { - "name": model_key, - } - - if model_value.description is not None: - dbt_model["description"] = model_value.description.strip().replace("\n", " ") - columns = _to_columns(data_contract_spec, model_value.fields, False, adapter_type) - if columns: - dbt_model["columns"] = columns - return dbt_model - - -def _to_dbt_model( - model_key, model_value: Model, data_contract_spec: DataContractSpecification, adapter_type: Optional[str] -) -> dict: - dbt_model = { - "name": model_key, - } - model_type = _to_dbt_model_type(model_value.type) - - dbt_model["config"] = {"meta": {"data_contract": data_contract_spec.id}} - - if model_type: - dbt_model["config"]["materialized"] = model_type - - if data_contract_spec.info.owner is not None: - dbt_model["config"]["meta"]["owner"] = data_contract_spec.info.owner - - if _supports_constraints(model_type): - dbt_model["config"]["contract"] = {"enforced": True} - if 
model_value.description is not None: - dbt_model["description"] = model_value.description.strip().replace("\n", " ") - columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type) - if columns: - dbt_model["columns"] = columns - return dbt_model - - -def _to_dbt_model_type(model_type): - # https://docs.getdbt.com/docs/build/materializations - # Allowed values: table, view, incremental, ephemeral, materialized view - # Custom values also possible - if model_type is None: - return None - if model_type.lower() == "table": - return "table" - if model_type.lower() == "view": - return "view" - return "table" - - -def _supports_constraints(model_type): - return model_type == "table" or model_type == "incremental" - - -def _to_columns( - data_contract_spec: DataContractSpecification, - fields: Dict[str, Field], - supports_constraints: bool, - adapter_type: Optional[str], -) -> list: - columns = [] - for field_name, field in fields.items(): - column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type) - columns.append(column) - return columns - - -def get_table_name_and_column_name(references: str) -> tuple[Optional[str], str]: - parts = references.split(".") - if len(parts) < 2: - return None, parts[0] - return parts[-2], parts[-1] - - -def _to_column( - data_contract_spec: DataContractSpecification, - field_name: str, - field: Field, - supports_constraints: bool, - adapter_type: Optional[str], -) -> dict: - column = {"name": field_name} - adapter_type = adapter_type or "snowflake" - dbt_type = convert_to_sql_type(field, adapter_type) - - column["data_tests"] = [] - if dbt_type is not None: - column["data_type"] = dbt_type - else: - column["data_tests"].append( - {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}} - ) - if field.description is not None: - column["description"] = field.description.strip().replace("\n", " ") - if field.required: - if supports_constraints: - column.setdefault("constraints", []).append({"type": "not_null"}) - else: - column["data_tests"].append("not_null") - if field.unique: - if supports_constraints: - column.setdefault("constraints", []).append({"type": "unique"}) - else: - column["data_tests"].append("unique") - if field.enum is not None and len(field.enum) > 0: - column["data_tests"].append({"accepted_values": {"values": field.enum}}) - if field.minLength is not None or field.maxLength is not None: - length_test = {} - if field.minLength is not None: - length_test["min_value"] = field.minLength - if field.maxLength is not None: - length_test["max_value"] = field.maxLength - column["data_tests"].append({"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}) - if field.pii is not None: - column.setdefault("meta", {})["pii"] = field.pii - if field.classification is not None: - column.setdefault("meta", {})["classification"] = field.classification - if field.tags is not None and len(field.tags) > 0: - column.setdefault("tags", []).extend(field.tags) - if field.pattern is not None: - # Beware, the data contract pattern is a regex, not a like pattern - column["data_tests"].append({"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}) - if ( - field.minimum is not None - or field.maximum is not None - and field.exclusiveMinimum is None - and field.exclusiveMaximum is None - ): - range_test = {} - if field.minimum is not None: - range_test["min_value"] = field.minimum - if field.maximum is 
not None: - range_test["max_value"] = field.maximum - column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test}) - elif ( - field.exclusiveMinimum is not None - or field.exclusiveMaximum is not None - and field.minimum is None - and field.maximum is None - ): - range_test = {} - if field.exclusiveMinimum is not None: - range_test["min_value"] = field.exclusiveMinimum - if field.exclusiveMaximum is not None: - range_test["max_value"] = field.exclusiveMaximum - range_test["strictly"] = True - column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test}) - else: - if field.minimum is not None: - column["data_tests"].append( - {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}} - ) - if field.maximum is not None: - column["data_tests"].append( - {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}} - ) - if field.exclusiveMinimum is not None: - column["data_tests"].append( - { - "dbt_expectations.expect_column_values_to_be_between": { - "min_value": field.exclusiveMinimum, - "strictly": True, - } - } - ) - if field.exclusiveMaximum is not None: - column["data_tests"].append( - { - "dbt_expectations.expect_column_values_to_be_between": { - "max_value": field.exclusiveMaximum, - "strictly": True, - } - } - ) - if field.references is not None: - ref_source_name = data_contract_spec.id - table_name, column_name = get_table_name_and_column_name(field.references) - if table_name is not None and column_name is not None: - column["data_tests"].append( - { - "relationships": { - "to": f"""source("{ref_source_name}", "{table_name}")""", - "field": f"{column_name}", - } - } - ) - - if not column["data_tests"]: - column.pop("data_tests") - - # TODO: all constraints - return column diff --git a/datacontract/export/dbt_exporter.py b/datacontract/export/dbt_exporter.py new file mode 100644 index 000000000..03293b0fc --- /dev/null +++ b/datacontract/export/dbt_exporter.py @@ -0,0 +1,394 @@ +from typing import List, Optional, Union + +import yaml +from open_data_contract_standard.model import Description, OpenDataContractStandard, SchemaObject, SchemaProperty + +from datacontract.export.exporter import Exporter, _check_schema_name_for_export +from datacontract.export.sql_type_converter import convert_to_sql_type + + +def _get_description_str(description: Union[str, Description, None]) -> Optional[str]: + """Extract description string from either a string or Description object.""" + if description is None: + return None + if isinstance(description, str): + return description.strip().replace("\n", " ") + # Description object - use purpose field + if hasattr(description, "purpose") and description.purpose: + return description.purpose.strip().replace("\n", " ") + return None + + +class DbtExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + return to_dbt_models_yaml(data_contract, server) + + +class DbtSourceExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + return to_dbt_sources_yaml(data_contract, server) + + +class DbtStageExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + model_name, model_value = _check_schema_name_for_export(data_contract, schema_name, self.export_format) + return to_dbt_staging_sql( + data_contract, + model_name, + model_value, + ) + + +def 
_get_custom_property_value(prop: SchemaProperty, key: str) -> Optional[str]: + """Get a custom property value.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def _get_enum_values(prop: SchemaProperty): + """Get enum values from logicalTypeOptions, customProperties, or quality rules.""" + import json + # First check logicalTypeOptions + enum_values = _get_logical_type_option(prop, "enum") + if enum_values: + return enum_values + # Then check customProperties + enum_str = _get_custom_property_value(prop, "enum") + if enum_str: + try: + if isinstance(enum_str, list): + return enum_str + return json.loads(enum_str) + except (json.JSONDecodeError, TypeError): + pass + # Finally check quality rules for invalidValues with validValues + if prop.quality: + for q in prop.quality: + if q.metric == "invalidValues" and q.arguments: + valid_values = q.arguments.get("validValues") + if valid_values: + return valid_values + return None + + +def _get_owner(data_contract: OpenDataContractStandard) -> Optional[str]: + """Get owner from team.""" + if data_contract.team is None: + return None + return data_contract.team.name + + +def _get_server_by_name(data_contract: OpenDataContractStandard, name: str): + """Get a server by name.""" + if data_contract.servers is None: + return None + return next((s for s in data_contract.servers if s.server == name), None) + + +def to_dbt_models_yaml(odcs: OpenDataContractStandard, server: str = None) -> str: + dbt = { + "version": 2, + "models": [], + } + + if odcs.schema_: + for schema_obj in odcs.schema_: + dbt_model = _to_dbt_model(schema_obj.name, schema_obj, odcs, adapter_type=server) + dbt["models"].append(dbt_model) + return yaml.safe_dump(dbt, indent=2, sort_keys=False, allow_unicode=True) + + +def to_dbt_staging_sql(odcs: OpenDataContractStandard, model_name: str, model_value: SchemaObject) -> str: + contract_id = odcs.id + columns = [] + if model_value.properties: + for prop in model_value.properties: + # TODO escape SQL reserved key words, probably dependent on server type + columns.append(prop.name) + return f""" + select + {", ".join(columns)} + from {{{{ source('{contract_id}', '{model_name}') }}}} +""" + + +def to_dbt_sources_yaml(odcs: OpenDataContractStandard, server: str = None): + source = {"name": odcs.id} + dbt = { + "version": 2, + "sources": [source], + } + owner = _get_owner(odcs) + if owner is not None: + source["meta"] = {"owner": owner} + desc_str = _get_description_str(odcs.description) + if desc_str is not None: + source["description"] = desc_str + + found_server = _get_server_by_name(odcs, server) if server else None + adapter_type = None + if found_server is not None: + adapter_type = found_server.type + if adapter_type == "bigquery": + source["database"] = found_server.project + source["schema"] = found_server.dataset + else: + source["database"] = found_server.database + source["schema"] = found_server.schema_ + + source["tables"] = [] + if odcs.schema_: + for schema_obj in odcs.schema_: + dbt_model = _to_dbt_source_table(odcs, schema_obj.name, schema_obj, adapter_type) + source["tables"].append(dbt_model) + return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True) + + +def _to_dbt_source_table( + odcs: OpenDataContractStandard, 
model_key: str, model_value: SchemaObject, adapter_type: Optional[str] +) -> dict: + dbt_model = { + "name": model_key, + } + + if model_value.description is not None: + dbt_model["description"] = model_value.description.strip().replace("\n", " ") + columns = _to_columns(odcs, model_value.properties or [], False, adapter_type) + if columns: + dbt_model["columns"] = columns + return dbt_model + + +def _to_dbt_model( + schema_name: str, schema_object: SchemaObject, odcs: OpenDataContractStandard, adapter_type: Optional[str] +) -> dict: + dbt_model = { + "name": schema_name, + } + model_type = _to_dbt_model_type(schema_object.physicalType) + + dbt_model["config"] = {"meta": {"data_contract": odcs.id}} + + if model_type: + dbt_model["config"]["materialized"] = model_type + + owner = _get_owner(odcs) + if owner is not None: + dbt_model["config"]["meta"]["owner"] = owner + + if _supports_constraints(model_type): + dbt_model["config"]["contract"] = {"enforced": True} + if schema_object.description is not None: + dbt_model["description"] = schema_object.description.strip().replace("\n", " ") + + # Handle model-level primaryKey from properties + primary_key_columns = [] + if schema_object.properties: + for prop in schema_object.properties: + if prop.primaryKey: + primary_key_columns.append(prop.name) + + if len(primary_key_columns) > 1: + # Multiple columns: use dbt_utils.unique_combination_of_columns + dbt_model["data_tests"] = [ + {"dbt_utils.unique_combination_of_columns": {"combination_of_columns": primary_key_columns}} + ] + + columns = _to_columns( + odcs, schema_object.properties or [], _supports_constraints(model_type), adapter_type, primary_key_columns + ) + if columns: + dbt_model["columns"] = columns + + return dbt_model + + +def _to_dbt_model_type(model_type: Optional[str]): + # https://docs.getdbt.com/docs/build/materializations + # Allowed values: table, view, incremental, ephemeral, materialized view + # Custom values also possible + if model_type is None: + return None + if model_type.lower() == "table": + return "table" + if model_type.lower() == "view": + return "view" + return "table" + + +def _supports_constraints(model_type: Optional[str]) -> bool: + return model_type == "table" or model_type == "incremental" + + +def _to_columns( + odcs: OpenDataContractStandard, + properties: List[SchemaProperty], + supports_constraints: bool, + adapter_type: Optional[str], + primary_key_columns: Optional[list] = None, +) -> list: + columns = [] + primary_key_columns = primary_key_columns or [] + is_single_pk = len(primary_key_columns) == 1 + for prop in properties: + is_primary_key = prop.name in primary_key_columns + # Only pass is_primary_key for unique constraint if it's a single-column PK + # Composite PKs use unique_combination_of_columns at model level instead + column = _to_column(odcs, prop, supports_constraints, adapter_type, is_primary_key, is_single_pk) + columns.append(column) + return columns + + +def get_table_name_and_column_name(references: str) -> tuple: + parts = references.split(".") + if len(parts) < 2: + return None, parts[0] + return parts[-2], parts[-1] + + +def _to_column( + data_contract: OpenDataContractStandard, + prop: SchemaProperty, + supports_constraints: bool, + adapter_type: Optional[str], + is_primary_key: bool = False, + is_single_pk: bool = False, +) -> dict: + column = {"name": prop.name} + adapter_type = adapter_type or "snowflake" + dbt_type = convert_to_sql_type(prop, adapter_type) + + column["data_tests"] = [] + if dbt_type is not None: + 
column["data_type"] = dbt_type + else: + column["data_tests"].append( + {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}} + ) + if prop.description is not None: + column["description"] = prop.description.strip().replace("\n", " ") + + # Handle required/not_null constraint + if prop.required or is_primary_key: + if supports_constraints: + column.setdefault("constraints", []).append({"type": "not_null"}) + else: + column["data_tests"].append("not_null") + + # Handle unique constraint + # For composite primary keys, uniqueness is handled at model level via unique_combination_of_columns + # Only add unique constraint for single-column primary keys or explicit unique fields + if prop.unique or (is_primary_key and is_single_pk): + if supports_constraints: + column.setdefault("constraints", []).append({"type": "unique"}) + else: + column["data_tests"].append("unique") + + enum_values = _get_enum_values(prop) + if enum_values and len(enum_values) > 0: + column["data_tests"].append({"accepted_values": {"values": enum_values}}) + + min_length = _get_logical_type_option(prop, "minLength") + max_length = _get_logical_type_option(prop, "maxLength") + if min_length is not None or max_length is not None: + length_test = {} + if min_length is not None: + length_test["min_value"] = min_length + if max_length is not None: + length_test["max_value"] = max_length + column["data_tests"].append({"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}) + + if prop.classification is not None: + column.setdefault("meta", {})["classification"] = prop.classification + if prop.tags is not None and len(prop.tags) > 0: + column.setdefault("tags", []).extend(prop.tags) + + pattern = _get_logical_type_option(prop, "pattern") + if pattern is not None: + # Beware, the data contract pattern is a regex, not a like pattern + column["data_tests"].append({"dbt_expectations.expect_column_values_to_match_regex": {"regex": pattern}}) + + minimum = _get_logical_type_option(prop, "minimum") + maximum = _get_logical_type_option(prop, "maximum") + exclusive_minimum = _get_logical_type_option(prop, "exclusiveMinimum") + exclusive_maximum = _get_logical_type_option(prop, "exclusiveMaximum") + + if (minimum is not None or maximum is not None) and exclusive_minimum is None and exclusive_maximum is None: + range_test = {} + if minimum is not None: + range_test["min_value"] = minimum + if maximum is not None: + range_test["max_value"] = maximum + column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test}) + elif (exclusive_minimum is not None or exclusive_maximum is not None) and minimum is None and maximum is None: + range_test = {} + if exclusive_minimum is not None: + range_test["min_value"] = exclusive_minimum + if exclusive_maximum is not None: + range_test["max_value"] = exclusive_maximum + range_test["strictly"] = True + column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test}) + else: + if minimum is not None: + column["data_tests"].append( + {"dbt_expectations.expect_column_values_to_be_between": {"min_value": minimum}} + ) + if maximum is not None: + column["data_tests"].append( + {"dbt_expectations.expect_column_values_to_be_between": {"max_value": maximum}} + ) + if exclusive_minimum is not None: + column["data_tests"].append( + { + "dbt_expectations.expect_column_values_to_be_between": { + "min_value": exclusive_minimum, + "strictly": True, + } + } + ) + if exclusive_maximum is not 
None: + column["data_tests"].append( + { + "dbt_expectations.expect_column_values_to_be_between": { + "max_value": exclusive_maximum, + "strictly": True, + } + } + ) + + # Handle references from relationships + references = None + if prop.relationships: + for rel in prop.relationships: + if hasattr(rel, 'to') and rel.to: + references = rel.to + break + if references is not None: + ref_source_name = data_contract.id + table_name, column_name = get_table_name_and_column_name(references) + if table_name is not None and column_name is not None: + column["data_tests"].append( + { + "relationships": { + "to": f"""source("{ref_source_name}", "{table_name}")""", + "field": f"{column_name}", + } + } + ) + + if not column["data_tests"]: + column.pop("data_tests") + + # TODO: all constraints + return column diff --git a/datacontract/export/dcs_exporter.py b/datacontract/export/dcs_exporter.py index 38c3c5a54..ce700d0f9 100644 --- a/datacontract/export/dcs_exporter.py +++ b/datacontract/export/dcs_exporter.py @@ -1,6 +1,294 @@ +"""DCS Exporter - Exports ODCS to Data Contract Specification format for backward compatibility.""" + +from typing import Optional + +from datacontract_specification.model import ( + Availability, + Contact, + DataContractSpecification, + Field, + Info, + Model, + Retention, + Terms, +) +from datacontract_specification.model import ( + Server as DCSServer, +) +from open_data_contract_standard.model import ( + OpenDataContractStandard, + SchemaObject, + SchemaProperty, +) +from open_data_contract_standard.model import ( + Server as ODCSServer, +) + from datacontract.export.exporter import Exporter class DcsExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - return data_contract.to_yaml() + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + return to_dcs_yaml(data_contract) + + +def to_dcs_yaml(data_contract: OpenDataContractStandard) -> str: + """Convert ODCS to DCS and return as YAML.""" + dcs = to_dcs(data_contract) + return dcs.to_yaml() + + +def to_dcs(data_contract: OpenDataContractStandard) -> DataContractSpecification: + """Convert an ODCS data contract to a DCS data contract.""" + # Basic info + info = Info( + title=data_contract.name, + version=data_contract.version, + description=data_contract.description, + status=data_contract.status, + ) + + # Team/owner + if data_contract.team: + info.owner = data_contract.team.name + + # Contact + if data_contract.support: + for support in data_contract.support: + if support.channel == "email" and support.url: + # Extract email from mailto: + email = support.url.replace("mailto:", "") if support.url.startswith("mailto:") else None + if email: + info.contact = Contact(email=email) + break + elif support.channel == "other" and support.url: + info.contact = Contact(url=support.url) + + # Create DCS spec + dcs = DataContractSpecification( + id=data_contract.id, + info=info, + ) + + # Convert servers + if data_contract.servers: + dcs.servers = {} + for odcs_server in data_contract.servers: + dcs_server = _convert_server(odcs_server) + dcs.servers[odcs_server.server] = dcs_server + + # Convert schema_ to models + if data_contract.schema_: + dcs.models = {} + for schema_obj in data_contract.schema_: + model = _convert_schema_to_model(schema_obj) + dcs.models[schema_obj.name] = model + + # Convert description to terms + if data_contract.description: + terms = Terms() + # Check for structured description + if hasattr(data_contract, 
'description') and data_contract.description: + if isinstance(data_contract.description, str): + terms.description = data_contract.description + elif hasattr(data_contract.description, 'purpose'): + terms.description = data_contract.description.purpose + terms.usage = getattr(data_contract.description, 'usage', None) + terms.limitations = getattr(data_contract.description, 'limitations', None) + dcs.terms = terms + + # Convert SLA properties to service levels + if data_contract.slaProperties: + for sla in data_contract.slaProperties: + if sla.property == "generalAvailability": + if dcs.servicelevels is None: + dcs.servicelevels = {} + dcs.servicelevels["availability"] = Availability(description=sla.value) + elif sla.property == "retention": + if dcs.servicelevels is None: + dcs.servicelevels = {} + dcs.servicelevels["retention"] = Retention(period=sla.value) + + return dcs + + +def _convert_server(odcs_server: ODCSServer) -> DCSServer: + """Convert an ODCS server to a DCS server.""" + dcs_server = DCSServer(type=odcs_server.type) + + # Copy common attributes + if odcs_server.environment: + dcs_server.environment = odcs_server.environment + if odcs_server.account: + dcs_server.account = odcs_server.account + if odcs_server.database: + dcs_server.database = odcs_server.database + if odcs_server.schema_: + dcs_server.schema_ = odcs_server.schema_ + if odcs_server.format: + dcs_server.format = odcs_server.format + if odcs_server.project: + dcs_server.project = odcs_server.project + if odcs_server.dataset: + dcs_server.dataset = odcs_server.dataset + if odcs_server.path: + dcs_server.path = odcs_server.path + if odcs_server.delimiter: + dcs_server.delimiter = odcs_server.delimiter + if odcs_server.endpointUrl: + dcs_server.endpointUrl = odcs_server.endpointUrl + if odcs_server.location: + dcs_server.location = odcs_server.location + if odcs_server.host: + dcs_server.host = odcs_server.host + if odcs_server.port: + dcs_server.port = odcs_server.port + if odcs_server.catalog: + dcs_server.catalog = odcs_server.catalog + if odcs_server.topic: + dcs_server.topic = odcs_server.topic + if odcs_server.http_path: + dcs_server.http_path = odcs_server.http_path + if odcs_server.driver: + dcs_server.driver = odcs_server.driver + + return dcs_server + + +def _convert_schema_to_model(schema_obj: SchemaObject) -> Model: + """Convert an ODCS SchemaObject to a DCS Model.""" + model = Model(type=schema_obj.physicalType or "table") + + if schema_obj.description: + model.description = schema_obj.description + + # Convert properties to fields + if schema_obj.properties: + model.fields = {} + for prop in schema_obj.properties: + field = _convert_property_to_field(prop) + model.fields[prop.name] = field + + return model + + +def _convert_property_to_field(prop: SchemaProperty) -> Field: + """Convert an ODCS SchemaProperty to a DCS Field.""" + # Convert logical type back to DCS type + dcs_type = _convert_logical_to_dcs_type(prop.logicalType, prop.physicalType) + + field = Field(type=dcs_type) + + if prop.description: + field.description = prop.description + if prop.required is not None: + field.required = prop.required + if prop.unique is not None: + field.unique = prop.unique + if prop.primaryKey: + field.primaryKey = prop.primaryKey + if prop.businessName: + field.title = prop.businessName + if prop.classification: + field.classification = prop.classification + if prop.tags: + field.tags = prop.tags + + # Convert logical type options + if prop.logicalTypeOptions: + opts = prop.logicalTypeOptions + if "minLength" in 
opts: + field.minLength = opts["minLength"] + if "maxLength" in opts: + field.maxLength = opts["maxLength"] + if "pattern" in opts: + field.pattern = opts["pattern"] + if "minimum" in opts: + field.minimum = opts["minimum"] + if "maximum" in opts: + field.maximum = opts["maximum"] + if "exclusiveMinimum" in opts: + field.exclusiveMinimum = opts["exclusiveMinimum"] + if "exclusiveMaximum" in opts: + field.exclusiveMaximum = opts["exclusiveMaximum"] + if "enum" in opts: + field.enum = opts["enum"] + if "format" in opts: + field.format = opts["format"] + + # Convert custom properties + if prop.customProperties: + field.config = {} + for cp in prop.customProperties: + if cp.property == "pii": + field.pii = cp.value + else: + field.config[cp.property] = cp.value + + # Convert nested properties (for object types) + if prop.properties: + field.fields = {} + for nested_prop in prop.properties: + nested_field = _convert_property_to_field(nested_prop) + field.fields[nested_prop.name] = nested_field + + # Convert items (for array types) + if prop.items: + field.items = _convert_property_to_field(prop.items) + + return field + + +def _convert_logical_to_dcs_type(logical_type: Optional[str], physical_type: Optional[str]) -> str: + """Convert ODCS logical type back to a DCS field type.""" + if physical_type: + # Use physical type if available (more specific) + pt = physical_type.lower() + # Common physical types + if pt in ["varchar", "text", "char", "nvarchar"]: + return "string" + if pt in ["int", "integer", "int32"]: + return "integer" + if pt in ["bigint", "int64", "long"]: + return "long" + if pt in ["float", "real", "float32"]: + return "float" + if pt in ["double", "float64"]: + return "double" + if pt in ["decimal", "numeric"]: + return "decimal" + if pt in ["timestamp", "datetime", "timestamptz"]: + return "timestamp" + if pt in ["date"]: + return "date" + if pt in ["bool", "boolean"]: + return "boolean" + if pt in ["bytes", "binary", "bytea"]: + return "bytes" + if pt in ["array"]: + return "array" + if pt in ["object", "struct", "record", "map", "json", "jsonb"]: + return "object" + # Return the physical type as-is if no mapping found + return physical_type + + if logical_type is None: + return "string" + + lt = logical_type.lower() + if lt == "string": + return "string" + elif lt == "integer": + return "integer" + elif lt == "number": + return "decimal" + elif lt == "boolean": + return "boolean" + elif lt == "date": + return "timestamp" + elif lt == "array": + return "array" + elif lt == "object": + return "object" + else: + return logical_type diff --git a/datacontract/export/dqx_converter.py b/datacontract/export/dqx_converter.py deleted file mode 100644 index 8ec395fbf..000000000 --- a/datacontract/export/dqx_converter.py +++ /dev/null @@ -1,121 +0,0 @@ -from typing import Any, Dict, List, Union - -import yaml - -from datacontract.export.exporter import Exporter, _check_models_for_export -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Quality - - -class DqxKeys: - CHECK = "check" - ARGUMENTS = "arguments" - SPECIFICATION = "specification" - COL_NAME = "column" - COL_NAMES = "for_each_column" - COLUMNS = "columns" - FUNCTION = "function" - - -class DqxExporter(Exporter): - """Exporter implementation for converting data contracts to DQX YAML file.""" - - def export( - self, - data_contract: DataContractSpecification, - model: Model, - server: str, - sql_server_type: str, - export_args: Dict[str, Any], - ) -> str: - """Exports a data 
contract to DQX format.""" - model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) - return to_dqx_yaml(model_value) - - -def to_dqx_yaml(model_value: Model) -> str: - """ - Converts the data contract's quality checks to DQX YAML format. - - Args: - model_value (Model): The data contract to convert. - - Returns: - str: YAML representation of the data contract's quality checks. - """ - extracted_rules = extract_quality_rules(model_value) - return yaml.dump(extracted_rules, sort_keys=False, allow_unicode=True, default_flow_style=False) - - -def process_quality_rule(rule: Quality, column_name: str) -> Dict[str, Any]: - """ - Processes a single quality rule by injecting the column path into its arguments if absent. - - Args: - rule (Quality): The quality rule to process. - column_name (str): The full path to the current column. - - Returns: - dict: The processed quality rule specification. - """ - rule_data = rule.model_extra - specification = rule_data[DqxKeys.SPECIFICATION] - check = specification[DqxKeys.CHECK] - - arguments = check.setdefault(DqxKeys.ARGUMENTS, {}) - - if DqxKeys.COL_NAME not in arguments and DqxKeys.COL_NAMES not in arguments and DqxKeys.COLUMNS not in arguments: - if check[DqxKeys.FUNCTION] not in ("is_unique", "foreign_key"): - arguments[DqxKeys.COL_NAME] = column_name - else: - arguments[DqxKeys.COLUMNS] = [column_name] - - return specification - - -def extract_quality_rules(data: Union[Model, Field, Quality], column_path: str = "") -> List[Dict[str, Any]]: - """ - Recursively extracts all quality rules from a data contract structure. - - Args: - data (Union[Model, Field, Quality]): The data contract model, field, or quality rule. - column_path (str, optional): The current path in the schema hierarchy. Defaults to "". - - Returns: - List[Dict[str, Any]]: A list of quality rule specifications. - """ - quality_rules = [] - - if isinstance(data, Quality): - return [process_quality_rule(data, column_path)] - - if isinstance(data, (Model, Field)): - for key, field in data.fields.items(): - current_path = build_column_path(column_path, key) - - if field.fields: - # Field is a struct-like object, recurse deeper - quality_rules.extend(extract_quality_rules(field, current_path)) - else: - # Process quality rules at leaf fields - for rule in field.quality: - quality_rules.append(process_quality_rule(rule, current_path)) - - # Process any quality rules attached directly to this level - for rule in data.quality: - quality_rules.append(process_quality_rule(rule, column_path)) - - return quality_rules - - -def build_column_path(current_path: str, key: str) -> str: - """ - Builds the full column path by concatenating parent path with current key. - - Args: - current_path (str): The current path prefix. - key (str): The current field's key. - - Returns: - str: The full path. 
- """ - return f"{current_path}.{key}" if current_path else key diff --git a/datacontract/export/dqx_exporter.py b/datacontract/export/dqx_exporter.py new file mode 100644 index 000000000..7f24f607e --- /dev/null +++ b/datacontract/export/dqx_exporter.py @@ -0,0 +1,163 @@ +from typing import Any, Dict, List, Union + +import yaml +from open_data_contract_standard.model import DataQuality, OpenDataContractStandard, SchemaObject, SchemaProperty + +from datacontract.export.exporter import Exporter, _check_schema_name_for_export + + +class DqxKeys: + CHECK = "check" + ARGUMENTS = "arguments" + COL_NAME = "column" + COL_NAMES = "for_each_column" + COLUMNS = "columns" + FUNCTION = "function" + + +class DqxExporter(Exporter): + """Exporter implementation for converting data contracts to DQX YAML file.""" + + def export( + self, + data_contract: OpenDataContractStandard, + schema_name: str, + server: str, + sql_server_type: str, + export_args: Dict[str, Any], + ) -> str: + """Exports a data contract to DQX format.""" + model_name, model_value = _check_schema_name_for_export(data_contract, schema_name, self.export_format) + return to_dqx_yaml(model_value) + + +def to_dqx_yaml(model_value: SchemaObject) -> str: + """ + Converts the data contract's quality checks to DQX YAML format. + + Args: + model_value (SchemaObject): The schema object to convert. + + Returns: + str: YAML representation of the data contract's quality checks. + """ + extracted_rules = extract_quality_rules(model_value) + return yaml.dump(extracted_rules, sort_keys=False, allow_unicode=True, default_flow_style=False) + + +def process_quality_rule(rule: DataQuality, column_name: str) -> Dict[str, Any]: + """ + Processes a single quality rule by injecting the column path into its arguments if absent. + + Args: + rule (DataQuality): The quality rule to process. + column_name (str): The full path to the current column. + + Returns: + dict: The processed quality rule specification. + """ + implementation = rule.implementation + check = implementation[DqxKeys.CHECK] + + if column_name: + arguments = check.setdefault(DqxKeys.ARGUMENTS, {}) + + if ( + DqxKeys.COL_NAME not in arguments + and DqxKeys.COL_NAMES not in arguments + and DqxKeys.COLUMNS not in arguments + ): + if check[DqxKeys.FUNCTION] not in ("is_unique", "foreign_key"): + arguments[DqxKeys.COL_NAME] = column_name + else: + arguments[DqxKeys.COLUMNS] = [column_name] + + return implementation + + +def extract_quality_rules(data: Union[SchemaObject, SchemaProperty, DataQuality], column_path: str = "") -> List[Dict[str, Any]]: + """ + Recursively extracts all quality rules from a data contract structure. + + Args: + data (Union[SchemaObject, SchemaProperty, DataQuality]): The schema object, property, or quality rule. + column_path (str, optional): The current path in the schema hierarchy. Defaults to "". + + Returns: + List[Dict[str, Any]]: A list of quality rule specifications. 
+ """ + quality_rules = [] + + if isinstance(data, DataQuality): + return [process_quality_rule(data, column_path)] + + if isinstance(data, SchemaObject): + # Process properties + if data.properties: + for prop in data.properties: + current_path = build_column_path(column_path, prop.name) + + if prop.properties: + # Property is a struct-like object, recurse deeper + quality_rules.extend(extract_quality_rules_from_property(prop, current_path)) + else: + # Process quality rules at leaf properties + if prop.quality: + for rule in prop.quality: + quality_rules.append(process_quality_rule(rule, current_path)) + + # Process any quality rules attached directly to the schema + if data.quality: + for rule in data.quality: + quality_rules.append(process_quality_rule(rule, column_path)) + + return quality_rules + + +def extract_quality_rules_from_property(prop: SchemaProperty, column_path: str) -> List[Dict[str, Any]]: + """ + Recursively extracts quality rules from a property and its nested properties. + + Args: + prop (SchemaProperty): The property to process. + column_path (str): The current path in the schema hierarchy. + + Returns: + List[Dict[str, Any]]: A list of quality rule specifications. + """ + quality_rules = [] + + # Process nested properties + if prop.properties: + for nested_prop in prop.properties: + nested_path = build_column_path(column_path, nested_prop.name) + + if nested_prop.properties: + # Recurse deeper + quality_rules.extend(extract_quality_rules_from_property(nested_prop, nested_path)) + else: + # Process quality rules at leaf properties + if nested_prop.quality: + for rule in nested_prop.quality: + quality_rules.append(process_quality_rule(rule, nested_path)) + + # Process quality rules at this property level + if prop.quality: + for rule in prop.quality: + quality_rules.append(process_quality_rule(rule, column_path)) + + return quality_rules + + +def build_column_path(current_path: str, key: str) -> str: + """ + Builds the full column path by concatenating parent path with current key. + + Args: + current_path (str): The current path prefix. + key (str): The current field's key. + + Returns: + str: The full path. 
+ """ + return f"{current_path}.{key}" if current_path else key diff --git a/datacontract/export/duckdb_type_converter.py b/datacontract/export/duckdb_type_converter.py index cf57398e1..2f1e6c825 100644 --- a/datacontract/export/duckdb_type_converter.py +++ b/datacontract/export/duckdb_type_converter.py @@ -1,12 +1,21 @@ -from typing import Dict +from typing import List, Optional -from datacontract.model.data_contract_specification import Field +from open_data_contract_standard.model import SchemaProperty + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property, checking both logical and physical type.""" + if prop.logicalType: + return prop.logicalType + if prop.physicalType: + return prop.physicalType + return None # https://duckdb.org/docs/data/csv/overview.html # ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR'] -def convert_to_duckdb_csv_type(field) -> None | str: - datacontract_type = field.type +def convert_to_duckdb_csv_type(prop: SchemaProperty) -> None | str: + datacontract_type = _get_type(prop) if datacontract_type is None: return "VARCHAR" if datacontract_type.lower() in ["string", "varchar", "text"]: @@ -20,8 +29,7 @@ def convert_to_duckdb_csv_type(field) -> None | str: if datacontract_type.lower() in ["time"]: return "TIME" if datacontract_type.lower() in ["number", "decimal", "numeric"]: - # precision and scale not supported by data contract - return "VARCHAR" + return "DOUBLE" if datacontract_type.lower() in ["float", "double"]: return "DOUBLE" if datacontract_type.lower() in ["integer", "int", "long", "bigint"]: @@ -41,17 +49,22 @@ def convert_to_duckdb_csv_type(field) -> None | str: return "VARCHAR" -def convert_to_duckdb_json_type(field: Field) -> None | str: - datacontract_type = field.type +def convert_to_duckdb_json_type(prop: SchemaProperty) -> None | str: + datacontract_type = _get_type(prop) if datacontract_type is None: return "VARCHAR" if datacontract_type.lower() in ["array"]: - return convert_to_duckdb_json_type(field.items) + "[]" # type: ignore + if prop.items: + return convert_to_duckdb_json_type(prop.items) + "[]" # type: ignore + return "VARCHAR[]" if datacontract_type.lower() in ["object", "record", "struct"]: - return convert_to_duckdb_object(field.fields) - return convert_to_duckdb_csv_type(field) + # If no properties are defined, treat as generic JSON + if prop.properties is None or len(prop.properties) == 0: + return "JSON" + return convert_to_duckdb_object(prop.properties) + return convert_to_duckdb_csv_type(prop) -def convert_to_duckdb_object(fields: Dict[str, Field]): - columns = [f'"{x[0]}" {convert_to_duckdb_json_type(x[1])}' for x in fields.items()] +def convert_to_duckdb_object(properties: List[SchemaProperty]): + columns = [f'"{prop.name}" {convert_to_duckdb_json_type(prop)}' for prop in properties] return f"STRUCT({', '.join(columns)})" diff --git a/datacontract/export/excel_exporter.py b/datacontract/export/excel_exporter.py index 102a7a18a..abf4f3458 100644 --- a/datacontract/export/excel_exporter.py +++ b/datacontract/export/excel_exporter.py @@ -17,7 +17,6 @@ from openpyxl.worksheet.worksheet import Worksheet from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification logger = logging.getLogger(__name__) @@ -32,12 +31,12 @@ class ExcelExporter(Exporter): def __init__(self, export_format): super().__init__(export_format) - def export(self, data_contract, model, server, sql_server_type, 
export_args) -> bytes: + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> bytes: """ Export data contract to Excel using the official ODCS template Args: - data_contract: DataContractSpecification or OpenDataContractStandard to export + data_contract: OpenDataContractStandard to export model: Model name (not used for Excel export) server: Server name (not used for Excel export) sql_server_type: SQL server type (not used for Excel export) @@ -46,13 +45,8 @@ def export(self, data_contract, model, server, sql_server_type, export_args) -> Returns: Excel file as bytes """ - # Convert to ODCS if needed - if isinstance(data_contract, DataContractSpecification): - # First convert DCS to ODCS format via YAML - yaml_content = data_contract.to_yaml() - odcs = OpenDataContractStandard.from_string(yaml_content) - else: - odcs = data_contract + # The data_contract is now always ODCS + odcs = data_contract # Get template from export_args if provided, otherwise use default template = export_args.get("template") if export_args else None @@ -283,7 +277,7 @@ def fill_single_property_template( sheet: Worksheet, row_index: int, prefix: str, property: SchemaProperty, header_map: dict ) -> int: """Fill a single property row using the template's column structure""" - property_name = f"{prefix}.{property.name}" if prefix else property.name + property_name = f"{prefix}{'.' + property.name if property.name else ''}" if prefix else property.name # Helper function to set cell value by header name def set_by_header(header_name: str, value: Any): @@ -307,13 +301,14 @@ def set_by_header(header_name: str, value: Any): set_by_header("Classification", property.classification) set_by_header("Tags", ",".join(property.tags) if property.tags else "") set_by_header( - "Example(s)", ",".join(property.examples) if property.examples else "" + "Example(s)", ",".join(map(str, property.examples)) if property.examples else "" ) # Note: using "Example(s)" as in template set_by_header("Encrypted Name", property.encryptedName) set_by_header( "Transform Sources", ",".join(property.transformSourceObjects) if property.transformSourceObjects else "" ) set_by_header("Transform Logic", property.transformLogic) + set_by_header("Critical Data Element Status", property.criticalDataElement) # Authoritative definitions if property.authoritativeDefinitions and len(property.authoritativeDefinitions) > 0: @@ -404,7 +399,7 @@ def fill_properties_quality( if not property.name: continue - full_property_name = f"{prefix}.{property.name}" if prefix else property.name + full_property_name = f"{prefix}{'.' 
+ property.name if property.name else ''}" if prefix else property.name # Add quality attributes for this property if property.quality: @@ -482,10 +477,14 @@ def get_threshold_operator(quality: DataQuality) -> Optional[str]: return "mustBeGreaterThan" elif hasattr(quality, "mustBeGreaterThanOrEqualTo") and quality.mustBeGreaterThanOrEqualTo is not None: return "mustBeGreaterThanOrEqualTo" + elif hasattr(quality, "mustBeGreaterOrEqualTo") and quality.mustBeGreaterOrEqualTo is not None: + return "mustBeGreaterOrEqualTo" elif hasattr(quality, "mustBeLessThan") and quality.mustBeLessThan is not None: return "mustBeLessThan" elif hasattr(quality, "mustBeLessThanOrEqualTo") and quality.mustBeLessThanOrEqualTo is not None: return "mustBeLessThanOrEqualTo" + elif hasattr(quality, "mustBeLessOrEqualTo") and quality.mustBeLessOrEqualTo is not None: + return "mustBeLessOrEqualTo" elif hasattr(quality, "mustBeBetween") and quality.mustBeBetween is not None: return "mustBeBetween" elif hasattr(quality, "mustNotBeBetween") and quality.mustNotBeBetween is not None: @@ -503,10 +502,14 @@ def get_threshold_value(quality: DataQuality) -> Optional[str]: return str(quality.mustBeGreaterThan) elif hasattr(quality, "mustBeGreaterThanOrEqualTo") and quality.mustBeGreaterThanOrEqualTo is not None: return str(quality.mustBeGreaterThanOrEqualTo) + elif hasattr(quality, "mustBeGreaterOrEqualTo") and quality.mustBeGreaterOrEqualTo is not None: + return str(quality.mustBeGreaterOrEqualTo) elif hasattr(quality, "mustBeLessThan") and quality.mustBeLessThan is not None: return str(quality.mustBeLessThan) elif hasattr(quality, "mustBeLessThanOrEqualTo") and quality.mustBeLessThanOrEqualTo is not None: return str(quality.mustBeLessThanOrEqualTo) + elif hasattr(quality, "mustBeLessOrEqualTo") and quality.mustBeLessOrEqualTo is not None: + return str(quality.mustBeLessOrEqualTo) elif hasattr(quality, "mustBeBetween") and quality.mustBeBetween is not None and len(quality.mustBeBetween) >= 2: return f"[{quality.mustBeBetween[0]}, {quality.mustBeBetween[1]}]" elif ( @@ -711,7 +714,60 @@ def fill_servers(workbook: Workbook, odcs: OpenDataContractStandard): set_cell_value_by_column_index(servers_sheet, "servers.databricks.catalog", index, server.catalog) set_cell_value_by_column_index(servers_sheet, "servers.databricks.host", index, server.host) set_cell_value_by_column_index(servers_sheet, "servers.databricks.schema", index, server.schema_) - # Add other server types as needed... 
+ elif server_type == "glue": + set_cell_value_by_column_index(servers_sheet, "servers.glue.account", index, server.account) + set_cell_value_by_column_index(servers_sheet, "servers.glue.database", index, server.database) + set_cell_value_by_column_index(servers_sheet, "servers.glue.format", index, server.format) + set_cell_value_by_column_index(servers_sheet, "servers.glue.location", index, server.location) + elif server_type == "kafka": + set_cell_value_by_column_index(servers_sheet, "servers.kafka.format", index, server.format) + set_cell_value_by_column_index(servers_sheet, "servers.kafka.host", index, server.host) + elif server_type == "oracle": + set_cell_value_by_column_index(servers_sheet, "servers.oracle.host", index, server.host) + set_cell_value_by_column_index(servers_sheet, "servers.oracle.port", index, server.port) + set_cell_value_by_column_index(servers_sheet, "servers.oracle.servicename", index, server.serviceName) + elif server_type == "postgres": + set_cell_value_by_column_index(servers_sheet, "servers.postgres.database", index, server.database) + set_cell_value_by_column_index(servers_sheet, "servers.postgres.host", index, server.host) + set_cell_value_by_column_index(servers_sheet, "servers.postgres.port", index, server.port) + set_cell_value_by_column_index(servers_sheet, "servers.postgres.schema", index, server.schema_) + elif server_type == "s3": + set_cell_value_by_column_index(servers_sheet, "servers.s3.delimiter", index, server.delimiter) + set_cell_value_by_column_index(servers_sheet, "servers.s3.endpointUrl", index, server.endpointUrl) + set_cell_value_by_column_index(servers_sheet, "servers.s3.format", index, server.format) + set_cell_value_by_column_index(servers_sheet, "servers.s3.location", index, server.location) + elif server_type == "snowflake": + set_cell_value_by_column_index(servers_sheet, "servers.snowflake.account", index, server.account) + set_cell_value_by_column_index(servers_sheet, "servers.snowflake.database", index, server.database) + set_cell_value_by_column_index(servers_sheet, "servers.snowflake.host", index, server.host) + set_cell_value_by_column_index(servers_sheet, "servers.snowflake.port", index, server.port) + set_cell_value_by_column_index(servers_sheet, "servers.snowflake.schema", index, server.schema_) + set_cell_value_by_column_index(servers_sheet, "servers.snowflake.warehouse", index, server.warehouse) + elif server_type == "sqlserver": + set_cell_value_by_column_index(servers_sheet, "servers.sqlserver.database", index, server.database) + set_cell_value_by_column_index(servers_sheet, "servers.sqlserver.host", index, server.host) + set_cell_value_by_column_index(servers_sheet, "servers.sqlserver.port", index, server.port) + set_cell_value_by_column_index(servers_sheet, "servers.sqlserver.schema", index, server.schema_) + else: + # Custom/unknown server type - export all possible fields + set_cell_value_by_column_index(servers_sheet, "servers.custom.account", index, server.account) + set_cell_value_by_column_index(servers_sheet, "servers.custom.catalog", index, server.catalog) + set_cell_value_by_column_index(servers_sheet, "servers.custom.database", index, server.database) + set_cell_value_by_column_index(servers_sheet, "servers.custom.dataset", index, server.dataset) + set_cell_value_by_column_index(servers_sheet, "servers.custom.delimiter", index, server.delimiter) + set_cell_value_by_column_index(servers_sheet, "servers.custom.endpointUrl", index, server.endpointUrl) + set_cell_value_by_column_index(servers_sheet, 
"servers.custom.format", index, server.format) + set_cell_value_by_column_index(servers_sheet, "servers.custom.host", index, server.host) + set_cell_value_by_column_index(servers_sheet, "servers.custom.location", index, server.location) + set_cell_value_by_column_index(servers_sheet, "servers.custom.path", index, server.path) + set_cell_value_by_column_index(servers_sheet, "servers.custom.port", index, server.port) + set_cell_value_by_column_index(servers_sheet, "servers.custom.project", index, server.project) + set_cell_value_by_column_index(servers_sheet, "servers.custom.schema", index, server.schema_) + set_cell_value_by_column_index(servers_sheet, "servers.custom.serviceName", index, server.serviceName) + set_cell_value_by_column_index(servers_sheet, "servers.custom.stagingDir", index, server.stagingDir) + set_cell_value_by_column_index(servers_sheet, "servers.custom.warehouse", index, server.warehouse) + set_cell_value_by_column_index(servers_sheet, "servers.custom.region", index, server.region) + set_cell_value_by_column_index(servers_sheet, "servers.custom.regionName", index, server.regionName) except Exception as e: logger.warning(f"Error filling servers: {e}") diff --git a/datacontract/export/exporter.py b/datacontract/export/exporter.py index 7be6755b4..c87e6592f 100644 --- a/datacontract/export/exporter.py +++ b/datacontract/export/exporter.py @@ -2,10 +2,7 @@ from abc import ABC, abstractmethod from enum import Enum -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Model, -) +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject class Exporter(ABC): @@ -13,7 +10,25 @@ def __init__(self, export_format) -> None: self.export_format = export_format @abstractmethod - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict | str: + def export( + self, + data_contract: OpenDataContractStandard, + schema_name: str, + server: str, + sql_server_type: str, + export_args: dict, + ) -> dict | str: + """Export a data contract to the target format. + + Args: + data_contract: The ODCS data contract to export. + schema_name: The name of the schema to export, or 'all' for all schemas. + server: The server name to use for export. + sql_server_type: The SQL server type for dialect-specific exports. + export_args: Additional export arguments. + + All exporters now accept OpenDataContractStandard (ODCS) format. + """ pass @@ -29,7 +44,6 @@ class ExportFormat(str, Enum): avro = "avro" protobuf = "protobuf" great_expectations = "great-expectations" - terraform = "terraform" avro_idl = "avro-idl" sql = "sql" sql_query = "sql-query" @@ -53,39 +67,47 @@ def get_supported_formats(cls): return list(map(lambda c: c.value, cls)) -def _check_models_for_export( - data_contract: DataContractSpecification, model: str, export_format: str -) -> typing.Tuple[str, Model]: - if data_contract.models is None: - raise RuntimeError(f"Export to {export_format} requires models in the data contract.") +def _check_schema_name_for_export( + data_contract: OpenDataContractStandard, schema_name: str, export_format: str +) -> typing.Tuple[str, SchemaObject]: + """Check and retrieve a schema from the data contract for export. - model_names = list(data_contract.models.keys()) + In ODCS, schemas are stored in schema_ as a list of SchemaObject. 
+ """ + if data_contract.schema_ is None or len(data_contract.schema_) == 0: + raise RuntimeError(f"Export to {export_format} requires schema in the data contract.") - if model == "all": - if len(data_contract.models.items()) != 1: + schema_names = [schema.name for schema in data_contract.schema_] + + if schema_name == "all": + if len(data_contract.schema_) != 1: raise RuntimeError( - f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. Available models: {model_names}" + f"Export to {export_format} requires a specific schema. Specify the schema via --schema-name. Available schemas: {schema_names}" ) - model_name, model_value = next(iter(data_contract.models.items())) + schema_obj = data_contract.schema_[0] + return schema_obj.name, schema_obj else: - model_name = model - model_value = data_contract.models.get(model_name) - if model_value is None: - raise RuntimeError(f"Model {model_name} not found in the data contract. Available models: {model_names}") + schema_obj = next((s for s in data_contract.schema_ if s.name == schema_name), None) + if schema_obj is None: + raise RuntimeError(f"Schema '{schema_name}' not found in the data contract. Available schemas: {schema_names}") - return model_name, model_value + return schema_name, schema_obj -def _determine_sql_server_type(data_contract: DataContractSpecification, sql_server_type: str, server: str = None): +def _determine_sql_server_type( + data_contract: OpenDataContractStandard, sql_server_type: str, server: str = None +) -> str: + """Determine the SQL server type from the data contract servers.""" if sql_server_type == "auto": if data_contract.servers is None or len(data_contract.servers) == 0: raise RuntimeError("Export with server_type='auto' requires servers in the data contract.") if server is None: - server_types = set([server.type for server in data_contract.servers.values()]) + server_types = set([s.type for s in data_contract.servers]) else: - server_types = {data_contract.servers[server].type} + server_obj = next((s for s in data_contract.servers if s.server == server), None) + server_types = {server_obj.type} if server_obj else set() if "snowflake" in server_types: return "snowflake" diff --git a/datacontract/export/exporter_factory.py b/datacontract/export/exporter_factory.py index 483127916..76a8cf97f 100644 --- a/datacontract/export/exporter_factory.py +++ b/datacontract/export/exporter_factory.py @@ -49,43 +49,43 @@ def load_module_class(module_path, class_name): exporter_factory.register_lazy_exporter( name=ExportFormat.avro, - module_path="datacontract.export.avro_converter", + module_path="datacontract.export.avro_exporter", class_name="AvroExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.avro_idl, - module_path="datacontract.export.avro_idl_converter", + module_path="datacontract.export.avro_idl_exporter", class_name="AvroIdlExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.bigquery, - module_path="datacontract.export.bigquery_converter", + module_path="datacontract.export.bigquery_exporter", class_name="BigQueryExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.data_caterer, - module_path="datacontract.export.data_caterer_converter", + module_path="datacontract.export.data_caterer_exporter", class_name="DataCatererExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.dbml, - module_path="datacontract.export.dbml_converter", + module_path="datacontract.export.dbml_exporter", class_name="DbmlExporter", ) 
exporter_factory.register_lazy_exporter( name=ExportFormat.rdf, - module_path="datacontract.export.rdf_converter", + module_path="datacontract.export.rdf_exporter", class_name="RdfExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.dbt, - module_path="datacontract.export.dbt_converter", + module_path="datacontract.export.dbt_exporter", class_name="DbtExporter", ) @@ -97,19 +97,19 @@ def load_module_class(module_path, class_name): exporter_factory.register_lazy_exporter( name=ExportFormat.dbt_sources, - module_path="datacontract.export.dbt_converter", + module_path="datacontract.export.dbt_exporter", class_name="DbtSourceExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.dbt_staging_sql, - module_path="datacontract.export.dbt_converter", + module_path="datacontract.export.dbt_exporter", class_name="DbtStageExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.jsonschema, - module_path="datacontract.export.jsonschema_converter", + module_path="datacontract.export.jsonschema_exporter", class_name="JsonSchemaExporter", ) @@ -121,13 +121,13 @@ def load_module_class(module_path, class_name): exporter_factory.register_lazy_exporter( name=ExportFormat.go, - module_path="datacontract.export.go_converter", + module_path="datacontract.export.go_exporter", class_name="GoExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.great_expectations, - module_path="datacontract.export.great_expectations_converter", + module_path="datacontract.export.great_expectations_exporter", class_name="GreatExpectationsExporter", ) @@ -139,49 +139,43 @@ def load_module_class(module_path, class_name): exporter_factory.register_lazy_exporter( name=ExportFormat.protobuf, - module_path="datacontract.export.protobuf_converter", + module_path="datacontract.export.protobuf_exporter", class_name="ProtoBufExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.pydantic_model, - module_path="datacontract.export.pydantic_converter", + module_path="datacontract.export.pydantic_exporter", class_name="PydanticExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.sodacl, - module_path="datacontract.export.sodacl_converter", + module_path="datacontract.export.sodacl_exporter", class_name="SodaExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.sql, - module_path="datacontract.export.sql_converter", + module_path="datacontract.export.sql_exporter", class_name="SqlExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.sql_query, - module_path="datacontract.export.sql_converter", + module_path="datacontract.export.sql_exporter", class_name="SqlQueryExporter", ) -exporter_factory.register_lazy_exporter( - name=ExportFormat.terraform, - module_path="datacontract.export.terraform_converter", - class_name="TerraformExporter", -) - exporter_factory.register_lazy_exporter( name=ExportFormat.spark, - module_path="datacontract.export.spark_converter", + module_path="datacontract.export.spark_exporter", class_name="SparkExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.sqlalchemy, - module_path="datacontract.export.sqlalchemy_converter", + module_path="datacontract.export.sqlalchemy_exporter", class_name="SQLAlchemyExporter", ) @@ -193,22 +187,22 @@ def load_module_class(module_path, class_name): exporter_factory.register_lazy_exporter( name=ExportFormat.markdown, - module_path="datacontract.export.markdown_converter", + module_path="datacontract.export.markdown_exporter", 
class_name="MarkdownExporter", ) exporter_factory.register_lazy_exporter( name=ExportFormat.dqx, - module_path="datacontract.export.dqx_converter", + module_path="datacontract.export.dqx_exporter", class_name="DqxExporter", ) exporter_factory.register_lazy_exporter( - name=ExportFormat.iceberg, module_path="datacontract.export.iceberg_converter", class_name="IcebergExporter" + name=ExportFormat.iceberg, module_path="datacontract.export.iceberg_exporter", class_name="IcebergExporter" ) exporter_factory.register_lazy_exporter( - name=ExportFormat.custom, module_path="datacontract.export.custom_converter", class_name="CustomExporter" + name=ExportFormat.custom, module_path="datacontract.export.custom_exporter", class_name="CustomExporter" ) exporter_factory.register_lazy_exporter( diff --git a/datacontract/export/go_converter.py b/datacontract/export/go_converter.py deleted file mode 100644 index 7572a70b6..000000000 --- a/datacontract/export/go_converter.py +++ /dev/null @@ -1,105 +0,0 @@ -import re -from typing import List - -import datacontract.model.data_contract_specification as spec -from datacontract.export.exporter import Exporter - - -class GoExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - return to_go_types(data_contract) - - -def to_go_types(contract: spec.DataContractSpecification) -> str: - result = "package main\n\n" - - for key in contract.models.keys(): - go_types = generate_go_type(contract.models[key], key) - for go_type in go_types: - # print(go_type + "\n\n") - result += f"\n{go_type}\n" - - return result - - -def python_type_to_go_type(py_type) -> str: - match py_type: - case "text": - return "string" - case "timestamp": - return "time.Time" - case "long": - return "int64" - case "int": - return "int" - case "float": - return "float64" - case "boolean": - return "bool" - case _: - return "interface{}" - - -def to_camel_case(snake_str) -> str: - return "".join(word.capitalize() for word in re.split(r"_|(? 
str: - go_type = "interface{}" - if field_info.fields: - nested_type_name = to_camel_case(f"{type_name}_{camel_case_name}") - nested_types[nested_type_name] = field_info.fields - go_type = nested_type_name - - match field_info.type: - case "array": - if field_info.items: - item_type = get_subtype(field_info.items, nested_types, type_name, camel_case_name + "Item") - go_type = f"[]{item_type}" - else: - go_type = "[]interface{}" - case "record": - if field_info.fields: - nested_type_name = to_camel_case(f"{type_name}_{camel_case_name}") - nested_types[nested_type_name] = field_info.fields - go_type = nested_type_name - else: - go_type = "interface{}" - case "object": - pass - case _: - go_type = field_info.type - - return go_type - - -def generate_go_type(model, model_name) -> List[str]: - go_types = [] - type_name = to_camel_case(model_name) - lines = [f"type {type_name} struct {{"] - - nested_types = {} - - for field_name, field_info in model.fields.items(): - go_type = python_type_to_go_type(field_info.type) - camel_case_name = to_camel_case(field_name) - json_tag = field_name if field_info.required else f"{field_name},omitempty" - avro_tag = field_name - - if go_type == "interface{}": - go_type = get_subtype(field_info, nested_types, type_name, camel_case_name) - - go_type = go_type if field_info.required else f"*{go_type}" - - lines.append( - f' {camel_case_name} {go_type} `json:"{json_tag}" avro:"{avro_tag}"` // {field_info.description}' - ) - lines.append("}") - go_types.append("\n".join(lines)) - - for nested_type_name, nested_fields in nested_types.items(): - nested_model = spec.Model(fields=nested_fields) - nested_go_types = generate_go_type(nested_model, nested_type_name) - go_types.extend(nested_go_types) - - return go_types diff --git a/datacontract/export/go_exporter.py b/datacontract/export/go_exporter.py new file mode 100644 index 000000000..4f3ad5ca1 --- /dev/null +++ b/datacontract/export/go_exporter.py @@ -0,0 +1,142 @@ +import re +from typing import List, Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty + +from datacontract.export.exporter import Exporter + + +class GoExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + return to_go_types(data_contract) + + +def to_go_types(contract: OpenDataContractStandard) -> str: + result = "package main\n\n" + + if contract.schema_: + for schema_obj in contract.schema_: + go_types = generate_go_type(schema_obj, schema_obj.name) + for go_type in go_types: + result += f"\n{go_type}\n" + + return result + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the logical type from a schema property.""" + return prop.logicalType + + +def python_type_to_go_type(prop_type: Optional[str], physical_type: Optional[str] = None) -> str: + """Convert ODCS type to Go type.""" + # Check physical type first for more specific mappings + if physical_type: + pt = physical_type.lower() + if pt in ["text", "varchar", "char", "nvarchar"]: + return "string" + if pt in ["timestamp", "datetime", "timestamp_tz", "timestamp_ntz"]: + return "time.Time" + if pt in ["long", "bigint", "int64"]: + return "int64" + if pt in ["int", "integer", "int32"]: + return "int" + if pt in ["float", "real", "float32"]: + return "float32" + if pt in ["double", "float64"]: + return "float64" + if pt in ["bool", "boolean"]: + return "bool" + + # Then check logical type + match prop_type: + case "string": + return "string" + case "date": + 
return "time.Time" + case "integer": + return "int64" + case "number": + return "float64" + case "boolean": + return "bool" + case _: + return "interface{}" + + +def to_camel_case(snake_str) -> str: + return "".join(word.capitalize() for word in re.split(r"_|(? str: + go_type = "interface{}" + if prop.properties: + nested_type_name = to_camel_case(f"{type_name}_{camel_case_name}") + nested_types[nested_type_name] = prop.properties + go_type = nested_type_name + + prop_type = _get_type(prop) + physical_type = prop.physicalType + + match prop_type: + case "array": + if prop.items: + item_type = get_subtype(prop.items, nested_types, type_name, camel_case_name + "Item") + go_type = f"[]{item_type}" + else: + go_type = "[]interface{}" + case "object": + if prop.properties: + nested_type_name = to_camel_case(f"{type_name}_{camel_case_name}") + nested_types[nested_type_name] = prop.properties + go_type = nested_type_name + else: + go_type = "interface{}" + case _: + if physical_type and physical_type.lower() in ["record", "struct"]: + if prop.properties: + nested_type_name = to_camel_case(f"{type_name}_{camel_case_name}") + nested_types[nested_type_name] = prop.properties + go_type = nested_type_name + elif prop_type: + go_type = python_type_to_go_type(prop_type, physical_type) + + return go_type + + +def generate_go_type(schema_obj: SchemaObject, model_name: str) -> List[str]: + go_types = [] + type_name = to_camel_case(model_name) + lines = [f"type {type_name} struct {{"] + + nested_types = {} + + if schema_obj.properties: + for prop in schema_obj.properties: + prop_type = _get_type(prop) + physical_type = prop.physicalType + go_type = python_type_to_go_type(prop_type, physical_type) + camel_case_name = to_camel_case(prop.name) + json_tag = prop.name if prop.required else f"{prop.name},omitempty" + avro_tag = prop.name + + if go_type == "interface{}": + go_type = get_subtype(prop, nested_types, type_name, camel_case_name) + + go_type = go_type if prop.required else f"*{go_type}" + description = prop.description or "" + comment = f" // {description}" if description else "" + + lines.append(f' {camel_case_name} {go_type} `json:"{json_tag}" avro:"{avro_tag}"`{comment}') + + lines.append("}") + go_types.append("\n".join(lines)) + + # Generate nested types + for nested_type_name, nested_properties in nested_types.items(): + # Create a temporary SchemaObject for nested types + nested_schema = SchemaObject(name=nested_type_name, properties=nested_properties) + nested_go_types = generate_go_type(nested_schema, nested_type_name) + go_types.extend(nested_go_types) + + return go_types diff --git a/datacontract/export/great_expectations_converter.py b/datacontract/export/great_expectations_exporter.py similarity index 57% rename from datacontract/export/great_expectations_converter.py rename to datacontract/export/great_expectations_exporter.py index 6e30094de..424fd7cfc 100644 --- a/datacontract/export/great_expectations_converter.py +++ b/datacontract/export/great_expectations_exporter.py @@ -6,19 +6,13 @@ import json from enum import Enum -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional -import yaml +from open_data_contract_standard.model import DataQuality, OpenDataContractStandard, SchemaProperty from datacontract.export.exporter import ( Exporter, - _check_models_for_export, -) -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - DeprecatedQuality, - Field, - Quality, + _check_schema_name_for_export, ) @@ -44,29 +38,57 @@ class 
GreatExpectationsExporter(Exporter): """ - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> str: """Exports a data contract model to a Great Expectations suite. Args: - data_contract (DataContractSpecification): The data contract specification. + data_contract (OpenDataContractStandard): The data contract specification. model (str): The model name to export. server (str): The server information. sql_server_type (str): Type of SQL server (e.g., "snowflake"). export_args (dict): Additional arguments for export, such as "suite_name" and "engine". Returns: - dict: A dictionary representation of the Great Expectations suite. + str: JSON string of the Great Expectations suite. """ expectation_suite_name = export_args.get("suite_name") engine = export_args.get("engine") - model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) + schema_name, _ = _check_schema_name_for_export(data_contract, schema_name, self.export_format) sql_server_type = "snowflake" if sql_server_type == "auto" else sql_server_type - return to_great_expectations(data_contract, model_name, expectation_suite_name, engine, sql_server_type) + return to_great_expectations(data_contract, schema_name, expectation_suite_name, engine, sql_server_type) + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property.""" + if prop.physicalType: + return prop.physicalType + if prop.logicalType: + return prop.logicalType + return None + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def _get_enum_from_custom_properties(prop: SchemaProperty) -> Optional[List[str]]: + """Get enum values from customProperties (used when importing from DCS).""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == "enum" and cp.value: + if isinstance(cp.value, list): + return cp.value + return json.loads(cp.value) + return None def to_great_expectations( - data_contract_spec: DataContractSpecification, - model_key: str, + odcs: OpenDataContractStandard, + schema_name: str, expectation_suite_name: str | None = None, engine: str | None = None, sql_server_type: str = "snowflake", @@ -74,8 +96,8 @@ def to_great_expectations( """Converts a data contract model to a Great Expectations suite. Args: - data_contract_spec (DataContractSpecification): The data contract specification. - model_key (str): The model key. + odcs (OpenDataContractStandard): The data contract. + schema_name (str): The schema/model name to export. expectation_suite_name (str | None): Optional suite name for the expectations. engine (str | None): Optional engine type (e.g., "pandas", "spark"). sql_server_type (str): The type of SQL server (default is "snowflake"). @@ -83,21 +105,24 @@ def to_great_expectations( Returns: str: JSON string of the Great Expectations suite. 
""" + # Find the schema by name + schema = next((s for s in odcs.schema_ if s.name == schema_name), None) + if schema is None: + raise RuntimeError(f"Schema '{schema_name}' not found in data contract.") + expectations = [] if not expectation_suite_name: - expectation_suite_name = "{model_key}.{contract_version}".format( - model_key=model_key, contract_version=data_contract_spec.info.version + expectation_suite_name = "{schema_name}.{contract_version}".format( + schema_name=schema_name, contract_version=odcs.version ) - model_value = data_contract_spec.models.get(model_key) - # Support for Deprecated Quality - quality_checks = get_deprecated_quality_checks(data_contract_spec.quality) + # Get quality checks from schema-level quality + if schema.quality: + expectations.extend(get_quality_checks(schema.quality)) - expectations.extend(get_quality_checks(model_value.quality)) + # Get expectations from model fields + expectations.extend(model_to_expectations(schema.properties or [], engine, sql_server_type)) - expectations.extend(model_to_expectations(model_value.fields, engine, sql_server_type)) - - expectations.extend(checks_to_expectations(quality_checks, model_key)) model_expectation_suite = to_suite(expectations, expectation_suite_name) return model_expectation_suite @@ -115,8 +140,7 @@ def to_suite(expectations: List[Dict[str, Any]], expectation_suite_name: str) -> """ return json.dumps( { - "data_asset_type": "null", - "expectation_suite_name": expectation_suite_name, + "name": expectation_suite_name, "expectations": expectations, "meta": {}, }, @@ -124,11 +148,11 @@ def to_suite(expectations: List[Dict[str, Any]], expectation_suite_name: str) -> ) -def model_to_expectations(fields: Dict[str, Field], engine: str | None, sql_server_type: str) -> List[Dict[str, Any]]: - """Converts model fields to a list of expectations. +def model_to_expectations(properties: List[SchemaProperty], engine: str | None, sql_server_type: str) -> List[Dict[str, Any]]: + """Converts model properties to a list of expectations. Args: - fields (Dict[str, Field]): Dictionary of model fields. + properties (List[SchemaProperty]): List of model properties. engine (str | None): Engine type (e.g., "pandas", "spark"). sql_server_type (str): SQL server type. @@ -136,16 +160,17 @@ def model_to_expectations(fields: Dict[str, Field], engine: str | None, sql_serv List[Dict[str, Any]]: List of expectations. """ expectations = [] - add_column_order_exp(fields, expectations) - for field_name, field in fields.items(): - add_field_expectations(field_name, field, expectations, engine, sql_server_type) - expectations.extend(get_quality_checks(field.quality, field_name)) + add_column_order_exp(properties, expectations) + for prop in properties: + add_field_expectations(prop.name, prop, expectations, engine, sql_server_type) + if prop.quality: + expectations.extend(get_quality_checks(prop.quality, prop.name)) return expectations def add_field_expectations( - field_name, - field: Field, + field_name: str, + prop: SchemaProperty, expectations: List[Dict[str, Any]], engine: str | None, sql_server_type: str, @@ -154,7 +179,7 @@ def add_field_expectations( Args: field_name (str): The name of the field. - field (Field): The field object. + prop (SchemaProperty): The property object. expectations (List[Dict[str, Any]]): The expectations list to update. engine (str | None): Engine type (e.g., "pandas", "spark"). sql_server_type (str): SQL server type. 
@@ -162,45 +187,55 @@ def add_field_expectations( Returns: List[Dict[str, Any]]: Updated list of expectations. """ - if field.type is not None: + prop_type = _get_type(prop) + if prop_type is not None: if engine == GreatExpectationsEngine.spark.value: - from datacontract.export.spark_converter import to_spark_data_type + from datacontract.export.spark_exporter import to_spark_data_type - field_type = to_spark_data_type(field).__class__.__name__ + field_type = to_spark_data_type(prop).__class__.__name__ elif engine == GreatExpectationsEngine.pandas.value: from datacontract.export.pandas_type_converter import convert_to_pandas_type - field_type = convert_to_pandas_type(field) + field_type = convert_to_pandas_type(prop) elif engine == GreatExpectationsEngine.sql.value: from datacontract.export.sql_type_converter import convert_to_sql_type - field_type = convert_to_sql_type(field, sql_server_type) + field_type = convert_to_sql_type(prop, sql_server_type) else: - field_type = field.type + field_type = prop_type expectations.append(to_column_types_exp(field_name, field_type)) - if field.unique: + if prop.unique: expectations.append(to_column_unique_exp(field_name)) - if field.maxLength is not None or field.minLength is not None: - expectations.append(to_column_length_exp(field_name, field.minLength, field.maxLength)) - if field.minimum is not None or field.maximum is not None: - expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum)) - if field.enum is not None and len(field.enum) != 0: - expectations.append(to_column_enum_exp(field_name, field.enum)) + + min_length = _get_logical_type_option(prop, "minLength") + max_length = _get_logical_type_option(prop, "maxLength") + if min_length is not None or max_length is not None: + expectations.append(to_column_length_exp(field_name, min_length, max_length)) + + minimum = _get_logical_type_option(prop, "minimum") + maximum = _get_logical_type_option(prop, "maximum") + if minimum is not None or maximum is not None: + expectations.append(to_column_min_max_exp(field_name, minimum, maximum)) + + enum_values = _get_logical_type_option(prop, "enum") or _get_enum_from_custom_properties(prop) + if enum_values is not None and len(enum_values) != 0: + expectations.append(to_column_enum_exp(field_name, enum_values)) return expectations -def add_column_order_exp(fields: Dict[str, Field], expectations: List[Dict[str, Any]]): +def add_column_order_exp(properties: List[SchemaProperty], expectations: List[Dict[str, Any]]): """Adds expectation for column ordering. Args: - fields (Dict[str, Field]): Dictionary of fields. + properties (List[SchemaProperty]): List of properties. expectations (List[Dict[str, Any]]): The expectations list to update. """ + column_names = [prop.name for prop in properties] expectations.append( { - "expectation_type": "expect_table_columns_to_match_ordered_list", - "kwargs": {"column_list": list(fields.keys())}, + "type": "expect_table_columns_to_match_ordered_list", + "kwargs": {"column_list": column_names}, "meta": {}, } ) @@ -217,7 +252,7 @@ def to_column_types_exp(field_name, field_type) -> Dict[str, Any]: Dict[str, Any]: Column type expectation. """ return { - "expectation_type": "expect_column_values_to_be_of_type", + "type": "expect_column_values_to_be_of_type", "kwargs": {"column": field_name, "type_": field_type}, "meta": {}, } @@ -233,7 +268,7 @@ def to_column_unique_exp(field_name) -> Dict[str, Any]: Dict[str, Any]: Column uniqueness expectation. 
""" return { - "expectation_type": "expect_column_values_to_be_unique", + "type": "expect_column_values_to_be_unique", "kwargs": {"column": field_name}, "meta": {}, } @@ -251,7 +286,7 @@ def to_column_length_exp(field_name, min_length, max_length) -> Dict[str, Any]: Dict[str, Any]: Column length expectation. """ return { - "expectation_type": "expect_column_value_lengths_to_be_between", + "type": "expect_column_value_lengths_to_be_between", "kwargs": { "column": field_name, "min_value": min_length, @@ -273,7 +308,7 @@ def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]: Dict[str, Any]: Column min-max value expectation. """ return { - "expectation_type": "expect_column_values_to_be_between", + "type": "expect_column_values_to_be_between", "kwargs": {"column": field_name, "min_value": minimum, "max_value": maximum}, "meta": {}, } @@ -290,73 +325,27 @@ def to_column_enum_exp(field_name, enum_list: List[str]) -> Dict[str, Any]: Dict[str, Any]: Column value in set expectation. """ return { - "expectation_type": "expect_column_values_to_be_in_set", + "type": "expect_column_values_to_be_in_set", "kwargs": {"column": field_name, "value_set": enum_list}, "meta": {}, } -def get_deprecated_quality_checks(quality: DeprecatedQuality) -> Dict[str, Any]: +def get_quality_checks(qualities: List[DataQuality], field_name: str | None = None) -> List[Dict[str, Any]]: """Retrieves quality checks defined in a data contract. Args: - quality (Quality): Quality object from the data contract. - - Returns: - Dict[str, Any]: Dictionary of quality checks. - """ - if quality is None: - return {} - if quality.type is None: - return {} - if quality.type.lower() != "great-expectations": - return {} - if isinstance(quality.specification, str): - quality_specification = yaml.safe_load(quality.specification) - else: - quality_specification = quality.specification - return quality_specification - - -def get_quality_checks(qualities: List[Quality], field_name: str | None = None) -> List[Dict[str, Any]]: - """Retrieves quality checks defined in a data contract. - - Args: - qualities (List[Quality]): List of quality object from the model specification. + qualities (List[DataQuality]): List of quality object from the model specification. field_name (str | None): field name if the quality list is attached to a specific field Returns: - Dict[str, Any]: Dictionary of quality checks. + List[Dict[str, Any]]: List of quality check specifications. """ quality_specification = [] for quality in qualities: - if quality is not None and quality.engine is not None and quality.engine.lower() == "great-expectations": + if quality is not None and quality.engine is not None and quality.engine.lower() in ("great-expectations", "greatexpectations"): ge_expectation = quality.implementation - if field_name is not None: + if field_name is not None and isinstance(ge_expectation, dict): ge_expectation["column"] = field_name quality_specification.append(ge_expectation) return quality_specification - - -def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]: - """Converts quality checks to a list of expectations. - - Args: - quality_checks (Dict[str, Any]): Dictionary of quality checks by model. - model_key (str): The model key. - - Returns: - List[Dict[str, Any]]: List of expectations for the model. 
- """ - if quality_checks is None or model_key not in quality_checks: - return [] - - model_quality_checks = quality_checks[model_key] - - if model_quality_checks is None: - return [] - - if isinstance(model_quality_checks, str): - expectation_list = json.loads(model_quality_checks) - return expectation_list - return [] diff --git a/datacontract/export/html_exporter.py b/datacontract/export/html_exporter.py index 53b26b28d..2533c501c 100644 --- a/datacontract/export/html_exporter.py +++ b/datacontract/export/html_exporter.py @@ -4,21 +4,19 @@ import jinja_partials import pytz -import yaml from jinja2 import Environment, PackageLoader, select_autoescape from open_data_contract_standard.model import OpenDataContractStandard from datacontract.export.exporter import Exporter from datacontract.export.mermaid_exporter import to_mermaid -from datacontract.model.data_contract_specification import DataContractSpecification class HtmlExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: return to_html(data_contract) -def to_html(data_contract_spec: DataContractSpecification | OpenDataContractStandard) -> str: +def to_html(data_contract: OpenDataContractStandard) -> str: # Load templates from templates folder package_loader = PackageLoader("datacontract", "templates") env = Environment( @@ -31,37 +29,21 @@ def to_html(data_contract_spec: DataContractSpecification | OpenDataContractStan # Set up for partials jinja_partials.register_environment(env) - # Load the required template - # needs to be included in /MANIFEST.in - template_file = "datacontract.html" - if isinstance(data_contract_spec, OpenDataContractStandard): - template_file = "datacontract_odcs.html" - + # Load the ODCS template + template_file = "datacontract_odcs.html" template = env.get_template(template_file) style_content, _, _ = package_loader.get_source(env, "style/output.css") - quality_specification = None - if isinstance(data_contract_spec, DataContractSpecification): - if data_contract_spec.quality is not None and isinstance(data_contract_spec.quality.specification, str): - quality_specification = data_contract_spec.quality.specification - elif data_contract_spec.quality is not None and isinstance(data_contract_spec.quality.specification, object): - if data_contract_spec.quality.type == "great-expectations": - quality_specification = yaml.dump( - data_contract_spec.quality.specification, sort_keys=False, default_style="|" - ) - else: - quality_specification = yaml.dump(data_contract_spec.quality.specification, sort_keys=False) - - datacontract_yaml = data_contract_spec.to_yaml() + datacontract_yaml = data_contract.to_yaml() # Get the mermaid diagram - mermaid_diagram = to_mermaid(data_contract_spec) + mermaid_diagram = to_mermaid(data_contract) # Render the template with necessary data html_string = template.render( - datacontract=data_contract_spec, - quality_specification=quality_specification, + datacontract=data_contract, + quality_specification=None, # ODCS doesn't have the deprecated quality.specification style=style_content, datacontract_yaml=datacontract_yaml, formatted_date=_formatted_date(), diff --git a/datacontract/export/iceberg_converter.py b/datacontract/export/iceberg_converter.py deleted file mode 100644 index a343ec807..000000000 --- a/datacontract/export/iceberg_converter.py +++ /dev/null @@ -1,188 +0,0 @@ -from pyiceberg import types -from pyiceberg.schema import 
Schema, assign_fresh_schema_ids - -from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Field, - Model, -) - - -class IcebergExporter(Exporter): - """ - Exporter class for exporting data contracts to Iceberg schemas. - """ - - def export( - self, - data_contract: DataContractSpecification, - model, - server, - sql_server_type, - export_args, - ): - """ - Export the given data contract model to an Iceberg schema. - - Args: - data_contract (DataContractSpecification): The data contract specification. - model: The model to export, currently just supports one model. - server: Not used in this implementation. - sql_server_type: Not used in this implementation. - export_args: Additional arguments for export. - - Returns: - str: A string representation of the Iceberg json schema. - """ - - return to_iceberg(data_contract, model) - - -def to_iceberg(contract: DataContractSpecification, model: str) -> str: - """ - Converts a DataContractSpecification into an Iceberg json schema string. JSON string follows https://iceberg.apache.org/spec/#appendix-c-json-serialization. - - Args: - contract (DataContractSpecification): The data contract specification containing models. - model: The model to export, currently just supports one model. - - Returns: - str: A string representation of the Iceberg json schema. - """ - if model is None or model == "all": - if len(contract.models.items()) != 1: - # Iceberg doesn't have a way to combine multiple models into a single schema, an alternative would be to export json lines - raise Exception(f"Can only output one model at a time, found {len(contract.models.items())} models") - for model_name, model in contract.models.items(): - schema = to_iceberg_schema(model) - else: - if model not in contract.models: - raise Exception(f"model {model} not found in contract") - schema = to_iceberg_schema(contract.models[model]) - - return schema.model_dump_json() - - -def to_iceberg_schema(model: Model) -> types.StructType: - """ - Convert a model to a Iceberg schema. - - Args: - model (Model): The model to convert. - - Returns: - types.StructType: The corresponding Iceberg schema. - """ - iceberg_fields = [] - primary_keys = [] - for field_name, spec_field in model.fields.items(): - iceberg_field = make_field(field_name, spec_field) - iceberg_fields.append(iceberg_field) - - if spec_field.primaryKey: - primary_keys.append(iceberg_field.name) - - schema = Schema(*iceberg_fields) - - # apply non-0 field IDs so we can set the identifier fields for the schema - schema = assign_fresh_schema_ids(schema) - for field in schema.fields: - if field.name in primary_keys: - schema.identifier_field_ids.append(field.field_id) - - return schema - - -def make_field(field_name, field): - field_type = get_field_type(field) - - # Note: might want to re-populate field_id from config['icebergFieldId'] if it exists, however, it gets - # complicated since field_ids impact the list and map element_ids, and the importer is not keeping track of those. - # Even if IDs are re-constituted, it seems like the SDK code would still reset them before any operation against a catalog, - # so it's likely not worth it. - - # Note 2: field_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values. 
- # also, the Iceberg sdk catalog code will re-set the fieldIDs prior to executing any table operations on the schema - # ref: https://github.com/apache/iceberg-python/pull/1072 - return types.NestedField(field_id=0, name=field_name, field_type=field_type, required=field.required is True) - - -def make_list(item): - field_type = get_field_type(item) - - # element_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field) - return types.ListType(element_id=0, element_type=field_type, element_required=item.required is True) - - -def make_map(field): - key_type = get_field_type(field.keys) - value_type = get_field_type(field.values) - - # key_id and value_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field) - return types.MapType( - key_id=0, key_type=key_type, value_id=0, value_type=value_type, value_required=field.values.required is True - ) - - -def to_struct_type(fields: dict[str, Field]) -> types.StructType: - """ - Convert a dictionary of fields to a Iceberg StructType. - - Args: - fields (dict[str, Field]): The fields to convert. - - Returns: - types.StructType: The corresponding Iceberg StructType. - """ - struct_fields = [] - for field_name, field in fields.items(): - struct_field = make_field(field_name, field) - struct_fields.append(struct_field) - return types.StructType(*struct_fields) - - -def get_field_type(field: Field) -> types.IcebergType: - """ - Convert a field to a Iceberg IcebergType. - - Args: - field (Field): The field to convert. - - Returns: - types.IcebergType: The corresponding Iceberg IcebergType. - """ - field_type = field.type - if field_type is None or field_type in ["null"]: - return types.NullType() - if field_type == "array": - return make_list(field.items) - if field_type == "map": - return make_map(field) - if field_type in ["object", "record", "struct"]: - return to_struct_type(field.fields) - if field_type in ["string", "varchar", "text"]: - return types.StringType() - if field_type in ["number", "decimal", "numeric"]: - precision = field.precision if field.precision is not None else 38 - scale = field.scale if field.scale is not None else 0 - return types.DecimalType(precision=precision, scale=scale) - if field_type in ["integer", "int"]: - return types.IntegerType() - if field_type in ["bigint", "long"]: - return types.LongType() - if field_type == "float": - return types.FloatType() - if field_type == "double": - return types.DoubleType() - if field_type == "boolean": - return types.BooleanType() - if field_type in ["timestamp", "timestamp_tz"]: - return types.TimestamptzType() - if field_type == "timestamp_ntz": - return types.TimestampType() - if field_type == "date": - return types.DateType() - if field_type == "bytes": - return types.BinaryType() - return types.BinaryType() diff --git a/datacontract/export/iceberg_exporter.py b/datacontract/export/iceberg_exporter.py new file mode 100644 index 000000000..863c3cb65 --- /dev/null +++ b/datacontract/export/iceberg_exporter.py @@ -0,0 +1,315 @@ +from typing import List, Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty +from pyiceberg import types +from pyiceberg.schema import Schema, assign_fresh_schema_ids + +from datacontract.export.exporter import Exporter + + +class IcebergExporter(Exporter): + """ + Exporter class for exporting data contracts to Iceberg schemas. 
+ """ + + def export( + self, + data_contract: OpenDataContractStandard, + schema_name, + server, + sql_server_type, + export_args, + ): + """ + Export the given data contract to an Iceberg schema. + + Args: + data_contract (OpenDataContractStandard): The data contract specification. + schema_name: The name of the schema to export, or 'all' for all schemas. + server: Not used in this implementation. + sql_server_type: Not used in this implementation. + export_args: Additional arguments for export. + + Returns: + str: A string representation of the Iceberg json schema. + """ + + return to_iceberg(data_contract, schema_name) + + +def to_iceberg(contract: OpenDataContractStandard, model: str) -> str: + """ + Converts an OpenDataContractStandard into an Iceberg json schema string. JSON string follows https://iceberg.apache.org/spec/#appendix-c-json-serialization. + + Args: + contract (OpenDataContractStandard): The data contract specification containing models. + model: The model to export, currently just supports one model. + + Returns: + str: A string representation of the Iceberg json schema. + """ + if not contract.schema_: + raise Exception("No schema found in contract") + + if model is None or model == "all": + if len(contract.schema_) != 1: + # Iceberg doesn't have a way to combine multiple models into a single schema, an alternative would be to export json lines + raise Exception(f"Can only output one model at a time, found {len(contract.schema_)} models") + schema_obj = contract.schema_[0] + schema = to_iceberg_schema(schema_obj) + else: + # Find the specific schema by name + schema_obj = next((s for s in contract.schema_ if s.name == model), None) + if schema_obj is None: + raise Exception(f"model {model} not found in contract") + schema = to_iceberg_schema(schema_obj) + + return schema.model_dump_json() + + +def to_iceberg_schema(schema_obj: SchemaObject) -> types.StructType: + """ + Convert a schema object to an Iceberg schema. + + Args: + schema_obj (SchemaObject): The schema object to convert. + + Returns: + types.StructType: The corresponding Iceberg schema. 
+ """ + iceberg_fields = [] + primary_keys = [] + + if schema_obj.properties: + for prop in schema_obj.properties: + iceberg_field = make_field(prop.name, prop) + iceberg_fields.append(iceberg_field) + + if prop.primaryKey: + primary_keys.append(iceberg_field.name) + + schema = Schema(*iceberg_fields) + + # apply non-0 field IDs so we can set the identifier fields for the schema + schema = assign_fresh_schema_ids(schema) + for field in schema.fields: + if field.name in primary_keys: + schema.identifier_field_ids.append(field.field_id) + + return schema + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the logical type from a schema property.""" + return prop.logicalType + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def _get_custom_property_value(prop: SchemaProperty, key: str): + """Get a custom property value.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key and cp.value is not None: + return cp.value + return None + + +def make_field(field_name: str, prop: SchemaProperty) -> types.NestedField: + field_type = get_field_type(prop) + + # Note: might want to re-populate field_id from config['icebergFieldId'] if it exists, however, it gets + # complicated since field_ids impact the list and map element_ids, and the importer is not keeping track of those. + # Even if IDs are re-constituted, it seems like the SDK code would still reset them before any operation against a catalog, + # so it's likely not worth it. + + # Note 2: field_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values. 
+ # also, the Iceberg sdk catalog code will re-set the fieldIDs prior to executing any table operations on the schema + # ref: https://github.com/apache/iceberg-python/pull/1072 + return types.NestedField(field_id=0, name=field_name, field_type=field_type, required=prop.required is True) + + +def make_list(item: SchemaProperty) -> types.ListType: + field_type = get_field_type(item) + + # element_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field) + return types.ListType(element_id=0, element_type=field_type, element_required=item.required is True) + + +def _type_str_to_iceberg_type(type_str: str) -> types.IcebergType: + """Convert a type string to an Iceberg type.""" + if not type_str: + return types.StringType() + t = type_str.lower() + if t == "string": + return types.StringType() + elif t in ["integer", "int"]: + return types.IntegerType() + elif t in ["long", "bigint"]: + return types.LongType() + elif t == "number": + return types.DecimalType(precision=38, scale=0) + elif t in ["float"]: + return types.FloatType() + elif t in ["double"]: + return types.DoubleType() + elif t == "boolean": + return types.BooleanType() + elif t == "date": + return types.DateType() + elif t == "timestamp": + return types.TimestamptzType() + elif t in ["bytes", "binary"]: + return types.BinaryType() + else: + return types.StringType() + + +def _get_custom_prop(prop: SchemaProperty, key: str) -> Optional[str]: + """Get a custom property value from a SchemaProperty.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def make_map(prop: SchemaProperty) -> types.MapType: + # For ODCS, read key/value types from customProperties + # Default to string -> string if not specified + key_type = types.StringType() + value_type = types.StringType() + + key_type_str = _get_custom_prop(prop, "mapKeyType") + value_type_str = _get_custom_prop(prop, "mapValueType") + value_physical_type = _get_custom_prop(prop, "mapValuePhysicalType") + value_required_str = _get_custom_prop(prop, "mapValueRequired") + value_required = value_required_str == "true" if value_required_str else False + + if key_type_str: + key_type = _type_str_to_iceberg_type(key_type_str) + + # Handle nested map in value type + if value_physical_type == "map": + nested_key_type = _get_custom_prop(prop, "mapNestedKeyType") or "string" + nested_value_type = _get_custom_prop(prop, "mapNestedValueType") or "string" + nested_value_required_str = _get_custom_prop(prop, "mapNestedValueRequired") + nested_value_required = nested_value_required_str == "true" if nested_value_required_str else True + value_type = types.MapType( + key_id=0, + key_type=_type_str_to_iceberg_type(nested_key_type), + value_id=0, + value_type=_type_str_to_iceberg_type(nested_value_type), + value_required=nested_value_required + ) + elif value_type_str: + value_type = _type_str_to_iceberg_type(value_type_str) + + # key_id and value_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field) + return types.MapType(key_id=0, key_type=key_type, value_id=0, value_type=value_type, value_required=value_required) + + +def to_struct_type(properties: List[SchemaProperty]) -> types.StructType: + """ + Convert a list of properties to an Iceberg StructType. + + Args: + properties (List[SchemaProperty]): The properties to convert. + + Returns: + types.StructType: The corresponding Iceberg StructType. 
+ """ + struct_fields = [] + for prop in properties: + struct_field = make_field(prop.name, prop) + struct_fields.append(struct_field) + return types.StructType(*struct_fields) + + +def get_field_type(prop: SchemaProperty) -> types.IcebergType: + """ + Convert a property to an Iceberg IcebergType. + + Args: + prop (SchemaProperty): The property to convert. + + Returns: + types.IcebergType: The corresponding Iceberg IcebergType. + """ + logical_type = _get_type(prop) + physical_type = prop.physicalType.lower() if prop.physicalType else None + + # Handle null type + if logical_type is None and physical_type is None: + return types.NullType() + if physical_type == "null": + return types.NullType() + + # Handle array type + if logical_type == "array": + if prop.items: + return make_list(prop.items) + return types.ListType(element_id=0, element_type=types.StringType(), element_required=False) + + # Handle map type + if physical_type == "map": + return make_map(prop) + + # Handle object/struct type + if logical_type == "object" or physical_type in ["object", "record", "struct"]: + if prop.properties: + return to_struct_type(prop.properties) + return types.StructType() + + # Check physical type first for specific SQL types + if physical_type: + if physical_type in ["string", "varchar", "text", "char", "nvarchar"]: + return types.StringType() + if physical_type in ["decimal", "numeric"]: + precision = _get_custom_property_value(prop, "precision") or 38 + scale = _get_custom_property_value(prop, "scale") or 0 + return types.DecimalType(precision=precision, scale=scale) + if physical_type in ["integer", "int", "int32"]: + return types.IntegerType() + if physical_type in ["bigint", "long", "int64"]: + return types.LongType() + if physical_type in ["float", "real", "float32"]: + return types.FloatType() + if physical_type in ["double", "float64"]: + return types.DoubleType() + if physical_type in ["boolean", "bool"]: + return types.BooleanType() + if physical_type in ["timestamp", "timestamp_tz"]: + return types.TimestamptzType() + if physical_type == "timestamp_ntz": + return types.TimestampType() + if physical_type == "date": + return types.DateType() + if physical_type in ["bytes", "binary", "bytea"]: + return types.BinaryType() + + # Fall back to logical type + match logical_type: + case "string": + return types.StringType() + case "number": + precision = _get_custom_property_value(prop, "precision") or 38 + scale = _get_custom_property_value(prop, "scale") or 0 + return types.DecimalType(precision=precision, scale=scale) + case "integer": + return types.LongType() + case "boolean": + return types.BooleanType() + case "timestamp": + return types.TimestamptzType() + case "date": + return types.DateType() + case _: + return types.BinaryType() diff --git a/datacontract/export/jsonschema_converter.py b/datacontract/export/jsonschema_converter.py deleted file mode 100644 index 80d9c08e2..000000000 --- a/datacontract/export/jsonschema_converter.py +++ /dev/null @@ -1,157 +0,0 @@ -import json -from typing import Dict - -from datacontract.export.exporter import Exporter, _check_models_for_export -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model - - -class JsonSchemaExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) - return to_jsonschema_json(model_name, model_value) - - -def 
to_jsonschemas(data_contract_spec: DataContractSpecification): - jsonschmemas = {} - for model_key, model_value in data_contract_spec.models.items(): - jsonschema = to_jsonschema(model_key, model_value) - jsonschmemas[model_key] = jsonschema - return jsonschmemas - - -def to_jsonschema_json(model_key, model_value: Model) -> str: - jsonschema = to_jsonschema(model_key, model_value) - return json.dumps(jsonschema, indent=2) - - -def to_properties(fields: Dict[str, Field]) -> dict: - properties = {} - for field_name, field in fields.items(): - properties[field_name] = to_property(field) - return properties - - -def to_property(field: Field) -> dict: - property = {} - json_type, json_format = convert_type_format(field.type, field.format) - if json_type is not None: - if not field.required: - """ - From: https://json-schema.org/understanding-json-schema/reference/type - The type keyword may either be a string or an array: - - If it's a string, it is the name of one of the basic types above. - If it is an array, it must be an array of strings, where each string - is the name of one of the basic types, and each element is unique. - In this case, the JSON snippet is valid if it matches any of the given types. - """ - property["type"] = [json_type, "null"] - else: - property["type"] = json_type - if json_format is not None: - property["format"] = json_format - if field.primaryKey: - property["primaryKey"] = field.primaryKey - if field.unique: - property["unique"] = True - if json_type == "object": - # TODO: any better idea to distinguish between properties and patternProperties? - if field.fields.keys() and next(iter(field.fields.keys())).startswith("^"): - property["patternProperties"] = to_properties(field.fields) - else: - property["properties"] = to_properties(field.fields) - property["required"] = to_required(field.fields) - if json_type == "array": - property["items"] = to_property(field.items) - if field.pattern: - property["pattern"] = field.pattern - if field.enum: - property["enum"] = field.enum - if field.minLength is not None: - property["minLength"] = field.minLength - if field.maxLength is not None: - property["maxLength"] = field.maxLength - if field.title: - property["title"] = field.title - if field.description: - property["description"] = field.description - if field.exclusiveMinimum is not None: - property["exclusiveMinimum"] = field.exclusiveMinimum - if field.exclusiveMaximum is not None: - property["exclusiveMaximum"] = field.exclusiveMaximum - if field.minimum is not None: - property["minimum"] = field.minimum - if field.maximum is not None: - property["maximum"] = field.maximum - if field.tags: - property["tags"] = field.tags - if field.pii: - property["pii"] = field.pii - if field.classification is not None: - property["classification"] = field.classification - - # TODO: all constraints - return property - - -def to_required(fields: Dict[str, Field]): - required = [] - for field_name, field in fields.items(): - if field.required is True: - required.append(field_name) - return required - - -def convert_type_format(type, format) -> (str, str): - if type is None: - return None, None - if type.lower() in ["string", "varchar", "text"]: - return "string", format - if type.lower() in ["timestamp", "timestamp_tz", "date-time", "datetime"]: - return "string", "date-time" - if type.lower() in ["timestamp_ntz"]: - return "string", None - if type.lower() in ["date"]: - return "string", "date" - if type.lower() in ["time"]: - return "string", "time" - if type.lower() in ["number", 
"decimal", "numeric", "float", "double"]: - return "number", None - if type.lower() in ["integer", "int", "long", "bigint"]: - return "integer", None - if type.lower() in ["boolean"]: - return "boolean", None - if type.lower() in ["object", "record", "struct"]: - return "object", None - if type.lower() in ["array"]: - return "array", None - return None, None - - -def convert_format(self, format): - if format is None: - return None - if format.lower() in ["uri"]: - return "uri" - if format.lower() in ["email"]: - return "email" - if format.lower() in ["uuid"]: - return "uuid" - if format.lower() in ["boolean"]: - return "boolean" - return None - - -def to_jsonschema(model_key, model_value: Model) -> dict: - model = { - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": to_properties(model_value.fields), - "required": to_required(model_value.fields), - } - if model_value.title: - model["title"] = model_value.title - if model_value.description: - model["description"] = model_value.description - - return model diff --git a/datacontract/export/jsonschema_exporter.py b/datacontract/export/jsonschema_exporter.py new file mode 100644 index 000000000..7746c4a5a --- /dev/null +++ b/datacontract/export/jsonschema_exporter.py @@ -0,0 +1,231 @@ +import json +from typing import List, Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty + +from datacontract.export.exporter import Exporter, _check_schema_name_for_export + + +class JsonSchemaExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + model_name, model_value = _check_schema_name_for_export(data_contract, schema_name, self.export_format) + return to_jsonschema_json(model_name, model_value) + + +def to_jsonschemas(data_contract: OpenDataContractStandard) -> dict: + jsonschemas = {} + if data_contract.schema_: + for schema_obj in data_contract.schema_: + jsonschema = to_jsonschema(schema_obj.name, schema_obj) + jsonschemas[schema_obj.name] = jsonschema + return jsonschemas + + +def to_jsonschema_json(model_key: str, model_value: SchemaObject) -> str: + jsonschema = to_jsonschema(model_key, model_value) + return json.dumps(jsonschema, indent=2) + + +def to_properties(properties: List[SchemaProperty]) -> dict: + result = {} + for prop in properties: + result[prop.name] = to_property(prop) + return result + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def _get_config_value(prop: SchemaProperty, key: str): + """Get a custom property value.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def _get_enum_from_quality(prop: SchemaProperty): + """Get enum values from quality rules (invalidValues metric with validValues).""" + if prop.quality is None: + return None + for q in prop.quality: + if q.metric == "invalidValues" and q.arguments: + valid_values = q.arguments.get("validValues") + if valid_values: + return valid_values + return None + + +def to_property(prop: SchemaProperty) -> dict: + property_dict = {} + field_type = prop.logicalType + json_type, json_format = convert_type_format(field_type, _get_logical_type_option(prop, "format")) + + if json_type is not None: + if not prop.required: + """ + From: 
https://json-schema.org/understanding-json-schema/reference/type + The type keyword may either be a string or an array: + + If it's a string, it is the name of one of the basic types above. + If it is an array, it must be an array of strings, where each string + is the name of one of the basic types, and each element is unique. + In this case, the JSON snippet is valid if it matches any of the given types. + """ + property_dict["type"] = [json_type, "null"] + else: + property_dict["type"] = json_type + + if json_format is not None: + property_dict["format"] = json_format + + if prop.primaryKey: + property_dict["primaryKey"] = prop.primaryKey + + if prop.unique: + property_dict["unique"] = True + + if json_type == "object": + nested_props = prop.properties or [] + # TODO: any better idea to distinguish between properties and patternProperties? + if nested_props and nested_props[0].name.startswith("^"): + property_dict["patternProperties"] = to_properties(nested_props) + else: + property_dict["properties"] = to_properties(nested_props) + property_dict["required"] = to_required(nested_props) + + if json_type == "array" and prop.items: + property_dict["items"] = to_property(prop.items) + + pattern = _get_logical_type_option(prop, "pattern") + if pattern: + property_dict["pattern"] = pattern + + # Check logicalTypeOptions, customProperties, or quality rules for enum + enum_values = _get_logical_type_option(prop, "enum") + if not enum_values: + enum_from_custom = _get_config_value(prop, "enum") + if enum_from_custom: + # Parse JSON string from customProperties + try: + enum_values = json.loads(enum_from_custom) + except (json.JSONDecodeError, TypeError): + enum_values = None + if not enum_values: + # Check quality rules for invalidValues metric with validValues + enum_values = _get_enum_from_quality(prop) + if enum_values: + property_dict["enum"] = enum_values + + min_length = _get_logical_type_option(prop, "minLength") + if min_length is not None: + property_dict["minLength"] = min_length + + max_length = _get_logical_type_option(prop, "maxLength") + if max_length is not None: + property_dict["maxLength"] = max_length + + if prop.businessName: + property_dict["title"] = prop.businessName + + if prop.description: + property_dict["description"] = prop.description + + exclusive_minimum = _get_logical_type_option(prop, "exclusiveMinimum") + if exclusive_minimum is not None: + property_dict["exclusiveMinimum"] = exclusive_minimum + + exclusive_maximum = _get_logical_type_option(prop, "exclusiveMaximum") + if exclusive_maximum is not None: + property_dict["exclusiveMaximum"] = exclusive_maximum + + minimum = _get_logical_type_option(prop, "minimum") + if minimum is not None: + property_dict["minimum"] = minimum + + maximum = _get_logical_type_option(prop, "maximum") + if maximum is not None: + property_dict["maximum"] = maximum + + if prop.tags: + property_dict["tags"] = prop.tags + + pii = _get_config_value(prop, "pii") + if pii: + property_dict["pii"] = pii + + if prop.classification is not None: + property_dict["classification"] = prop.classification + + # TODO: all constraints + return property_dict + + +def to_required(properties: List[SchemaProperty]) -> list: + required = [] + for prop in properties: + if prop.required is True: + required.append(prop.name) + return required + + +def convert_type_format(type_str: Optional[str], format_str: Optional[str]) -> tuple: + if type_str is None: + return None, None + if type_str.lower() in ["string", "varchar", "text"]: + return "string", format_str + 
if type_str.lower() in ["timestamp", "timestamp_tz", "date-time", "datetime"]: + return "string", "date-time" + if type_str.lower() in ["timestamp_ntz"]: + return "string", None + if type_str.lower() in ["date"]: + return "string", "date" + if type_str.lower() in ["time"]: + return "string", "time" + if type_str.lower() in ["number", "decimal", "numeric", "float", "double"]: + return "number", None + if type_str.lower() in ["integer", "int", "long", "bigint"]: + return "integer", None + if type_str.lower() in ["boolean"]: + return "boolean", None + if type_str.lower() in ["object", "record", "struct"]: + return "object", None + if type_str.lower() in ["array"]: + return "array", None + return None, None + + +def convert_format(format_str: Optional[str]) -> Optional[str]: + if format_str is None: + return None + if format_str.lower() in ["uri"]: + return "uri" + if format_str.lower() in ["email"]: + return "email" + if format_str.lower() in ["uuid"]: + return "uuid" + if format_str.lower() in ["boolean"]: + return "boolean" + return None + + +def to_jsonschema(model_key: str, model_value: SchemaObject) -> dict: + properties = model_value.properties or [] + model = { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": to_properties(properties), + "required": to_required(properties), + } + if model_value.businessName: + model["title"] = model_value.businessName + if model_value.description: + model["description"] = model_value.description + + return model diff --git a/datacontract/export/markdown_converter.py b/datacontract/export/markdown_converter.py deleted file mode 100644 index 0d235fae3..000000000 --- a/datacontract/export/markdown_converter.py +++ /dev/null @@ -1,318 +0,0 @@ -from typing import Dict, List - -from pydantic import BaseModel - -from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Definition, - Field, - Model, - Server, - ServiceLevel, -) - -TAB = " " -ARROW = "↳" - - -class MarkdownExporter(Exporter): - """Exporter implementation for converting data contracts to Markdown.""" - - def export( - self, - data_contract: DataContractSpecification, - model: Model, - server: str, - sql_server_type: str, - export_args: dict, - ) -> str: - """Exports a data contract to Markdown format.""" - return to_markdown(data_contract) - - -def to_markdown(data_contract: DataContractSpecification) -> str: - """ - Convert a data contract to its Markdown representation. - - Args: - data_contract (DataContractSpecification): The data contract to convert. - - Returns: - str: The Markdown representation of the data contract. - """ - markdown_parts = [ - f"# {data_contract.id}", - "## Info", - obj_attributes_to_markdown(data_contract.info), - "", - "## Servers", - servers_to_markdown(data_contract.servers), - "", - "## Terms", - obj_attributes_to_markdown(data_contract.terms), - "", - "## Models", - models_to_markdown(data_contract.models), - "", - "## Definitions", - definitions_to_markdown(data_contract.definitions), - "", - "## Service levels", - service_level_to_markdown(data_contract.servicelevels), - ] - return "\n".join(markdown_parts) - - -def obj_attributes_to_markdown(obj: BaseModel, excluded_fields: set = set(), is_in_table_cell: bool = False) -> str: - if not obj: - return "" - if is_in_table_cell: - bullet_char = "•" - newline_char = "
" - else: - bullet_char = "-" - newline_char = "\n" - model_attributes_to_include = set(obj.__class__.model_fields.keys()) - obj_model = obj.model_dump(exclude_unset=True, include=model_attributes_to_include, exclude=excluded_fields) - description_value = obj_model.pop("description", None) - attributes = [ - (f"{bullet_char} `{attr}`" if value is True else f"{bullet_char} **{attr}:** {value}") - for attr, value in obj_model.items() - if value - ] - description = f"*{description_to_markdown(description_value)}*" - extra = [extra_to_markdown(obj)] if obj.model_extra else [] - return newline_char.join([description] + attributes + extra) - - -def servers_to_markdown(servers: Dict[str, Server]) -> str: - if not servers: - return "" - markdown_parts = [ - "| Name | Type | Attributes |", - "| ---- | ---- | ---------- |", - ] - for server_name, server in servers.items(): - markdown_parts.append( - f"| {server_name} | {server.type or ''} | {obj_attributes_to_markdown(server, {'type'}, True)} |" - ) - return "\n".join(markdown_parts) - - -def models_to_markdown(models: Dict[str, Model]) -> str: - return "\n".join(model_to_markdown(model_name, model) for model_name, model in models.items()) - - -def model_to_markdown(model_name: str, model: Model) -> str: - """ - Generate Markdown representation for a specific model. - - Args: - model_name (str): The name of the model. - model (Model): The model object. - - Returns: - str: The Markdown representation of the model. - """ - parts = [ - f"### {model_name}", - f"*{description_to_markdown(model.description)}*", - "", - "| Field | Type | Attributes |", - "| ----- | ---- | ---------- |", - ] - - # Append generated field rows - parts.append(fields_to_markdown(model.fields)) - return "\n".join(parts) - - -def fields_to_markdown( - fields: Dict[str, Field], - level: int = 0, -) -> str: - """ - Generate Markdown table rows for all fields in a model. - - Args: - fields (Dict[str, Field]): The fields to process. - level (int): The level of nesting for indentation. - - Returns: - str: A Markdown table rows for the fields. - """ - - return "\n".join(field_to_markdown(field_name, field, level) for field_name, field in fields.items()) - - -def field_to_markdown(field_name: str, field: Field, level: int = 0) -> str: - """ - Generate Markdown table rows for a single field, including nested structures. - - Args: - field_name (str): The name of the field. - field (Field): The field object. - level (int): The level of nesting for indentation. - - Returns: - str: A Markdown table rows for the field. 
- """ - tabs = TAB * level - arrow = ARROW if level > 0 else "" - column_name = f"{tabs}{arrow} {field_name}" - - attributes = obj_attributes_to_markdown(field, {"type", "fields", "items", "keys", "values"}, True) - - rows = [f"| {column_name} | {field.type} | {attributes} |"] - - # Recursively handle nested fields, array, map - if field.fields: - rows.append(fields_to_markdown(field.fields, level + 1)) - if field.items: - rows.append(field_to_markdown("items", field.items, level + 1)) - if field.keys: - rows.append(field_to_markdown("keys", field.keys, level + 1)) - if field.values: - rows.append(field_to_markdown("values", field.values, level + 1)) - - return "\n".join(rows) - - -def definitions_to_markdown(definitions: Dict[str, Definition]) -> str: - if not definitions: - return "" - markdown_parts = [ - "| Name | Type | Domain | Attributes |", - "| ---- | ---- | ------ | ---------- |", - ] - for definition_name, definition in definitions.items(): - markdown_parts.append( - f"| {definition_name} | {definition.type or ''} | {definition.domain or ''} | {obj_attributes_to_markdown(definition, {'name', 'type', 'domain'}, True)} |", - ) - return "\n".join(markdown_parts) - - -def service_level_to_markdown(service_level: ServiceLevel | None) -> str: - if not service_level: - return "" - sections = { - "Availability": service_level.availability, - "Retention": service_level.retention, - "Latency": service_level.latency, - "Freshness": service_level.freshness, - "Frequency": service_level.frequency, - "Support": service_level.support, - "Backup": service_level.backup, - } - result = [f"### {name}\n{obj_attributes_to_markdown(attr)}\n" for name, attr in sections.items() if attr] - return "\n".join(result) - - -def description_to_markdown(description: str | None) -> str: - return (description or "No description.").replace("\n", "
") - - -def array_of_dict_to_markdown(array: List[Dict[str, str]]) -> str: - """ - Convert a list of dictionaries to a Markdown table. - - Args: - array (List[Dict[str, str]]): A list of dictionaries where each dictionary represents a row in the table. - - Returns: - str: A Markdown formatted table. - """ - if not array: - return "" - - headers = [] - - for item in array: - headers += item.keys() - headers = list(dict.fromkeys(headers)) # Preserve order and remove duplicates - - markdown_parts = [ - "| " + " | ".join(headers) + " |", - "| " + " | ".join(["---"] * len(headers)) + " |", - ] - - for row in array: - element = row - markdown_parts.append( - "| " - + " | ".join( - f"{str(element.get(header, ''))}".replace("\n", "
").replace("\t", TAB) for header in headers - ) - + " |" - ) - - return "\n".join(markdown_parts) + "\n" - - -def array_to_markdown(array: List[str]) -> str: - """ - Convert a list of strings to a Markdown formatted list. - - Args: - array (List[str]): A list of strings to convert. - - Returns: - str: A Markdown formatted list. - """ - if not array: - return "" - return "\n".join(f"- {item}" for item in array) + "\n" - - -def dict_to_markdown(dictionary: Dict[str, str]) -> str: - """ - Convert a dictionary to a Markdown formatted list. - - Args: - dictionary (Dict[str, str]): A dictionary where keys are item names and values are item descriptions. - - Returns: - str: A Markdown formatted list of items. - """ - if not dictionary: - return "" - - markdown_parts = [] - for key, value in dictionary.items(): - if isinstance(value, dict): - markdown_parts.append(f"- {key}") - nested_markdown = dict_to_markdown(value) - if nested_markdown: - nested_lines = nested_markdown.split("\n") - for line in nested_lines: - if line.strip(): - markdown_parts.append(f" {line}") - else: - markdown_parts.append(f"- {key}: {value}") - return "\n".join(markdown_parts) + "\n" - - -def extra_to_markdown(obj: BaseModel) -> str: - """ - Convert the extra attributes of a data contract to Markdown format. - Args: - obj (BaseModel): The data contract object containing extra attributes. - Returns: - str: A Markdown formatted string representing the extra attributes of the data contract. - """ - markdown_part = "" - extra = obj.model_extra - if extra: - for key_extra, value_extra in extra.items(): - markdown_part += f"\n### {key_extra.capitalize()}\n" - if isinstance(value_extra, list) and len(value_extra): - if isinstance(value_extra[0], dict): - markdown_part += array_of_dict_to_markdown(value_extra) - elif isinstance(value_extra[0], str): - markdown_part += array_to_markdown(value_extra) - elif isinstance(value_extra, dict): - markdown_part += dict_to_markdown(value_extra) - else: - markdown_part += f"{str(value_extra)}\n" - return markdown_part diff --git a/datacontract/export/markdown_exporter.py b/datacontract/export/markdown_exporter.py new file mode 100644 index 000000000..443417f8f --- /dev/null +++ b/datacontract/export/markdown_exporter.py @@ -0,0 +1,380 @@ +from typing import Dict, List, Optional + +from open_data_contract_standard.model import ( + Description, + OpenDataContractStandard, + SchemaObject, + SchemaProperty, + Server, + ServiceLevelAgreementProperty, +) +from pydantic import BaseModel + +from datacontract.export.exporter import Exporter + +TAB = " " +ARROW = "↳" + + +class MarkdownExporter(Exporter): + """Exporter implementation for converting data contracts to Markdown.""" + + def export( + self, + data_contract: OpenDataContractStandard, + schema_name: str, + server: str, + sql_server_type: str, + export_args: dict, + ) -> str: + """Exports a data contract to Markdown format.""" + return to_markdown(data_contract) + + +def to_markdown(data_contract: OpenDataContractStandard) -> str: + """ + Convert a data contract to its Markdown representation. + + Args: + data_contract (OpenDataContractStandard): The data contract to convert. + + Returns: + str: The Markdown representation of the data contract. 
+ """ + markdown_parts = [ + f"# {data_contract.id}", + "## Info", + info_to_markdown(data_contract), + "", + "## Terms of Use", + terms_of_use_to_markdown(data_contract.description), + "", + "## Servers", + servers_to_markdown(data_contract.servers), + "", + "## Schema", + schema_to_markdown(data_contract.schema_), + "", + "## SLA Properties", + sla_properties_to_markdown(data_contract.slaProperties), + ] + return "\n".join(markdown_parts) + + +def info_to_markdown(data_contract: OpenDataContractStandard) -> str: + """Convert basic info to markdown.""" + parts = [] + if data_contract.description: + parts.append(f"*{description_to_markdown(data_contract.description)}*") + if data_contract.name: + parts.append(f"- **name:** {data_contract.name}") + if data_contract.version: + parts.append(f"- **version:** {data_contract.version}") + if data_contract.status: + parts.append(f"- **status:** {data_contract.status}") + if data_contract.team: + parts.append(f"- **team:** {data_contract.team.name}") + return "\n".join(parts) + + +def terms_of_use_to_markdown(description: Optional[Description]) -> str: + """Convert Description object's terms of use fields to markdown.""" + if not description: + return "*No terms of use defined.*" + + # Handle case where description is a string (legacy) + if isinstance(description, str): + return "*No terms of use defined.*" + + parts = [] + if description.usage: + parts.append(f"### Usage\n{description.usage}") + if description.purpose: + parts.append(f"### Purpose\n{description.purpose}") + if description.limitations: + parts.append(f"### Limitations\n{description.limitations}") + + if not parts: + return "*No terms of use defined.*" + + return "\n\n".join(parts) + + +def obj_attributes_to_markdown(obj: BaseModel, excluded_fields: set = set(), is_in_table_cell: bool = False) -> str: + if not obj: + return "" + if is_in_table_cell: + bullet_char = "•" + newline_char = "
" + else: + bullet_char = "-" + newline_char = "\n" + model_attributes_to_include = set(obj.__class__.model_fields.keys()) + obj_model = obj.model_dump(exclude_unset=True, include=model_attributes_to_include, exclude=excluded_fields) + description_value = obj_model.pop("description", None) + attributes = [ + (f"{bullet_char} `{attr}`" if value is True else f"{bullet_char} **{attr}:** {value}") + for attr, value in obj_model.items() + if value + ] + description = f"*{description_to_markdown(description_value)}*" + extra = [extra_to_markdown(obj, is_in_table_cell)] if obj.model_extra else [] + return newline_char.join([description] + attributes + extra) + + +def servers_to_markdown(servers: Optional[List[Server]]) -> str: + if not servers: + return "" + markdown_parts = [ + "| Name | Type | Attributes |", + "| ---- | ---- | ---------- |", + ] + for server in servers: + server_name = server.server or "" + markdown_parts.append( + f"| {server_name} | {server.type or ''} | {obj_attributes_to_markdown(server, {'type', 'server'}, True)} |" + ) + return "\n".join(markdown_parts) + + +def schema_to_markdown(schema: Optional[List[SchemaObject]]) -> str: + if not schema: + return "" + return "\n".join(schema_obj_to_markdown(schema_obj.name, schema_obj) for schema_obj in schema) + + +def schema_obj_to_markdown(model_name: str, schema_obj: SchemaObject) -> str: + """ + Generate Markdown representation for a specific schema object. + + Args: + model_name (str): The name of the model. + schema_obj (SchemaObject): The schema object. + + Returns: + str: The Markdown representation of the schema object. + """ + parts = [ + f"### {model_name}", + f"*{description_to_markdown(schema_obj.description)}*", + "", + "| Field | Type | Attributes |", + "| ----- | ---- | ---------- |", + ] + + # Append generated field rows + parts.append(properties_to_markdown(schema_obj.properties)) + return "\n".join(parts) + + +def properties_to_markdown( + properties: Optional[List[SchemaProperty]], + level: int = 0, +) -> str: + """ + Generate Markdown table rows for all properties in a schema. + + Args: + properties (List[SchemaProperty]): The properties to process. + level (int): The level of nesting for indentation. + + Returns: + str: A Markdown table rows for the properties. + """ + if not properties: + return "" + return "\n".join(property_to_markdown(prop.name, prop, level) for prop in properties) + + +def _get_type(prop: SchemaProperty) -> str: + """Get the display type for a property.""" + if prop.logicalType: + return prop.logicalType + if prop.physicalType: + return prop.physicalType + return "" + + +def property_to_markdown(field_name: str, prop: SchemaProperty, level: int = 0) -> str: + """ + Generate Markdown table rows for a single property, including nested structures. + + Args: + field_name (str): The name of the field. + prop (SchemaProperty): The property object. + level (int): The level of nesting for indentation. + + Returns: + str: A Markdown table rows for the property. 
+ """ + tabs = TAB * level + arrow = ARROW if level > 0 else "" + column_name = f"{tabs}{arrow} {field_name}" + + prop_type = _get_type(prop) + attributes = obj_attributes_to_markdown(prop, {"name", "logicalType", "physicalType", "properties", "items"}, True) + + rows = [f"| {column_name} | {prop_type} | {attributes} |"] + + # Recursively handle nested properties, array + if prop.properties: + rows.append(properties_to_markdown(prop.properties, level + 1)) + if prop.items: + rows.append(property_to_markdown("items", prop.items, level + 1)) + + return "\n".join(rows) + + +def sla_properties_to_markdown(sla_properties: Optional[List[ServiceLevelAgreementProperty]]) -> str: + """Convert SLA properties to markdown.""" + if not sla_properties: + return "" + + markdown_parts = [ + "| Property | Value | Unit |", + "| -------- | ----- | ---- |", + ] + for sla in sla_properties: + prop_name = sla.property or "" + value = sla.value or "" + unit = sla.unit or "" + markdown_parts.append(f"| {prop_name} | {value} | {unit} |") + return "\n".join(markdown_parts) + + +def description_to_markdown(description) -> str: + """Convert a description (string or Description object) to markdown text.""" + if description is None: + return "No description." + if isinstance(description, str): + return description.replace("\n", "
") + # Handle Description object - use purpose as the primary description + if hasattr(description, "purpose") and description.purpose: + return description.purpose.replace("\n", "
") + if hasattr(description, "usage") and description.usage: + return description.usage.replace("\n", "
") + return "No description." + + +def array_of_dict_to_markdown(array: List[Dict[str, str]]) -> str: + """ + Convert a list of dictionaries to a Markdown table. + + Args: + array (List[Dict[str, str]]): A list of dictionaries where each dictionary represents a row in the table. + + Returns: + str: A Markdown formatted table. + """ + if not array: + return "" + + headers = [] + + for item in array: + headers += item.keys() + headers = list(dict.fromkeys(headers)) # Preserve order and remove duplicates + + markdown_parts = [ + "| " + " | ".join(headers) + " |", + "| " + " | ".join(["---"] * len(headers)) + " |", + ] + + for row in array: + element = row + markdown_parts.append( + "| " + + " | ".join( + f"{str(element.get(header, ''))}".replace("\n", "
").replace("\t", TAB) for header in headers + ) + + " |" + ) + + return "\n".join(markdown_parts) + "\n" + + +def array_to_markdown(array: List[str]) -> str: + """ + Convert a list of strings to a Markdown formatted list. + + Args: + array (List[str]): A list of strings to convert. + + Returns: + str: A Markdown formatted list. + """ + if not array: + return "" + return "\n".join(f"- {item}" for item in array) + "\n" + + +def dict_to_markdown(dictionary: Dict[str, str]) -> str: + """ + Convert a dictionary to a Markdown formatted list. + + Args: + dictionary (Dict[str, str]): A dictionary where keys are item names and values are item descriptions. + + Returns: + str: A Markdown formatted list of items. + """ + if not dictionary: + return "" + + markdown_parts = [] + for key, value in dictionary.items(): + if isinstance(value, dict): + markdown_parts.append(f"- {key}") + nested_markdown = dict_to_markdown(value) + if nested_markdown: + nested_lines = nested_markdown.split("\n") + for line in nested_lines: + if line.strip(): + markdown_parts.append(f" {line}") + else: + markdown_parts.append(f"- {key}: {value}") + return "\n".join(markdown_parts) + "\n" + + +def extra_to_markdown(obj: BaseModel, is_in_table_cell: bool = False) -> str: + """ + Convert the extra attributes of a data contract to Markdown format. + Args: + obj (BaseModel): The data contract object containing extra attributes. + is_in_table_cell (bool): Whether the extra attributes are in a table cell. + Returns: + str: A Markdown formatted string representing the extra attributes of the data contract. + """ + extra = obj.model_extra + + if not extra: + return "" + + bullet_char = "•" + value_line_ending = "" if is_in_table_cell else "\n" + row_suffix = "
" if is_in_table_cell else "" + + def render_header(key: str) -> str: + return f"{bullet_char} **{key}:** " if is_in_table_cell else f"\n### {key.capitalize()}\n" + + parts: list[str] = [] + for key_extra, value_extra in extra.items(): + if not value_extra: + continue + + parts.append(render_header(key_extra)) + + if isinstance(value_extra, list) and len(value_extra): + if isinstance(value_extra[0], dict): + parts.append(array_of_dict_to_markdown(value_extra)) + elif isinstance(value_extra[0], str): + parts.append(array_to_markdown(value_extra)) + elif isinstance(value_extra, dict): + parts.append(dict_to_markdown(value_extra)) + else: + parts.append(f"{str(value_extra)}{value_line_ending}") + + if row_suffix: + parts.append(row_suffix) + + return "".join(parts) diff --git a/datacontract/export/mermaid_exporter.py b/datacontract/export/mermaid_exporter.py index 2cd91afd9..0eefc6680 100644 --- a/datacontract/export/mermaid_exporter.py +++ b/datacontract/export/mermaid_exporter.py @@ -1,94 +1,57 @@ from open_data_contract_standard.model import OpenDataContractStandard from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification class MermaidExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: return to_mermaid(data_contract) -def to_mermaid(data_contract_spec: DataContractSpecification | OpenDataContractStandard) -> str | None: - if isinstance(data_contract_spec, DataContractSpecification): - return dcs_to_mermaid(data_contract_spec) - elif isinstance(data_contract_spec, OpenDataContractStandard): - return odcs_to_mermaid(data_contract_spec) - else: - return None - - -def dcs_to_mermaid(data_contract_spec: DataContractSpecification) -> str | None: +def to_mermaid(data_contract: OpenDataContractStandard) -> str | None: + """Convert ODCS data contract to Mermaid ER diagram.""" try: - if not data_contract_spec.models: + if not data_contract.schema_: return None mmd_entity = "erDiagram\n" mmd_references = [] - for model_name, model in data_contract_spec.models.items(): - clean_model = _sanitize_name(model_name) - entity_block = "" - - for field_name, field in model.fields.items(): - clean_name = _sanitize_name(field_name) - field_type = field.type or "unknown" - - is_pk = bool(field.primaryKey or (field.unique and field.required)) - is_fk = bool(field.references) - - entity_block += _field_line(clean_name, field_type, pk=is_pk, uk=bool(field.unique), fk=is_fk) - - if field.references: - references = field.references.replace(".", "·") - parts = references.split("·") - referenced_model = _sanitize_name(parts[0]) if len(parts) > 0 else "" - referenced_field = _sanitize_name(parts[1]) if len(parts) > 1 else "" - if referenced_model: - label = referenced_field or clean_name - mmd_references.append(f'"**{referenced_model}**" ||--o{{ "**{clean_model}**" : {label}') - - mmd_entity += f'\t"**{clean_model}**" {{\n{entity_block}}}\n' - - if mmd_references: - mmd_entity += "\n" + "\n".join(mmd_references) - - return mmd_entity + "\n" - - except Exception as e: - print(f"Error generating DCS mermaid diagram: {e}") - return None - - -def odcs_to_mermaid(data_contract_spec: OpenDataContractStandard) -> str | None: - try: - if not data_contract_spec.schema_: - return None - - mmd_entity = "erDiagram\n" - - for schema in data_contract_spec.schema_: + for schema in data_contract.schema_: 
schema_name = schema.name or schema.physicalName + clean_model = _sanitize_name(schema_name) entity_block = "" if schema.properties: for prop in schema.properties: clean_name = _sanitize_name(prop.name) - indicators = "" + prop_type = prop.logicalType or prop.physicalType or "unknown" - if prop.primaryKey: - indicators += "🔑" - if getattr(prop, "partitioned", False): - indicators += "🔀" - if getattr(prop, "criticalDataElement", False): - indicators += "⚠️" + is_pk = bool(prop.primaryKey) + is_uk = bool(prop.unique) + is_fk = bool(prop.relationships) + + entity_block += _field_line(clean_name, prop_type, pk=is_pk, uk=is_uk, fk=is_fk) + + # Handle references from relationships + if prop.relationships: + for rel in prop.relationships: + ref_target = getattr(rel, 'to', None) or getattr(rel, 'ref', None) + if ref_target: + references = ref_target.replace(".", "·") + parts = references.split("·") + referenced_model = _sanitize_name(parts[0]) if len(parts) > 0 else "" + referenced_field = _sanitize_name(parts[1]) if len(parts) > 1 else "" + if referenced_model: + label = referenced_field or clean_name + mmd_references.append(f'"**{referenced_model}**" ||--o{{ "**{clean_model}**" : {label}') - prop_type = prop.logicalType or prop.physicalType or "unknown" - entity_block += f"\t{clean_name}{indicators} {prop_type}\n" + mmd_entity += f'\t"**{clean_model}**" {{\n{entity_block}}}\n' - mmd_entity += f'\t"**{schema_name}**"' + "{\n" + entity_block + "}\n" + if mmd_references: + mmd_entity += "\n" + "\n".join(mmd_references) - return f"{mmd_entity}\n" + return mmd_entity + "\n" except Exception as e: print(f"Error generating ODCS mermaid diagram: {e}") diff --git a/datacontract/export/odcs_export_helper.py b/datacontract/export/odcs_export_helper.py new file mode 100644 index 000000000..4ffd6dbb7 --- /dev/null +++ b/datacontract/export/odcs_export_helper.py @@ -0,0 +1,319 @@ +"""Helper functions for ODCS export operations.""" + +from typing import Any, Dict, List, Optional, Tuple + +from open_data_contract_standard.model import ( + OpenDataContractStandard, + SchemaObject, + SchemaProperty, + Server, +) + + +def get_schema_by_name( + data_contract: OpenDataContractStandard, name: str +) -> Optional[SchemaObject]: + """Get a schema object by name from the data contract.""" + if data_contract.schema_ is None: + return None + return next((s for s in data_contract.schema_ if s.name == name), None) + + +def get_all_schema_names(data_contract: OpenDataContractStandard) -> List[str]: + """Get all schema names from the data contract.""" + if data_contract.schema_ is None: + return [] + return [s.name for s in data_contract.schema_] + + +def get_server_by_name( + data_contract: OpenDataContractStandard, name: str +) -> Optional[Server]: + """Get a server by name from the data contract.""" + if data_contract.servers is None: + return None + return next((s for s in data_contract.servers if s.server == name), None) + + +def get_first_server(data_contract: OpenDataContractStandard) -> Optional[Server]: + """Get the first server from the data contract.""" + if data_contract.servers is None or len(data_contract.servers) == 0: + return None + return data_contract.servers[0] + + +def get_owner(data_contract: OpenDataContractStandard) -> Optional[str]: + """Get the owner from the data contract (team name).""" + if data_contract.team is None: + return None + return data_contract.team.name + + +def get_description(data_contract: OpenDataContractStandard) -> Optional[str]: + """Get the description from the data contract.""" 
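
# --- Editor's illustrative sketch (not part of the patch): using the accessor helpers
# --- defined above against an in-memory ODCS contract. All field values are hypothetical;
# --- the object construction mirrors how other exporters in this diff build ODCS objects,
# --- and the helper functions are assumed to be in scope from this module.
from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, Server

contract = OpenDataContractStandard(
    apiVersion="v3.0.1", kind="DataContract", id="orders", name="Orders", version="1.0.0", status="active"
)
contract.schema_ = [SchemaObject(name="orders", physicalName="orders", logicalType="object")]
contract.servers = [Server(server="production", type="snowflake")]

assert get_all_schema_names(contract) == ["orders"]
assert get_schema_by_name(contract, "orders") is contract.schema_[0]
assert get_server_by_name(contract, "production") is contract.servers[0]
assert get_first_server(contract) is contract.servers[0]
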
+ return data_contract.description + + +def property_to_dict(properties: List[SchemaProperty]) -> Dict[str, SchemaProperty]: + """Convert a list of properties to a dict keyed by property name.""" + if properties is None: + return {} + return {p.name: p for p in properties} + + +def get_property_type(prop: SchemaProperty) -> Optional[str]: + """Get the type of a property (logicalType or physicalType).""" + return prop.logicalType or prop.physicalType + + +def get_property_config(prop: SchemaProperty, key: str) -> Optional[Any]: + """Get a custom property value from a SchemaProperty.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def get_logical_type_option(prop: SchemaProperty, key: str) -> Optional[Any]: + """Get a logical type option from a SchemaProperty.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def iter_properties(schema: SchemaObject) -> List[Tuple[str, SchemaProperty]]: + """Iterate over properties in a schema object, yielding (name, property) tuples.""" + if schema.properties is None: + return [] + return [(p.name, p) for p in schema.properties] + + +def iter_schemas(data_contract: OpenDataContractStandard) -> List[Tuple[str, SchemaObject]]: + """Iterate over schemas in a data contract, yielding (name, schema) tuples.""" + if data_contract.schema_ is None: + return [] + return [(s.name, s) for s in data_contract.schema_] + + +class PropertyAdapter: + """Adapter to make SchemaProperty compatible with DCS Field interface. + + This adapter provides a Field-like interface for SchemaProperty to maintain + backward compatibility with existing type converters. + """ + + def __init__(self, prop: SchemaProperty): + self._prop = prop + + @property + def name(self) -> Optional[str]: + return self._prop.name + + @property + def type(self) -> Optional[str]: + """Return the logical type (equivalent to DCS field.type).""" + return self._prop.logicalType + + @property + def required(self) -> Optional[bool]: + return self._prop.required + + @property + def description(self) -> Optional[str]: + return self._prop.description + + @property + def primaryKey(self) -> Optional[bool]: + return self._prop.primaryKey + + @property + def primary(self) -> Optional[bool]: + # Alias for primaryKey + return self._prop.primaryKey + + @property + def unique(self) -> Optional[bool]: + return self._prop.unique + + @property + def format(self) -> Optional[str]: + # Check for format in logicalTypeOptions + return get_logical_type_option(self._prop, "format") + + @property + def precision(self) -> Optional[int]: + return get_logical_type_option(self._prop, "precision") + + @property + def scale(self) -> Optional[int]: + return get_logical_type_option(self._prop, "scale") + + @property + def minLength(self) -> Optional[int]: + return get_logical_type_option(self._prop, "minLength") + + @property + def maxLength(self) -> Optional[int]: + return get_logical_type_option(self._prop, "maxLength") + + @property + def minimum(self) -> Optional[float]: + return get_logical_type_option(self._prop, "minimum") + + @property + def maximum(self) -> Optional[float]: + return get_logical_type_option(self._prop, "maximum") + + @property + def exclusiveMinimum(self) -> Optional[float]: + return get_logical_type_option(self._prop, "exclusiveMinimum") + + @property + def exclusiveMaximum(self) -> Optional[float]: + return get_logical_type_option(self._prop, "exclusiveMaximum") + 
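
# --- Editor's illustrative sketch (not part of the patch): reading logicalTypeOptions
# --- and custom properties through the helpers defined above. Values are hypothetical;
# --- the construction follows the SchemaProperty/CustomProperty usage elsewhere in this diff.
from open_data_contract_standard.model import CustomProperty, SchemaProperty

prop = SchemaProperty(name="order_id")
prop.logicalType = "string"
prop.logicalTypeOptions = {"maxLength": 36, "pattern": "^[A-Za-z0-9-]+$"}
prop.customProperties = [CustomProperty(property="pii", value=True)]

assert get_property_type(prop) == "string"
assert get_logical_type_option(prop, "maxLength") == 36
assert get_logical_type_option(prop, "minLength") is None
assert get_property_config(prop, "pii") is True
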
+ @property + def pattern(self) -> Optional[str]: + return get_logical_type_option(self._prop, "pattern") + + @property + def enum(self) -> Optional[List[str]]: + return get_logical_type_option(self._prop, "enum") + + @property + def title(self) -> Optional[str]: + return self._prop.businessName + + @property + def tags(self) -> Optional[List[str]]: + return self._prop.tags + + @property + def pii(self) -> Optional[bool]: + return get_property_config(self._prop, "pii") + + @property + def classification(self) -> Optional[str]: + return self._prop.classification + + @property + def references(self) -> Optional[str]: + # Check relationships for foreign key references + if self._prop.relationships is None: + return None + for rel in self._prop.relationships: + if hasattr(rel, 'ref'): + return rel.ref + return None + + @property + def config(self) -> Optional[Dict[str, Any]]: + """Convert customProperties to config dict.""" + if self._prop.customProperties is None: + return None + return {cp.property: cp.value for cp in self._prop.customProperties} + + @property + def items(self) -> Optional["PropertyAdapter"]: + """Return items property for array types.""" + if self._prop.items is None: + return None + return PropertyAdapter(self._prop.items) + + @property + def fields(self) -> Dict[str, "PropertyAdapter"]: + """Return nested fields as a dict of PropertyAdapters.""" + if self._prop.properties is None: + return {} + return {p.name: PropertyAdapter(p) for p in self._prop.properties} + + @property + def keys(self) -> Optional["PropertyAdapter"]: + """Return keys property for map types (from customProperties).""" + keys_prop = get_property_config(self._prop, "mapKeys") + if keys_prop is None: + return None + # Create a minimal property for the key type + from open_data_contract_standard.model import SchemaProperty + key_prop = SchemaProperty(name="key") + key_prop.logicalType = keys_prop if isinstance(keys_prop, str) else "string" + return PropertyAdapter(key_prop) + + @property + def values(self) -> Optional["PropertyAdapter"]: + """Return values property for map types (from customProperties).""" + values_prop = get_property_config(self._prop, "mapValues") + if values_prop is None: + return None + from open_data_contract_standard.model import SchemaProperty + val_prop = SchemaProperty(name="value") + val_prop.logicalType = values_prop if isinstance(values_prop, str) else "string" + return PropertyAdapter(val_prop) + + @property + def namespace(self) -> Optional[str]: + """Get namespace from customProperties.""" + return get_property_config(self._prop, "namespace") + + +class SchemaAdapter: + """Adapter to make SchemaObject compatible with DCS Model interface.""" + + def __init__(self, schema: SchemaObject): + self._schema = schema + + @property + def name(self) -> Optional[str]: + return self._schema.name + + @property + def type(self) -> Optional[str]: + return self._schema.physicalType + + @property + def description(self) -> Optional[str]: + return self._schema.description + + @property + def title(self) -> Optional[str]: + return self._schema.businessName + + @property + def namespace(self) -> Optional[str]: + # Check customProperties for namespace + if self._schema.customProperties is None: + return None + for cp in self._schema.customProperties: + if cp.property == "namespace": + return cp.value + return None + + @property + def primaryKey(self) -> Optional[List[str]]: + """Get primary key columns.""" + if self._schema.properties is None: + return None + pk_cols = [ + p.name for p in 
self._schema.properties + if p.primaryKey + ] + return pk_cols if pk_cols else None + + @property + def fields(self) -> Dict[str, PropertyAdapter]: + """Return fields as a dict of PropertyAdapters.""" + if self._schema.properties is None: + return {} + return {p.name: PropertyAdapter(p) for p in self._schema.properties} + + +def adapt_property(prop: SchemaProperty) -> PropertyAdapter: + """Create a PropertyAdapter from a SchemaProperty.""" + return PropertyAdapter(prop) + + +def adapt_schema(schema: SchemaObject) -> SchemaAdapter: + """Create a SchemaAdapter from a SchemaObject.""" + return SchemaAdapter(schema) diff --git a/datacontract/export/odcs_v3_exporter.py b/datacontract/export/odcs_v3_exporter.py index 4dfe9e7f9..a1e0e463f 100644 --- a/datacontract/export/odcs_v3_exporter.py +++ b/datacontract/export/odcs_v3_exporter.py @@ -1,375 +1,26 @@ -from typing import Any, Dict +"""ODCS V3 Exporter - Exports the internal ODCS model to YAML format.""" -from open_data_contract_standard.model import ( - CustomProperty, - DataQuality, - Description, - OpenDataContractStandard, - Role, - SchemaObject, - SchemaProperty, - Server, - ServiceLevelAgreementProperty, - Support, -) +from open_data_contract_standard.model import OpenDataContractStandard from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model class OdcsV3Exporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: return to_odcs_v3_yaml(data_contract) -def to_odcs_v3_yaml(data_contract_spec: DataContractSpecification) -> str: - result = to_odcs_v3(data_contract_spec) +def to_odcs_v3_yaml(data_contract: OpenDataContractStandard) -> str: + """Export the internal ODCS model to YAML format. - return result.to_yaml() + Since the internal model is now ODCS, this is a simple serialization. 
+ """ + return data_contract.to_yaml() -def to_odcs_v3(data_contract_spec: DataContractSpecification) -> OpenDataContractStandard: - result = OpenDataContractStandard( - apiVersion="v3.0.1", - kind="DataContract", - id=data_contract_spec.id, - name=data_contract_spec.info.title, - version=data_contract_spec.info.version, - status=to_status(data_contract_spec.info.status), - ) - if data_contract_spec.terms is not None: - result.description = Description( - purpose=data_contract_spec.terms.description.strip() - if data_contract_spec.terms.description is not None - else None, - usage=data_contract_spec.terms.usage.strip() if data_contract_spec.terms.usage is not None else None, - limitations=data_contract_spec.terms.limitations.strip() - if data_contract_spec.terms.limitations is not None - else None, - ) - result.schema_ = [] - for model_key, model_value in data_contract_spec.models.items(): - odcs_schema = to_odcs_schema(model_key, model_value) - result.schema_.append(odcs_schema) - if data_contract_spec.servicelevels is not None: - slas = [] - if data_contract_spec.servicelevels.availability is not None: - slas.append( - ServiceLevelAgreementProperty( - property="generalAvailability", value=data_contract_spec.servicelevels.availability.description - ) - ) - if data_contract_spec.servicelevels.retention is not None: - slas.append( - ServiceLevelAgreementProperty( - property="retention", value=data_contract_spec.servicelevels.retention.period - ) - ) +def to_odcs_v3(data_contract: OpenDataContractStandard) -> OpenDataContractStandard: + """Return the internal ODCS model. - if len(slas) > 0: - result.slaProperties = slas - if data_contract_spec.info.contact is not None: - support = [] - if data_contract_spec.info.contact.email is not None: - support.append(Support(channel="email", url="mailto:" + data_contract_spec.info.contact.email)) - if data_contract_spec.info.contact.url is not None: - support.append(Support(channel="other", url=data_contract_spec.info.contact.url)) - if len(support) > 0: - result.support = support - if data_contract_spec.servers is not None and len(data_contract_spec.servers) > 0: - servers = [] - - for server_key, server_value in data_contract_spec.servers.items(): - server = Server(server=server_key, type=server_value.type or "") - - # Set all the attributes that are not None - if server_value.environment is not None: - server.environment = server_value.environment - if server_value.account is not None: - server.account = server_value.account - if server_value.database is not None: - server.database = server_value.database - if server_value.schema_ is not None: - server.schema_ = server_value.schema_ - if server_value.format is not None: - server.format = server_value.format - if server_value.project is not None: - server.project = server_value.project - if server_value.dataset is not None: - server.dataset = server_value.dataset - if server_value.path is not None: - server.path = server_value.path - if server_value.delimiter is not None: - server.delimiter = server_value.delimiter - if server_value.endpointUrl is not None: - server.endpointUrl = server_value.endpointUrl - if server_value.location is not None: - server.location = server_value.location - if server_value.host is not None: - server.host = server_value.host - if server_value.port is not None: - server.port = server_value.port - if server_value.catalog is not None: - server.catalog = server_value.catalog - if server_value.topic is not None: - server.topic = server_value.topic - if server_value.http_path is 
not None: - server.http_path = server_value.http_path - if server_value.token is not None: - server.token = server_value.token - if server_value.driver is not None: - server.driver = server_value.driver - - if server_value.roles is not None: - server.roles = [Role(role=role.name, description=role.description) for role in server_value.roles] - - servers.append(server) - - if len(servers) > 0: - result.servers = servers - custom_properties = [] - if data_contract_spec.info.owner is not None: - custom_properties.append(CustomProperty(property="owner", value=data_contract_spec.info.owner)) - if data_contract_spec.info.model_extra is not None: - for key, value in data_contract_spec.info.model_extra.items(): - custom_properties.append(CustomProperty(property=key, value=value)) - if len(custom_properties) > 0: - result.customProperties = custom_properties - return result - - -def to_odcs_schema(model_key, model_value: Model) -> SchemaObject: - schema_obj = SchemaObject( - name=model_key, physicalName=model_key, logicalType="object", physicalType=model_value.type - ) - - if model_value.description is not None: - schema_obj.description = model_value.description - - properties = to_properties(model_value.fields) - if properties: - schema_obj.properties = properties - - model_quality = to_odcs_quality_list(model_value.quality) - if len(model_quality) > 0: - schema_obj.quality = model_quality - - custom_properties = [] - if model_value.model_extra is not None: - for key, value in model_value.model_extra.items(): - custom_properties.append(CustomProperty(property=key, value=value)) - - if len(custom_properties) > 0: - schema_obj.customProperties = custom_properties - - return schema_obj - - -def to_properties(fields: Dict[str, Field]) -> list: - properties = [] - for field_name, field in fields.items(): - property = to_property(field_name, field) - properties.append(property) - return properties - - -def to_logical_type(type: str) -> str | None: - if type is None: - return None - if type.lower() in ["string", "varchar", "text"]: - return "string" - if type.lower() in ["timestamp", "timestamp_tz"]: - return "date" - if type.lower() in ["timestamp_ntz"]: - return "date" - if type.lower() in ["date"]: - return "date" - if type.lower() in ["time"]: - return "string" - if type.lower() in ["number", "decimal", "numeric"]: - return "number" - if type.lower() in ["float", "double"]: - return "number" - if type.lower() in ["integer", "int", "long", "bigint"]: - return "integer" - if type.lower() in ["boolean"]: - return "boolean" - if type.lower() in ["object", "record", "struct"]: - return "object" - if type.lower() in ["bytes"]: - return "array" - if type.lower() in ["array"]: - return "array" - if type.lower() in ["variant"]: - return "variant" - if type.lower() in ["null"]: - return None - return None - - -def to_physical_type(config: Dict[str, Any]) -> str | None: - if config is None: - return None - if "postgresType" in config: - return config["postgresType"] - elif "bigqueryType" in config: - return config["bigqueryType"] - elif "snowflakeType" in config: - return config["snowflakeType"] - elif "redshiftType" in config: - return config["redshiftType"] - elif "sqlserverType" in config: - return config["sqlserverType"] - elif "databricksType" in config: - return config["databricksType"] - elif "physicalType" in config: - return config["physicalType"] - return None - - -def to_property(field_name: str, field: Field) -> SchemaProperty: - property = SchemaProperty(name=field_name) - - if field.fields: - 
properties = [] - for field_name_, field_ in field.fields.items(): - property_ = to_property(field_name_, field_) - properties.append(property_) - property.properties = properties - - if field.items: - items = to_property(field_name, field.items) - items.name = None # Clear the name for items - property.items = items - - if field.title is not None: - property.businessName = field.title - - if field.type is not None: - property.logicalType = to_logical_type(field.type) - property.physicalType = to_physical_type(field.config) or field.type - - if field.description is not None: - property.description = field.description - - if field.required is not None: - property.required = field.required - - if field.unique is not None: - property.unique = field.unique - - if field.classification is not None: - property.classification = field.classification - - if field.examples is not None: - property.examples = field.examples.copy() - - if field.example is not None: - property.examples = [field.example] - - if field.primaryKey is not None and field.primaryKey: - property.primaryKey = field.primaryKey - property.primaryKeyPosition = 1 - - if field.primary is not None and field.primary: - property.primaryKey = field.primary - property.primaryKeyPosition = 1 - - custom_properties = [] - if field.model_extra is not None: - for key, value in field.model_extra.items(): - custom_properties.append(CustomProperty(property=key, value=value)) - - if field.pii is not None: - custom_properties.append(CustomProperty(property="pii", value=field.pii)) - - if len(custom_properties) > 0: - property.customProperties = custom_properties - - if field.tags is not None and len(field.tags) > 0: - property.tags = field.tags - - logical_type_options = {} - if field.minLength is not None: - logical_type_options["minLength"] = field.minLength - if field.maxLength is not None: - logical_type_options["maxLength"] = field.maxLength - if field.pattern is not None: - logical_type_options["pattern"] = field.pattern - if field.minimum is not None: - logical_type_options["minimum"] = field.minimum - if field.maximum is not None: - logical_type_options["maximum"] = field.maximum - if field.exclusiveMinimum is not None: - logical_type_options["exclusiveMinimum"] = field.exclusiveMinimum - if field.exclusiveMaximum is not None: - logical_type_options["exclusiveMaximum"] = field.exclusiveMaximum - - if logical_type_options: - property.logicalTypeOptions = logical_type_options - - if field.quality is not None: - quality_list = field.quality - quality_property = to_odcs_quality_list(quality_list) - if len(quality_property) > 0: - property.quality = quality_property - - return property - - -def to_odcs_quality_list(quality_list): - quality_property = [] - for quality in quality_list: - quality_property.append(to_odcs_quality(quality)) - return quality_property - - -def to_odcs_quality(quality): - quality_obj = DataQuality(type=quality.type) - - if quality.description is not None: - quality_obj.description = quality.description - if quality.query is not None: - quality_obj.query = quality.query - # dialect is not supported in v3.0.0 - if quality.mustBe is not None: - quality_obj.mustBe = quality.mustBe - if quality.mustNotBe is not None: - quality_obj.mustNotBe = quality.mustNotBe - if quality.mustBeGreaterThan is not None: - quality_obj.mustBeGreaterThan = quality.mustBeGreaterThan - if quality.mustBeGreaterThanOrEqualTo is not None: - quality_obj.mustBeGreaterOrEqualTo = quality.mustBeGreaterThanOrEqualTo - if quality.mustBeLessThan is not None: 
- quality_obj.mustBeLessThan = quality.mustBeLessThan - if quality.mustBeLessThanOrEqualTo is not None: - quality_obj.mustBeLessOrEqualTo = quality.mustBeLessThanOrEqualTo - if quality.mustBeBetween is not None: - quality_obj.mustBeBetween = quality.mustBeBetween - if quality.mustNotBeBetween is not None: - quality_obj.mustNotBeBetween = quality.mustNotBeBetween - if quality.engine is not None: - quality_obj.engine = quality.engine - if quality.implementation is not None: - quality_obj.implementation = quality.implementation - - return quality_obj - - -def to_status(status): - """Convert the data contract status to ODCS v3 format.""" - if status is None: - return "draft" # Default to draft if no status is provided - - # Valid status values according to ODCS v3.0.1 spec - valid_statuses = ["proposed", "draft", "active", "deprecated", "retired"] - - # Convert to lowercase for comparison - status_lower = status.lower() - - # If status is already valid, return it as is - if status_lower in valid_statuses: - return status_lower - - # Default to "draft" for any non-standard status - return "draft" + Since the internal model is now ODCS, this is an identity function. + """ + return data_contract diff --git a/datacontract/export/pandas_type_converter.py b/datacontract/export/pandas_type_converter.py index 3ab84b68f..21ea272fe 100644 --- a/datacontract/export/pandas_type_converter.py +++ b/datacontract/export/pandas_type_converter.py @@ -2,39 +2,55 @@ Module for converting data contract field types to corresponding pandas data types. """ -from datacontract.model.data_contract_specification import Field +from typing import Optional +from open_data_contract_standard.model import SchemaProperty -def convert_to_pandas_type(field: Field) -> str: + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property.""" + if prop.logicalType: + return prop.logicalType + if prop.physicalType: + return prop.physicalType + return None + + +def convert_to_pandas_type(prop: SchemaProperty) -> str: """ Convert a data contract field type to the equivalent pandas data type. Parameters: ---------- - field : Field - A Field object containing metadata about the data type of the field. + prop : SchemaProperty + A SchemaProperty object containing metadata about the data type of the field. Returns: ------- str The corresponding pandas data type as a string. 
""" - field_type = field.type + field_type = _get_type(prop) + + if field_type is None: + return "object" + + field_type_lower = field_type.lower() - if field_type in ["string", "varchar", "text"]: + if field_type_lower in ["string", "varchar", "text"]: return "str" - if field_type in ["integer", "int"]: + if field_type_lower in ["integer", "int"]: return "int32" - if field_type == "long": + if field_type_lower == "long": return "int64" - if field_type == "float": + if field_type_lower == "float": return "float32" - if field_type in ["number", "decimal", "numeric", "double"]: + if field_type_lower in ["number", "decimal", "numeric", "double"]: return "float64" - if field_type == "boolean": + if field_type_lower == "boolean": return "bool" - if field_type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]: + if field_type_lower in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]: return "datetime64[ns]" - if field_type == "bytes": + if field_type_lower == "bytes": return "object" return "object" diff --git a/datacontract/export/protobuf_converter.py b/datacontract/export/protobuf_converter.py deleted file mode 100644 index 410498f7b..000000000 --- a/datacontract/export/protobuf_converter.py +++ /dev/null @@ -1,199 +0,0 @@ -from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification - - -class ProtoBufExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - # Returns a dict containing the protobuf representation. - proto = to_protobuf(data_contract) - return {"protobuf": proto} - - -def to_protobuf(data_contract_spec: DataContractSpecification) -> str: - """ - Generates a Protobuf file from the data contract specification. - Scans all models for enum fields (even if the type is "string") by checking for a "values" property. - """ - messages = "" - enum_definitions = {} - - # Iterate over all models to generate messages and collect enum definitions. - for model_name, model in data_contract_spec.models.items(): - for field_name, field in model.fields.items(): - # If the field has enum values, collect them. - if _is_enum_field(field): - enum_name = _get_enum_name(field, field_name) - enum_values = _get_enum_values(field) - if enum_values and enum_name not in enum_definitions: - enum_definitions[enum_name] = enum_values - - messages += to_protobuf_message(model_name, model.fields, getattr(model, "description", ""), 0) - messages += "\n" - - # Build header with syntax and package declarations. - header = 'syntax = "proto3";\n\n' - package = getattr(data_contract_spec, "package", "example") - header += f"package {package};\n\n" - - # Append enum definitions. - for enum_name, enum_values in enum_definitions.items(): - header += f"// Enum for {enum_name}\n" - header += f"enum {enum_name} {{\n" - # Only iterate if enum_values is a dictionary. - if isinstance(enum_values, dict): - for enum_const, value in sorted(enum_values.items(), key=lambda item: item[1]): - normalized_const = enum_const.upper().replace(" ", "_") - header += f" {normalized_const} = {value};\n" - else: - header += f" // Warning: Enum values for {enum_name} are not a dictionary\n" - header += "}\n\n" - return header + messages - - -def _is_enum_field(field) -> bool: - """ - Returns True if the field (dict or object) has a non-empty "values" property. 
- """ - if isinstance(field, dict): - return bool(field.get("values")) - return bool(getattr(field, "values", None)) - - -def _get_enum_name(field, field_name: str) -> str: - """ - Returns the enum name either from the field's "enum_name" or derived from the field name. - """ - if isinstance(field, dict): - return field.get("enum_name", _to_protobuf_message_name(field_name)) - return getattr(field, "enum_name", None) or _to_protobuf_message_name(field_name) - - -def _get_enum_values(field) -> dict: - """ - Returns the enum values from the field. - If the values are not a dictionary, attempts to extract enum attributes. - """ - if isinstance(field, dict): - values = field.get("values", {}) - else: - values = getattr(field, "values", {}) - - if not isinstance(values, dict): - # If values is a BaseModel (or similar) with a .dict() method, use it. - if hasattr(values, "dict") and callable(values.dict): - values_dict = values.dict() - return {k: v for k, v in values_dict.items() if k.isupper() and isinstance(v, int)} - else: - # Otherwise, iterate over attributes that look like enums. - return { - key: getattr(values, key) - for key in dir(values) - if key.isupper() and isinstance(getattr(values, key), int) - } - return values - - -def _to_protobuf_message_name(name: str) -> str: - """ - Returns a valid Protobuf message/enum name by capitalizing the first letter. - """ - return name[0].upper() + name[1:] if name else name - - -def to_protobuf_message(model_name: str, fields: dict, description: str, indent_level: int = 0) -> str: - """ - Generates a Protobuf message definition from the model's fields. - Handles nested messages for complex types. - """ - result = "" - if description: - result += f"{indent(indent_level)}// {description}\n" - - result += f"message {_to_protobuf_message_name(model_name)} {{\n" - number = 1 - for field_name, field in fields.items(): - # For nested objects, generate a nested message. - field_type = _get_field_type(field) - if field_type in ["object", "record", "struct"]: - nested_desc = field.get("description", "") if isinstance(field, dict) else getattr(field, "description", "") - nested_fields = field.get("fields", {}) if isinstance(field, dict) else field.fields - nested_message = to_protobuf_message(field_name, nested_fields, nested_desc, indent_level + 1) - result += nested_message + "\n" - - field_desc = field.get("description", "") if isinstance(field, dict) else getattr(field, "description", "") - result += to_protobuf_field(field_name, field, field_desc, number, indent_level + 1) + "\n" - number += 1 - - result += f"{indent(indent_level)}}}\n" - return result - - -def to_protobuf_field(field_name: str, field, description: str, number: int, indent_level: int = 0) -> str: - """ - Generates a field definition within a Protobuf message. - """ - result = "" - if description: - result += f"{indent(indent_level)}// {description}\n" - result += f"{indent(indent_level)}{_convert_type(field_name, field)} {field_name} = {number};" - return result - - -def indent(indent_level: int) -> str: - return " " * indent_level - - -def _get_field_type(field) -> str: - """ - Retrieves the field type from the field definition. - """ - if isinstance(field, dict): - return field.get("type", "").lower() - return getattr(field, "type", "").lower() - - -def _convert_type(field_name: str, field) -> str: - """ - Converts a field's type (from the data contract) to a Protobuf type. - Prioritizes enum conversion if a non-empty "values" property exists. 
- """ - # For debugging purposes - print("Converting field:", field_name) - # If the field should be treated as an enum, return its enum name. - if _is_enum_field(field): - return _get_enum_name(field, field_name) - - lower_type = _get_field_type(field) - if lower_type in ["string", "varchar", "text"]: - return "string" - if lower_type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date", "time"]: - return "string" - if lower_type in ["number", "decimal", "numeric"]: - return "double" - if lower_type in ["float", "double"]: - return lower_type - if lower_type in ["integer", "int"]: - return "int32" - if lower_type in ["long", "bigint"]: - return "int64" - if lower_type in ["boolean"]: - return "bool" - if lower_type in ["bytes"]: - return "bytes" - if lower_type in ["object", "record", "struct"]: - return _to_protobuf_message_name(field_name) - if lower_type == "array": - # Handle array types. Check for an "items" property. - items = field.get("items") if isinstance(field, dict) else getattr(field, "items", None) - if items and isinstance(items, dict) and items.get("type"): - item_type = items.get("type", "").lower() - if item_type in ["object", "record", "struct"]: - # Singularize the field name (a simple approach). - singular = field_name[:-1] if field_name.endswith("s") else field_name - return "repeated " + _to_protobuf_message_name(singular) - else: - return "repeated " + _convert_type(field_name, items) - else: - return "repeated string" - # Fallback for unrecognized types. - return "string" diff --git a/datacontract/export/protobuf_exporter.py b/datacontract/export/protobuf_exporter.py new file mode 100644 index 000000000..86dedf905 --- /dev/null +++ b/datacontract/export/protobuf_exporter.py @@ -0,0 +1,201 @@ +import sys +from typing import List, Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty + +from datacontract.export.exporter import Exporter + + +class ProtoBufExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + # Returns a dict containing the protobuf representation. + proto = to_protobuf(data_contract) + return proto + + +def _get_config_value(prop: SchemaProperty, key: str) -> Optional[str]: + """Get a custom property value.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def to_protobuf(data_contract: OpenDataContractStandard) -> str: + """ + Generates a Protobuf file from the data contract specification. + Scans all models for enum fields (even if the type is "string") by checking for a "values" property. + """ + messages = "" + enum_definitions = {} + + if data_contract.schema_ is None: + return "" + + # Iterate over all models to generate messages and collect enum definitions. + for schema_obj in data_contract.schema_: + properties = schema_obj.properties or [] + for prop in properties: + # If the field has enum values, collect them. 
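
# --- Editor's illustrative sketch (not part of the patch): how a property opts into enum
# --- generation via the "enumValues" custom property that the enum helpers defined below
# --- (_is_enum_field / _get_enum_values) look for. Names and values are hypothetical.
from open_data_contract_standard.model import CustomProperty, SchemaProperty

order_status = SchemaProperty(name="order_status")
order_status.logicalType = "string"
order_status.customProperties = [
    CustomProperty(property="enumValues", value={"PLACED": 0, "SHIPPED": 1, "CANCELLED": 2}),
]
# _is_enum_field(order_status) -> True and _get_enum_values(order_status) -> the dict above,
# so the generated .proto gains an enum block and the field uses the enum type name
# derived by _to_protobuf_message_name ("Order_status").
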
+ if _is_enum_field(prop): + enum_name = _get_enum_name(prop) + enum_values = _get_enum_values(prop) + if enum_values and enum_name not in enum_definitions: + enum_definitions[enum_name] = enum_values + + messages += to_protobuf_message(schema_obj.name, properties, schema_obj.description or "", 0) + messages += "\n" + + # Build header with syntax and package declarations. + header = 'syntax = "proto3";\n\n' + package = "example" # Default package + header += f"package {package};\n\n" + + # Append enum definitions. + for enum_name, enum_values in enum_definitions.items(): + header += f"// Enum for {enum_name}\n" + header += f"enum {enum_name} {{\n" + # Only iterate if enum_values is a dictionary. + if isinstance(enum_values, dict): + for enum_const, value in sorted(enum_values.items(), key=lambda item: item[1]): + normalized_const = enum_const.upper().replace(" ", "_") + header += f" {normalized_const} = {value};\n" + else: + header += f" // Warning: Enum values for {enum_name} are not a dictionary\n" + header += "}\n\n" + return header + messages + + +def _is_enum_field(prop: SchemaProperty) -> bool: + """ + Returns True if the field has a non-empty "enumValues" property (via customProperties). + """ + values = _get_config_value(prop, "enumValues") + return bool(values) + + +def _get_enum_name(prop: SchemaProperty) -> str: + """ + Returns the enum name either from the field's "enum_name" or derived from the field name. + """ + enum_name = _get_config_value(prop, "enum_name") + if enum_name: + return enum_name + return _to_protobuf_message_name(prop.name) + + +def _get_enum_values(prop: SchemaProperty) -> dict: + """ + Returns the enum values from the field. + """ + values = _get_config_value(prop, "enumValues") + if values and isinstance(values, dict): + return values + return {} + + +def _to_protobuf_message_name(name: str) -> str: + """ + Returns a valid Protobuf message/enum name by capitalizing the first letter. + """ + return name[0].upper() + name[1:] if name else name + + +def to_protobuf_message(model_name: str, properties: List[SchemaProperty], description: str, indent_level: int = 0) -> str: + """ + Generates a Protobuf message definition from the model's fields. + Handles nested messages for complex types. + """ + result = "" + if description: + result += f"{indent(indent_level)}// {description}\n" + + result += f"message {_to_protobuf_message_name(model_name)} {{\n" + number = 1 + for prop in properties: + # For nested objects, generate a nested message. + field_type = prop.logicalType or "" + if field_type.lower() in ["object", "record", "struct"]: + nested_desc = prop.description or "" + nested_props = prop.properties or [] + nested_message = to_protobuf_message(prop.name, nested_props, nested_desc, indent_level + 1) + result += nested_message + "\n" + + field_desc = prop.description or "" + result += to_protobuf_field(prop, field_desc, number, indent_level + 1) + "\n" + number += 1 + + result += f"{indent(indent_level)}}}\n" + return result + + +def to_protobuf_field(prop: SchemaProperty, description: str, number: int, indent_level: int = 0) -> str: + """ + Generates a field definition within a Protobuf message. 
+ """ + result = "" + if description: + result += f"{indent(indent_level)}// {description}\n" + result += f"{indent(indent_level)}{_convert_type(prop)} {prop.name} = {number};" + return result + + +def indent(indent_level: int) -> str: + return " " * indent_level + + +def _convert_type(prop: SchemaProperty) -> str: + """ + Converts a field's type (from the data contract) to a Protobuf type. + Prioritizes enum conversion if a non-empty "values" property exists. + """ + # For debugging purposes + print("Converting field:", prop.name, file=sys.stderr) + # If the field should be treated as an enum, return its enum name. + if _is_enum_field(prop): + return _get_enum_name(prop) + + field_type = prop.logicalType or "" + lower_type = field_type.lower() + + if lower_type in ["string", "varchar", "text"]: + return "string" + if lower_type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date", "time"]: + return "string" + if lower_type in ["number", "decimal", "numeric"]: + return "double" + if lower_type in ["float", "double"]: + return lower_type + if lower_type in ["integer", "int"]: + return "int32" + if lower_type in ["long", "bigint"]: + return "int64" + if lower_type in ["boolean"]: + return "bool" + if lower_type in ["bytes"]: + return "bytes" + if lower_type in ["object", "record", "struct"]: + return _to_protobuf_message_name(prop.name) + if lower_type == "array": + # Handle array types. Check for an "items" property. + if prop.items: + items_type = prop.items.logicalType or "" + if items_type.lower() in ["object", "record", "struct"]: + # Singularize the field name (a simple approach). + singular = prop.name[:-1] if prop.name.endswith("s") else prop.name + return "repeated " + _to_protobuf_message_name(singular) + else: + return "repeated " + _convert_type(prop.items) + else: + return "repeated string" + # Fallback for unrecognized types. 
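
# --- Editor's illustrative sketch (not part of the patch): the message produced by
# --- to_protobuf_message above for a minimal property list. Names are hypothetical, the
# --- expected output is indicative (indentation follows indent()), and _convert_type
# --- additionally prints a debug line to stderr per field.
from open_data_contract_standard.model import SchemaProperty

order_id = SchemaProperty(name="order_id")
order_id.logicalType = "string"
amount = SchemaProperty(name="amount")
amount.logicalType = "number"

print(to_protobuf_message("orders", [order_id, amount], "An order record", 0))
# Roughly:
# // An order record
# message Orders {
#   string order_id = 1;
#   double amount = 2;
# }
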
+ return "string" diff --git a/datacontract/export/pydantic_converter.py b/datacontract/export/pydantic_converter.py deleted file mode 100644 index 9e6779dc4..000000000 --- a/datacontract/export/pydantic_converter.py +++ /dev/null @@ -1,137 +0,0 @@ -import ast -import typing - -import datacontract.model.data_contract_specification as spec -from datacontract.export.exporter import Exporter - - -class PydanticExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - return to_pydantic_model_str(data_contract) - - -def to_pydantic_model_str(contract: spec.DataContractSpecification) -> str: - classdefs = [generate_model_class(model_name, model) for (model_name, model) in contract.models.items()] - documentation = ( - [ast.Expr(ast.Constant(contract.info.description))] if (contract.info and contract.info.description) else [] - ) - result = ast.Module( - body=[ - ast.Import( - names=[ - ast.Name("datetime", ctx=ast.Load()), - ast.Name("typing", ctx=ast.Load()), - ast.Name("pydantic", ctx=ast.Load()), - ] - ), - *documentation, - *classdefs, - ], - type_ignores=[], - ) - return ast.unparse(result) - - -def optional_of(node) -> ast.Subscript: - return ast.Subscript( - value=ast.Attribute(ast.Name(id="typing", ctx=ast.Load()), attr="Optional", ctx=ast.Load()), slice=node - ) - - -def list_of(node) -> ast.Subscript: - return ast.Subscript(value=ast.Name(id="list", ctx=ast.Load()), slice=node) - - -def product_of(nodes: list[typing.Any]) -> ast.Subscript: - return ast.Subscript( - value=ast.Attribute(value=ast.Name(id="typing", ctx=ast.Load()), attr="Product", ctx=ast.Load()), - slice=ast.Tuple(nodes, ctx=ast.Load()), - ) - - -type_annotation_type = typing.Union[ast.Name, ast.Attribute, ast.Constant, ast.Subscript] - - -def constant_field_annotation( - field_name: str, field: spec.Field -) -> tuple[type_annotation_type, typing.Optional[ast.ClassDef]]: - match field.type: - case "string" | "text" | "varchar": - return (ast.Name("str", ctx=ast.Load()), None) - case "number", "decimal", "numeric": - # Either integer or float in specification, - # so we use float. 
- return (ast.Name("float", ctx=ast.Load()), None) - case "int" | "integer" | "long" | "bigint": - return (ast.Name("int", ctx=ast.Load()), None) - case "float" | "double": - return (ast.Name("float", ctx=ast.Load()), None) - case "boolean": - return (ast.Name("bool", ctx=ast.Load()), None) - case "timestamp" | "timestamp_tz" | "timestamp_ntz": - return (ast.Attribute(value=ast.Name(id="datetime", ctx=ast.Load()), attr="datetime"), None) - case "date": - return (ast.Attribute(value=ast.Name(id="datetime", ctx=ast.Load()), attr="date"), None) - case "bytes": - return (ast.Name("bytes", ctx=ast.Load()), None) - case "null": - return (ast.Constant("None"), None) - case "array": - (annotated_type, new_class) = type_annotation(field_name, field.items) - return (list_of(annotated_type), new_class) - case "object" | "record" | "struct": - classdef = generate_field_class(field_name.capitalize(), field) - return (ast.Name(field_name.capitalize(), ctx=ast.Load()), classdef) - case _: - raise RuntimeError(f"Unsupported field type {field.type}.") - - -def type_annotation(field_name: str, field: spec.Field) -> tuple[type_annotation_type, typing.Optional[ast.ClassDef]]: - if field.required: - return constant_field_annotation(field_name, field) - else: - (annotated_type, new_classes) = constant_field_annotation(field_name, field) - return (optional_of(annotated_type), new_classes) - - -def is_simple_field(field: spec.Field) -> bool: - return field.type not in set(["object", "record", "struct"]) - - -def field_definitions(fields: dict[str, spec.Field]) -> tuple[list[ast.Expr], list[ast.ClassDef]]: - annotations = [] - classes = [] - for field_name, field in fields.items(): - (ann, new_class) = type_annotation(field_name, field) - annotations.append(ast.AnnAssign(target=ast.Name(id=field_name, ctx=ast.Store()), annotation=ann, simple=1)) - if field.description and is_simple_field(field): - annotations.append(ast.Expr(ast.Constant(field.description))) - if new_class: - classes.append(new_class) - return (annotations, classes) - - -def generate_field_class(field_name: str, field: spec.Field) -> ast.ClassDef: - assert field.type in set(["object", "record", "struct"]) - (annotated_type, new_classes) = field_definitions(field.fields) - documentation = [ast.Expr(ast.Constant(field.description))] if field.description else [] - return ast.ClassDef( - name=field_name, - bases=[ast.Attribute(value=ast.Name(id="pydantic", ctx=ast.Load()), attr="BaseModel", ctx=ast.Load())], - body=[*documentation, *new_classes, *annotated_type], - keywords=[], - decorator_list=[], - ) - - -def generate_model_class(name: str, model_definition: spec.Model) -> ast.ClassDef: - (field_assignments, nested_classes) = field_definitions(model_definition.fields) - documentation = [ast.Expr(ast.Constant(model_definition.description))] if model_definition.description else [] - result = ast.ClassDef( - name=name.capitalize(), - bases=[ast.Attribute(value=ast.Name(id="pydantic", ctx=ast.Load()), attr="BaseModel", ctx=ast.Load())], - body=[*documentation, *nested_classes, *field_assignments], - keywords=[], - decorator_list=[], - ) - return result diff --git a/datacontract/export/pydantic_exporter.py b/datacontract/export/pydantic_exporter.py new file mode 100644 index 000000000..403d6ca21 --- /dev/null +++ b/datacontract/export/pydantic_exporter.py @@ -0,0 +1,200 @@ +import ast +import typing +from typing import Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty + +from 
datacontract.export.exporter import Exporter + + +class PydanticExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + return to_pydantic_model_str(data_contract) + + +def _get_description_str(description) -> str | None: + """Extract a string from a description, handling both string and Description object.""" + if description is None: + return None + if isinstance(description, str): + return description + # It's a Description object - get purpose or other meaningful field + if hasattr(description, "purpose") and description.purpose: + return description.purpose + if hasattr(description, "usage") and description.usage: + return description.usage + return None + + +def to_pydantic_model_str(contract: OpenDataContractStandard) -> str: + classdefs = [] + if contract.schema_: + for schema_obj in contract.schema_: + classdefs.append(generate_model_class(schema_obj.name, schema_obj)) + + desc_str = _get_description_str(contract.description) + documentation = [ast.Expr(ast.Constant(desc_str))] if desc_str else [] + result = ast.Module( + body=[ + ast.Import( + names=[ + ast.Name("datetime", ctx=ast.Load()), + ast.Name("typing", ctx=ast.Load()), + ast.Name("pydantic", ctx=ast.Load()), + ast.Name("decimal", ctx=ast.Load()), + ] + ), + *documentation, + *classdefs, + ], + type_ignores=[], + ) + return ast.unparse(result) + + +def optional_of(node) -> ast.Subscript: + return ast.Subscript( + value=ast.Attribute(ast.Name(id="typing", ctx=ast.Load()), attr="Optional", ctx=ast.Load()), slice=node + ) + + +def list_of(node) -> ast.Subscript: + return ast.Subscript(value=ast.Name(id="list", ctx=ast.Load()), slice=node) + + +def product_of(nodes: list[typing.Any]) -> ast.Subscript: + return ast.Subscript( + value=ast.Attribute(value=ast.Name(id="typing", ctx=ast.Load()), attr="Product", ctx=ast.Load()), + slice=ast.Tuple(nodes, ctx=ast.Load()), + ) + + +type_annotation_type = typing.Union[ast.Name, ast.Attribute, ast.Constant, ast.Subscript] + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the logical type from a schema property.""" + return prop.logicalType + + +def _get_physical_type(prop: SchemaProperty) -> Optional[str]: + """Get the physical type from a schema property.""" + return prop.physicalType.lower() if prop.physicalType else None + + +def constant_field_annotation( + field_name: str, prop: SchemaProperty +) -> tuple[type_annotation_type, typing.Optional[ast.ClassDef]]: + prop_type = _get_type(prop) + physical_type = _get_physical_type(prop) + + match prop_type: + case "string": + return (ast.Name("str", ctx=ast.Load()), None) + case "number": + if physical_type == "decimal": + return (ast.Attribute(value=ast.Name(id="decimal", ctx=ast.Load()), attr="Decimal"), None) + # Either integer or float in specification, + # so we use float. 
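
# --- Editor's illustrative sketch (not part of the patch): what to_pydantic_model_str
# --- emits for a minimal ODCS schema. Names and values are hypothetical; the function is
# --- assumed to be in scope from this module, and the expected output is indicative.
from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty

contract = OpenDataContractStandard(
    apiVersion="v3.0.1", kind="DataContract", id="orders", name="Orders", version="1.0.0", status="active"
)
order_id = SchemaProperty(name="order_id")
order_id.logicalType = "string"
order_id.required = True
amount = SchemaProperty(name="amount")
amount.logicalType = "number"
schema_obj = SchemaObject(name="orders", physicalName="orders", logicalType="object")
schema_obj.properties = [order_id, amount]
contract.schema_ = [schema_obj]

print(to_pydantic_model_str(contract))
# Roughly:
# import datetime, typing, pydantic, decimal
# class Orders(pydantic.BaseModel):
#     order_id: str
#     amount: typing.Optional[float]
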
+ return (ast.Name("float", ctx=ast.Load()), None) + case "integer": + return (ast.Name("int", ctx=ast.Load()), None) + case "boolean": + return (ast.Name("bool", ctx=ast.Load()), None) + case "date": + return (ast.Attribute(value=ast.Name(id="datetime", ctx=ast.Load()), attr="datetime"), None) + case "array": + if prop.items: + (annotated_type, new_class) = type_annotation(field_name, prop.items) + return (list_of(annotated_type), new_class) + return (list_of(ast.Name("typing.Any", ctx=ast.Load())), None) + case "object": + classdef = generate_field_class(field_name.capitalize(), prop) + return (ast.Name(field_name.capitalize(), ctx=ast.Load()), classdef) + case _: + # Check physical type for more specific mappings + if physical_type: + if physical_type in ["text", "varchar", "char", "nvarchar"]: + return (ast.Name("str", ctx=ast.Load()), None) + elif physical_type in ["int", "integer", "int32"]: + return (ast.Name("int", ctx=ast.Load()), None) + elif physical_type in ["long", "bigint", "int64"]: + return (ast.Name("int", ctx=ast.Load()), None) + elif physical_type in ["float", "real", "float32"]: + return (ast.Name("float", ctx=ast.Load()), None) + elif physical_type in ["double", "float64"]: + return (ast.Name("float", ctx=ast.Load()), None) + elif physical_type in ["numeric", "number"]: + return (ast.Name("float", ctx=ast.Load()), None) + elif physical_type == "decimal": + return (ast.Attribute(value=ast.Name(id="decimal", ctx=ast.Load()), attr="Decimal"), None) + elif physical_type in ["timestamp", "datetime", "timestamp_tz", "timestamp_ntz"]: + return (ast.Attribute(value=ast.Name(id="datetime", ctx=ast.Load()), attr="datetime"), None) + elif physical_type == "date": + return (ast.Attribute(value=ast.Name(id="datetime", ctx=ast.Load()), attr="date"), None) + elif physical_type in ["bytes", "binary", "bytea"]: + return (ast.Name("bytes", ctx=ast.Load()), None) + elif physical_type == "null": + return (ast.Constant("None"), None) + elif physical_type in ["record", "struct"]: + classdef = generate_field_class(field_name.capitalize(), prop) + return (ast.Name(field_name.capitalize(), ctx=ast.Load()), classdef) + # Default to string + return (ast.Name("str", ctx=ast.Load()), None) + + +def type_annotation( + field_name: str, prop: SchemaProperty +) -> tuple[type_annotation_type, typing.Optional[ast.ClassDef]]: + if prop.required: + return constant_field_annotation(field_name, prop) + else: + (annotated_type, new_classes) = constant_field_annotation(field_name, prop) + return (optional_of(annotated_type), new_classes) + + +def is_simple_field(prop: SchemaProperty) -> bool: + prop_type = _get_type(prop) or "" + physical_type = (prop.physicalType or "").lower() + return prop_type not in {"object"} and physical_type not in {"record", "struct"} + + +def field_definitions(properties: list[SchemaProperty]) -> tuple[list[ast.Expr], list[ast.ClassDef]]: + annotations = [] + classes = [] + for prop in properties: + (ann, new_class) = type_annotation(prop.name, prop) + annotations.append(ast.AnnAssign(target=ast.Name(id=prop.name, ctx=ast.Store()), annotation=ann, simple=1)) + if prop.description and is_simple_field(prop): + annotations.append(ast.Expr(ast.Constant(prop.description))) + if new_class: + classes.append(new_class) + return (annotations, classes) + + +def generate_field_class(field_name: str, prop: SchemaProperty) -> ast.ClassDef: + prop_type = _get_type(prop) or "" + physical_type = (prop.physicalType or "").lower() + assert prop_type == "object" or physical_type in {"record", 
"struct"} + (annotated_type, new_classes) = field_definitions(prop.properties or []) + documentation = [ast.Expr(ast.Constant(prop.description))] if prop.description else [] + return ast.ClassDef( + name=field_name, + bases=[ast.Attribute(value=ast.Name(id="pydantic", ctx=ast.Load()), attr="BaseModel", ctx=ast.Load())], + body=[*documentation, *new_classes, *annotated_type], + keywords=[], + decorator_list=[], + ) + + +def generate_model_class(name: str, schema_obj: SchemaObject) -> ast.ClassDef: + (field_assignments, nested_classes) = field_definitions(schema_obj.properties or []) + documentation = [ast.Expr(ast.Constant(schema_obj.description))] if schema_obj.description else [] + result = ast.ClassDef( + name=name.capitalize(), + bases=[ast.Attribute(value=ast.Name(id="pydantic", ctx=ast.Load()), attr="BaseModel", ctx=ast.Load())], + body=[*documentation, *nested_classes, *field_assignments], + keywords=[], + decorator_list=[], + ) + return result diff --git a/datacontract/export/rdf_converter.py b/datacontract/export/rdf_converter.py deleted file mode 100644 index 75fa240f2..000000000 --- a/datacontract/export/rdf_converter.py +++ /dev/null @@ -1,180 +0,0 @@ -from pydantic import BaseModel -from rdflib import RDF, BNode, Graph, Literal, Namespace, URIRef - -from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification - - -class RdfExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - self.dict_args = export_args - rdf_base = self.dict_args.get("rdf_base") - return to_rdf_n3(data_contract_spec=data_contract, base=rdf_base) - - -def is_literal(property_name): - return property_name in [ - "dataContractSpecification", - "title", - "version", - "description", - "name", - "url", - "type", - "location", - "format", - "delimiter", - "usage", - "limitations", - "billing", - "noticePeriod", - "required", - "unique", - "minLength", - "maxLength", - "example", - "pii", - "classification", - "data", - "enum", - "minimum", - "maximum", - "patterns", - ] - - -def is_uriref(property_name): - return property_name in ["model", "domain", "owner"] - - -def to_rdf_n3(data_contract_spec: DataContractSpecification, base) -> str: - return to_rdf(data_contract_spec, base).serialize(format="n3") - - -def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph: - if base is not None: - g = Graph(base=base) - else: - g = Graph(base=Namespace("")) - - dc = Namespace("https://datacontract.com/DataContractSpecification/1.2.0/") - dcx = Namespace("https://datacontract.com/DataContractSpecification/1.2.0/Extension/") - - g.bind("dc", dc) - g.bind("dcx", dcx) - - this_contract = URIRef(data_contract_spec.id) - - g.add((this_contract, dc.dataContractSpecification, Literal(data_contract_spec.dataContractSpecification))) - g.add((this_contract, dc.id, Literal(data_contract_spec.id))) - g.add((this_contract, RDF.type, URIRef(dc + "DataContract"))) - - add_info(contract=this_contract, info=data_contract_spec.info, graph=g, dc=dc, dcx=dcx) - - if data_contract_spec.terms is not None: - add_terms(contract=this_contract, terms=data_contract_spec.terms, graph=g, dc=dc, dcx=dcx) - - for server_name, server in data_contract_spec.servers.items(): - add_server(contract=this_contract, server=server, server_name=server_name, graph=g, dc=dc, dcx=dcx) - - for model_name, model in data_contract_spec.models.items(): - add_model(contract=this_contract, model=model, model_name=model_name, graph=g, 
dc=dc, dcx=dcx) - - for example in data_contract_spec.examples: - add_example(contract=this_contract, example=example, graph=g, dc=dc, dcx=dcx) - - g.commit() - g.close() - - return g - - -def add_example(contract, example, graph, dc, dcx): - an_example = BNode() - graph.add((contract, dc["example"], an_example)) - graph.add((an_example, RDF.type, URIRef(dc + "Example"))) - for example_property in example.model_fields: - add_triple(sub=an_example, pred=example_property, obj=example, graph=graph, dc=dc, dcx=dcx) - - -def add_triple(sub, pred, obj, graph, dc, dcx): - if pred == "ref": - pass - elif isinstance(getattr(obj, pred), list): - for item in getattr(obj, pred): - add_predicate(sub=sub, pred=pred, obj=item, graph=graph, dc=dc, dcx=dcx) - elif isinstance(getattr(obj, pred), dict): - pass - else: - add_predicate(sub=sub, pred=pred, obj=obj, graph=graph, dc=dc, dcx=dcx) - - -def add_model(contract, model, model_name, graph, dc, dcx): - a_model = URIRef(model_name) - graph.add((contract, dc["model"], a_model)) - graph.add((a_model, dc.description, Literal(model.description))) - graph.add((a_model, RDF.type, URIRef(dc + "Model"))) - for field_name, field in model.fields.items(): - a_field = BNode() - graph.add((a_model, dc["field"], a_field)) - graph.add((a_field, RDF.type, URIRef(dc + "Field"))) - graph.add((a_field, dc["name"], Literal(field_name))) - for field_property in field.model_fields: - add_triple(sub=a_field, pred=field_property, obj=field, graph=graph, dc=dc, dcx=dcx) - - -def add_server(contract, server, server_name, graph, dc, dcx): - a_server = URIRef(server_name) - graph.add((contract, dc.server, a_server)) - graph.add((a_server, RDF.type, URIRef(dc + "Server"))) - for server_property_name in server.model_fields: - add_triple(sub=a_server, pred=server_property_name, obj=server, graph=graph, dc=dc, dcx=dcx) - - -def add_terms(contract, terms, graph, dc, dcx): - bnode_terms = BNode() - graph.add((contract, dc.terms, bnode_terms)) - graph.add((bnode_terms, RDF.type, URIRef(dc + "Terms"))) - for term_name in terms.model_fields: - add_triple(sub=bnode_terms, pred=term_name, obj=terms, graph=graph, dc=dc, dcx=dcx) - - -def add_info(contract, info, graph, dc, dcx): - bnode_info = BNode() - graph.add((contract, dc.info, bnode_info)) - graph.add((bnode_info, RDF.type, URIRef(dc + "Info"))) - graph.add((bnode_info, dc.title, Literal(info.title))) - graph.add((bnode_info, dc.description, Literal(info.description))) - graph.add((bnode_info, dc.version, Literal(info.version))) - - # add owner - owner = Literal(info.owner) - graph.add((bnode_info, dc.owner, owner)) - - # add contact - contact = BNode() - graph.add((bnode_info, dc.contact, contact)) - graph.add((contact, RDF.type, URIRef(dc + "Contact"))) - for contact_property in info.contact.model_fields: - add_triple(sub=contact, pred=contact_property, obj=info.contact, graph=graph, dc=dc, dcx=dcx) - - -def add_predicate(sub, pred, obj, graph, dc, dcx): - if isinstance(obj, BaseModel): - if getattr(obj, pred) is not None: - if is_literal(pred): - graph.add((sub, dc[pred], Literal(getattr(obj, pred)))) - elif is_uriref(pred): - graph.add((sub, dc[pred], URIRef(getattr(obj, pred)))) - else: - # treat it as an extension - graph.add((sub, dcx[pred], Literal(getattr(obj, pred)))) - else: - # assume primitive - if is_literal(pred): - graph.add((sub, dc[pred], Literal(obj))) - elif is_uriref(pred): - graph.add((sub, dc[pred], URIRef(obj))) - else: - # treat it as an extension - graph.add((sub, dcx[pred], Literal(obj))) diff --git 
a/datacontract/export/rdf_exporter.py b/datacontract/export/rdf_exporter.py new file mode 100644 index 000000000..5db446ccb --- /dev/null +++ b/datacontract/export/rdf_exporter.py @@ -0,0 +1,169 @@ +from open_data_contract_standard.model import OpenDataContractStandard +from pydantic import BaseModel +from rdflib import RDF, BNode, Graph, Literal, Namespace, URIRef + +from datacontract.export.exporter import Exporter + + +class RdfExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict: + self.dict_args = export_args + rdf_base = self.dict_args.get("rdf_base") + return to_rdf_n3(data_contract=data_contract, base=rdf_base) + + +def is_literal(property_name): + return property_name in [ + "apiVersion", + "kind", + "id", + "name", + "version", + "description", + "status", + "domain", + "dataProduct", + "tenant", + "type", + "location", + "format", + "delimiter", + "required", + "unique", + "minLength", + "maxLength", + "example", + "pii", + "classification", + "data", + "enum", + "minimum", + "maximum", + "patterns", + "logicalType", + "physicalType", + ] + + +def is_uriref(property_name): + return property_name in ["model", "domain", "owner", "team"] + + +def to_rdf_n3(data_contract: OpenDataContractStandard, base) -> str: + return to_rdf(data_contract, base).serialize(format="n3") + + +def to_rdf(data_contract: OpenDataContractStandard, base) -> Graph: + if base is not None: + g = Graph(base=base) + else: + g = Graph(base=Namespace("")) + + # Use ODCS namespace + odcs = Namespace("https://github.com/bitol-io/open-data-contract-standard/") + odcsx = Namespace("https://github.com/bitol-io/open-data-contract-standard/extension/") + + g.bind("odcs", odcs) + g.bind("odcsx", odcsx) + + this_contract = URIRef(data_contract.id) + + g.add((this_contract, odcs.apiVersion, Literal(data_contract.apiVersion))) + g.add((this_contract, odcs.kind, Literal(data_contract.kind))) + g.add((this_contract, odcs.id, Literal(data_contract.id))) + g.add((this_contract, RDF.type, URIRef(odcs + "DataContract"))) + + add_basic_info(contract=this_contract, data_contract=data_contract, graph=g, odcs=odcs, odcsx=odcsx) + + # Add servers + if data_contract.servers: + for server in data_contract.servers: + add_server(contract=this_contract, server=server, graph=g, odcs=odcs, odcsx=odcsx) + + # Add schema + if data_contract.schema_: + for schema_obj in data_contract.schema_: + add_schema(contract=this_contract, schema_obj=schema_obj, graph=g, odcs=odcs, odcsx=odcsx) + + g.commit() + g.close() + + return g + + +def add_basic_info(contract, data_contract: OpenDataContractStandard, graph, odcs, odcsx): + bnode_info = BNode() + graph.add((contract, odcs.info, bnode_info)) + graph.add((bnode_info, RDF.type, URIRef(odcs + "Info"))) + + if data_contract.name: + graph.add((bnode_info, odcs.name, Literal(data_contract.name))) + if data_contract.description: + desc = data_contract.description + if hasattr(desc, 'purpose') and desc.purpose: + graph.add((bnode_info, odcs.description, Literal(desc.purpose))) + elif isinstance(desc, str): + graph.add((bnode_info, odcs.description, Literal(desc))) + if data_contract.version: + graph.add((bnode_info, odcs.version, Literal(data_contract.version))) + + # Add team/owner + if data_contract.team: + graph.add((bnode_info, odcs.team, Literal(data_contract.team.name))) + + +def add_server(contract, server, graph, odcs, odcsx): + a_server = URIRef(server.server or "default") + graph.add((contract, odcs.server, a_server)) + 
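
For orientation, a minimal usage sketch of the new ODCS-based RDF export (not part of this diff): the file path and base IRI are illustrative, and loading via OpenDataContractStandard.model_validate is an assumption, since the CLI normally resolves contracts itself.

import yaml
from open_data_contract_standard.model import OpenDataContractStandard

from datacontract.export.rdf_exporter import to_rdf_n3

# Parse an ODCS YAML document into the pydantic model (assumed entry point for this sketch)
with open("datacontract.odcs.yaml") as f:  # hypothetical path
    odcs = OpenDataContractStandard.model_validate(yaml.safe_load(f))

# Serialize to N3, as RdfExporter.export() does above; the base IRI is illustrative
print(to_rdf_n3(data_contract=odcs, base="https://example.com/contracts/"))
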
graph.add((a_server, RDF.type, URIRef(odcs + "Server"))) + for server_property_name in server.model_fields: + add_triple(sub=a_server, pred=server_property_name, obj=server, graph=graph, dc=odcs, dcx=odcsx) + + +def add_schema(contract, schema_obj, graph, odcs, odcsx): + a_model = URIRef(schema_obj.name) + graph.add((contract, odcs.schema_, a_model)) + graph.add((a_model, odcs.description, Literal(schema_obj.description or ""))) + graph.add((a_model, RDF.type, URIRef(odcs + "Schema"))) + + if schema_obj.properties: + for prop in schema_obj.properties: + a_property = BNode() + graph.add((a_model, odcs["property"], a_property)) + graph.add((a_property, RDF.type, URIRef(odcs + "Property"))) + graph.add((a_property, odcs["name"], Literal(prop.name))) + for field_property in prop.model_fields: + add_triple(sub=a_property, pred=field_property, obj=prop, graph=graph, dc=odcs, dcx=odcsx) + + +def add_triple(sub, pred, obj, graph, dc, dcx): + if pred == "ref": + pass + elif isinstance(getattr(obj, pred, None), list): + for item in getattr(obj, pred): + add_predicate(sub=sub, pred=pred, obj=item, graph=graph, dc=dc, dcx=dcx) + elif isinstance(getattr(obj, pred, None), dict): + pass + else: + add_predicate(sub=sub, pred=pred, obj=obj, graph=graph, dc=dc, dcx=dcx) + + +def add_predicate(sub, pred, obj, graph, dc, dcx): + if isinstance(obj, BaseModel): + if getattr(obj, pred, None) is not None: + if is_literal(pred): + graph.add((sub, dc[pred], Literal(getattr(obj, pred)))) + elif is_uriref(pred): + graph.add((sub, dc[pred], URIRef(getattr(obj, pred)))) + else: + # treat it as an extension + graph.add((sub, dcx[pred], Literal(getattr(obj, pred)))) + else: + # assume primitive + if is_literal(pred): + graph.add((sub, dc[pred], Literal(obj))) + elif is_uriref(pred): + graph.add((sub, dc[pred], URIRef(obj))) + else: + # treat it as an extension + graph.add((sub, dcx[pred], Literal(obj))) diff --git a/datacontract/export/sodacl_converter.py b/datacontract/export/sodacl_exporter.py similarity index 63% rename from datacontract/export/sodacl_converter.py rename to datacontract/export/sodacl_exporter.py index 5e1fa9a03..61d72f5ca 100644 --- a/datacontract/export/sodacl_converter.py +++ b/datacontract/export/sodacl_exporter.py @@ -1,16 +1,16 @@ import yaml +from open_data_contract_standard.model import OpenDataContractStandard, Server from datacontract.engines.data_contract_checks import create_checks from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification, Server from datacontract.model.run import Run class SodaExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> str: + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> str: run = Run.create_run() - server = get_server(data_contract, server) - run.checks.extend(create_checks(data_contract, server)) + found_server = get_server(data_contract, server) + run.checks.extend(create_checks(data_contract, found_server)) return to_sodacl_yaml(run) @@ -32,7 +32,7 @@ def to_sodacl_yaml(run: Run) -> str: return yaml.dump(sodacl_dict) -def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None: - if server_name is None: +def get_server(data_contract: OpenDataContractStandard, server_name: str = None) -> Server | None: + if server_name is None or data_contract.servers is None: return None - return data_contract_specification.servers.get(server_name) + return 
next((s for s in data_contract.servers if s.server == server_name), None) diff --git a/datacontract/export/spark_converter.py b/datacontract/export/spark_converter.py deleted file mode 100644 index 6a55d2686..000000000 --- a/datacontract/export/spark_converter.py +++ /dev/null @@ -1,245 +0,0 @@ -import json - -from pyspark.sql import types - -from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Field, - Model, -) - - -class SparkExporter(Exporter): - """ - Exporter class for exporting data contracts to Spark schemas. - """ - - def export( - self, - data_contract: DataContractSpecification, - model, - server, - sql_server_type, - export_args, - ) -> dict[str, types.StructType]: - """ - Export the given data contract to Spark schemas. - - Args: - data_contract (DataContractSpecification): The data contract specification. - model: Not used in this implementation. - server: Not used in this implementation. - sql_server_type: Not used in this implementation. - export_args: Additional arguments for export. - - Returns: - dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas. - """ - return to_spark(data_contract) - - -def to_spark(contract: DataContractSpecification) -> str: - """ - Converts a DataContractSpecification into a Spark schema string. - - Args: - contract (DataContractSpecification): The data contract specification containing models. - - Returns: - str: A string representation of the Spark schema for each model in the contract. - """ - return "\n\n".join( - f"{model_name} = {print_schema(to_spark_schema(model))}" for model_name, model in contract.models.items() - ) - - -def to_spark_dict(contract: DataContractSpecification) -> dict[str, types.StructType]: - """ - Convert a data contract specification to Spark schemas. - - Args: - contract (DataContractSpecification): The data contract specification. - - Returns: - dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas. - """ - return {model_name: to_spark_schema(model) for model_name, model in contract.models.items()} - - -def to_spark_schema(model: Model) -> types.StructType: - """ - Convert a model to a Spark schema. - - Args: - model (Model): The model to convert. - - Returns: - types.StructType: The corresponding Spark schema. - """ - return to_struct_type(model.fields) - - -def to_struct_type(fields: dict[str, Field]) -> types.StructType: - """ - Convert a dictionary of fields to a Spark StructType. - - Args: - fields (dict[str, Field]): The fields to convert. - - Returns: - types.StructType: The corresponding Spark StructType. - """ - struct_fields = [to_struct_field(field, field_name) for field_name, field in fields.items()] - return types.StructType(struct_fields) - - -def to_struct_field(field: Field, field_name: str) -> types.StructField: - """ - Convert a field to a Spark StructField. - - Args: - field (Field): The field to convert. - field_name (str): The name of the field. - - Returns: - types.StructField: The corresponding Spark StructField. - """ - data_type = to_spark_data_type(field) - metadata = to_spark_metadata(field) - return types.StructField(name=field_name, dataType=data_type, nullable=not field.required, metadata=metadata) - - -def to_spark_data_type(field: Field) -> types.DataType: - """ - Convert a field to a Spark DataType. - - Args: - field (Field): The field to convert. 
- - Returns: - types.DataType: The corresponding Spark DataType. - """ - field_type = field.type - if field_type is None or field_type in ["null"]: - return types.NullType() - if field_type == "array": - return types.ArrayType(to_spark_data_type(field.items)) - if field_type in ["object", "record", "struct"]: - return types.StructType(to_struct_type(field.fields)) - if field_type == "map": - return types.MapType(to_spark_data_type(field.keys), to_spark_data_type(field.values)) - if field_type == "variant": - return types.VariantType() - if field_type in ["string", "varchar", "text"]: - return types.StringType() - if field_type in ["number", "decimal", "numeric"]: - precision = field.precision if field.precision is not None else 38 - scale = field.scale if field.scale is not None else 0 - return types.DecimalType(precision=precision, scale=scale) - if field_type in ["integer", "int"]: - return types.IntegerType() - if field_type == "long": - return types.LongType() - if field_type == "float": - return types.FloatType() - if field_type == "double": - return types.DoubleType() - if field_type == "boolean": - return types.BooleanType() - if field_type in ["timestamp", "timestamp_tz"]: - return types.TimestampType() - if field_type == "timestamp_ntz": - return types.TimestampNTZType() - if field_type == "date": - return types.DateType() - if field_type == "bytes": - return types.BinaryType() - return types.StringType() # default if no condition is met - - -def to_spark_metadata(field: Field) -> dict[str, str]: - """ - Convert a field to a Spark metadata dictonary. - - Args: - field (Field): The field to convert. - - Returns: - dict: dictionary that can be supplied to Spark as metadata for a StructField - """ - - metadata = {} - if field.description: - metadata["comment"] = field.description - - return metadata - - -def print_schema(dtype: types.DataType) -> str: - """ - Converts a PySpark DataType schema to its equivalent code representation. - - Args: - dtype (types.DataType): The PySpark DataType schema to be converted. - - Returns: - str: The code representation of the PySpark DataType schema. - """ - - def indent(text: str, level: int) -> str: - """ - Indents each line of the given text by a specified number of levels. - - Args: - text (str): The text to be indented. - level (int): The number of indentation levels. - - Returns: - str: The indented text. - """ - return "\n".join([f"{' ' * level}{line}" for line in text.split("\n")]) - - def repr_column(column: types.StructField) -> str: - """ - Converts a PySpark StructField to its code representation. - - Args: - column (types.StructField): The StructField to be converted. - - Returns: - str: The code representation of the StructField. - """ - name = f'"{column.name}"' - data_type = indent(print_schema(column.dataType), 1) - nullable = indent(f"{column.nullable}", 1) - if column.metadata: - metadata = indent(f"{json.dumps(column.metadata)}", 1) - return f"StructField({name},\n{data_type},\n{nullable},\n{metadata}\n)" - else: - return f"StructField({name},\n{data_type},\n{nullable}\n)" - - def format_struct_type(struct_type: types.StructType) -> str: - """ - Converts a PySpark StructType to its code representation. - - Args: - struct_type (types.StructType): The StructType to be converted. - - Returns: - str: The code representation of the StructType. 
- """ - fields = ",\n".join([indent(repr_column(field), 1) for field in struct_type.fields]) - return f"StructType([\n{fields}\n])" - - if isinstance(dtype, types.StructType): - return format_struct_type(dtype) - elif isinstance(dtype, types.ArrayType): - return f"ArrayType({print_schema(dtype.elementType)})" - elif isinstance(dtype, types.MapType): - return f"MapType(\n{indent(print_schema(dtype.keyType), 1)}, {print_schema(dtype.valueType)})" - elif isinstance(dtype, types.DecimalType): - return f"DecimalType({dtype.precision}, {dtype.scale})" - else: - dtype_str = str(dtype) - return dtype_str if dtype_str.endswith("()") else f"{dtype_str}()" diff --git a/datacontract/export/spark_exporter.py b/datacontract/export/spark_exporter.py new file mode 100644 index 000000000..c76fada1e --- /dev/null +++ b/datacontract/export/spark_exporter.py @@ -0,0 +1,372 @@ +import json +import re +from typing import List, Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty +from pyspark.sql import types + +from datacontract.export.exporter import Exporter + + +class SparkExporter(Exporter): + """ + Exporter class for exporting data contracts to Spark schemas. + """ + + def export( + self, + data_contract: OpenDataContractStandard, + schema_name, + server, + sql_server_type, + export_args, + ) -> dict[str, types.StructType]: + """ + Export the given data contract to Spark schemas. + + Args: + data_contract (OpenDataContractStandard): The data contract specification. + schema_name: The name of the schema to export, or 'all' for all schemas. + server: Not used in this implementation. + sql_server_type: Not used in this implementation. + export_args: Additional arguments for export. + + Returns: + dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas. + """ + return to_spark(data_contract) + + +def to_spark(contract: OpenDataContractStandard) -> str: + """ + Converts an OpenDataContractStandard into a Spark schema string. + + Args: + contract (OpenDataContractStandard): The data contract specification containing models. + + Returns: + str: A string representation of the Spark schema for each model in the contract. + """ + result = [] + if contract.schema_: + for schema_obj in contract.schema_: + result.append(f"{schema_obj.name} = {print_schema(to_spark_schema(schema_obj))}") + return "\n\n".join(result) + + +def to_spark_dict(contract: OpenDataContractStandard) -> dict[str, types.StructType]: + """ + Convert a data contract specification to Spark schemas. + + Args: + contract (OpenDataContractStandard): The data contract specification. + + Returns: + dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas. + """ + result = {} + if contract.schema_: + for schema_obj in contract.schema_: + result[schema_obj.name] = to_spark_schema(schema_obj) + return result + + +def to_spark_schema(schema_obj: SchemaObject) -> types.StructType: + """ + Convert a schema object to a Spark schema. + + Args: + schema_obj (SchemaObject): The schema object to convert. + + Returns: + types.StructType: The corresponding Spark schema. + """ + return to_struct_type(schema_obj.properties or []) + + +def to_struct_type(properties: List[SchemaProperty]) -> types.StructType: + """ + Convert a list of properties to a Spark StructType. + + Args: + properties (List[SchemaProperty]): The properties to convert. + + Returns: + types.StructType: The corresponding Spark StructType. 
+ """ + struct_fields = [to_struct_field(prop) for prop in properties] + return types.StructType(struct_fields) + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the logical type from a schema property.""" + return prop.logicalType + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def _get_custom_property_value(prop: SchemaProperty, key: str) -> Optional[str]: + """Get a custom property value.""" + if prop.customProperties is None: + return None + for cp in prop.customProperties: + if cp.property == key: + return cp.value + return None + + +def _parse_decimal_precision_scale(physical_type: str) -> tuple[Optional[int], Optional[int]]: + """Parse precision and scale from physicalType like 'decimal(10,2)' or 'numeric(18,4)'.""" + match = re.match(r"(?:decimal|numeric)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", physical_type, re.IGNORECASE) + if match: + return int(match.group(1)), int(match.group(2)) + return None, None + + +def _get_decimal_type(prop: SchemaProperty) -> types.DecimalType: + """Get DecimalType: first from customProperties, then parse from physicalType, else Spark defaults.""" + # First check customProperties + precision_str = _get_custom_property_value(prop, "precision") + scale_str = _get_custom_property_value(prop, "scale") + if precision_str is not None or scale_str is not None: + precision = int(precision_str) if precision_str else types.DecimalType().precision + scale = int(scale_str) if scale_str else types.DecimalType().scale + return types.DecimalType(precision=precision, scale=scale) + + # Fallback: parse from physicalType + if prop.physicalType: + precision, scale = _parse_decimal_precision_scale(prop.physicalType) + if precision is not None: + return types.DecimalType(precision=precision, scale=scale if scale is not None else 0) + + # Use Spark defaults + return types.DecimalType() + + +def _logical_type_to_spark_type(logical_type: str) -> types.DataType: + """Convert a logical type string to a Spark DataType.""" + if logical_type is None: + return types.StringType() + lt = logical_type.lower() + if lt == "string": + return types.StringType() + if lt == "integer": + return types.LongType() + if lt == "number": + return types.DoubleType() + if lt == "boolean": + return types.BooleanType() + if lt == "date": + return types.DateType() + if lt == "timestamp": + return types.TimestampType() + if lt == "object": + return types.StructType([]) + return types.StringType() + + +def to_struct_field(prop: SchemaProperty) -> types.StructField: + """ + Convert a property to a Spark StructField. + + Args: + prop (SchemaProperty): The property to convert. + + Returns: + types.StructField: The corresponding Spark StructField. + """ + data_type = to_spark_data_type(prop) + metadata = to_spark_metadata(prop) + return types.StructField(name=prop.name, dataType=data_type, nullable=not prop.required, metadata=metadata) + + +def to_spark_data_type(prop: SchemaProperty) -> types.DataType: + """ + Convert a property to a Spark DataType. + + Args: + prop (SchemaProperty): The property to convert. + + Returns: + types.DataType: The corresponding Spark DataType. 
+ """ + logical_type = _get_type(prop) + physical_type = prop.physicalType.lower() if prop.physicalType else None + + # Check for null type + if logical_type is None and physical_type is None: + return types.NullType() + if physical_type == "null": + return types.NullType() + + # Handle array type + if logical_type == "array": + if prop.items: + return types.ArrayType(to_spark_data_type(prop.items)) + return types.ArrayType(types.StringType()) + + # Handle map type (check physical type) - MUST be before object/struct check + if physical_type == "map": + # Get key type from customProperties, default to string + key_type = types.StringType() + value_type = types.StringType() + + # Check for mapKeyType and mapValueType in customProperties + map_key_type = _get_custom_property_value(prop, "mapKeyType") + map_value_type = _get_custom_property_value(prop, "mapValueType") + + if map_key_type: + key_type = _logical_type_to_spark_type(map_key_type) + + # If map has struct values with properties, use them + if prop.properties: + value_type = to_struct_type(prop.properties) + elif map_value_type: + value_type = _logical_type_to_spark_type(map_value_type) + + return types.MapType(key_type, value_type) + + # Handle object/struct type + if logical_type == "object" or physical_type in ["object", "record", "struct"]: + if prop.properties: + return to_struct_type(prop.properties) + return types.StructType([]) + + # Handle variant type + if physical_type == "variant": + return types.VariantType() + + # Check physical type first for specific SQL types + if physical_type: + if physical_type in ["string", "varchar", "text", "char", "nvarchar"]: + return types.StringType() + if physical_type in ["decimal", "numeric"] or physical_type.startswith(("decimal(", "numeric(")): + return _get_decimal_type(prop) + if physical_type in ["integer", "int", "int32"]: + return types.IntegerType() + if physical_type in ["long", "bigint", "int64"]: + return types.LongType() + if physical_type in ["float", "real", "float32"]: + return types.FloatType() + if physical_type in ["double", "float64"]: + return types.DoubleType() + if physical_type in ["boolean", "bool"]: + return types.BooleanType() + if physical_type in ["timestamp", "timestamp_tz"]: + return types.TimestampType() + if physical_type == "timestamp_ntz": + return types.TimestampNTZType() + if physical_type == "date": + return types.DateType() + if physical_type in ["bytes", "binary", "bytea"]: + return types.BinaryType() + + # Fall back to logical type + match logical_type: + case "string": + return types.StringType() + case "number": + return _get_decimal_type(prop) + case "integer": + return types.LongType() + case "boolean": + return types.BooleanType() + case "date": + return types.DateType() + case "timestamp": + return types.TimestampType() + case _: + return types.StringType() # default if no condition is met + + +def to_spark_metadata(prop: SchemaProperty) -> dict[str, str]: + """ + Convert a property to a Spark metadata dictionary. + + Args: + prop (SchemaProperty): The property to convert. + + Returns: + dict: dictionary that can be supplied to Spark as metadata for a StructField + """ + metadata = {} + if prop.description: + metadata["comment"] = prop.description + + return metadata + + +def print_schema(dtype: types.DataType) -> str: + """ + Converts a PySpark DataType schema to its equivalent code representation. + + Args: + dtype (types.DataType): The PySpark DataType schema to be converted. 
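
An illustrative sketch of how the Spark exporter above maps ODCS properties to Spark types (not part of this diff; it assumes pyspark is installed and the property values are made up):

from open_data_contract_standard.model import SchemaObject, SchemaProperty

from datacontract.export.spark_exporter import to_spark_schema

orders = SchemaObject(
    name="orders",
    properties=[
        SchemaProperty(name="order_id", logicalType="string", physicalType="varchar", required=True),
        SchemaProperty(name="amount", logicalType="number", physicalType="decimal(10,2)"),
    ],
)

# Expected per the mapping above: order_id -> non-nullable StringType, amount -> DecimalType(10, 2)
print(to_spark_schema(orders))
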
+ + Returns: + str: The code representation of the PySpark DataType schema. + """ + + def indent(text: str, level: int) -> str: + """ + Indents each line of the given text by a specified number of levels. + + Args: + text (str): The text to be indented. + level (int): The number of indentation levels. + + Returns: + str: The indented text. + """ + return "\n".join([f"{' ' * level}{line}" if line else "" for line in text.split("\n")]) + + def repr_column(column: types.StructField) -> str: + """ + Converts a PySpark StructField to its code representation. + + Args: + column (types.StructField): The StructField to be converted. + + Returns: + str: The code representation of the StructField. + """ + name = f'"{column.name}"' + data_type = indent(print_schema(column.dataType), 1) + nullable = indent(f"{column.nullable}", 1) + if column.metadata: + metadata = indent(f"{json.dumps(column.metadata)}", 1) + return f"StructField({name},\n{data_type},\n{nullable},\n{metadata}\n)" + else: + return f"StructField({name},\n{data_type},\n{nullable}\n)" + + def format_struct_type(struct_type: types.StructType) -> str: + """ + Converts a PySpark StructType to its code representation. + + Args: + struct_type (types.StructType): The StructType to be converted. + + Returns: + str: The code representation of the StructType. + """ + if not struct_type.fields: + return "StructType([\n\n])" + fields = ",\n".join([indent(repr_column(field), 1) for field in struct_type.fields]) + return f"StructType([\n{fields}\n])" + + if isinstance(dtype, types.StructType): + return format_struct_type(dtype) + elif isinstance(dtype, types.ArrayType): + return f"ArrayType({print_schema(dtype.elementType)})" + elif isinstance(dtype, types.MapType): + return f"MapType(\n{indent(print_schema(dtype.keyType), 1)}, {print_schema(dtype.valueType)})" + elif isinstance(dtype, types.DecimalType): + return f"DecimalType({dtype.precision}, {dtype.scale})" + else: + dtype_str = str(dtype) + return dtype_str if dtype_str.endswith("()") else f"{dtype_str}()" diff --git a/datacontract/export/sql_converter.py b/datacontract/export/sql_converter.py deleted file mode 100644 index 9e42f56da..000000000 --- a/datacontract/export/sql_converter.py +++ /dev/null @@ -1,138 +0,0 @@ -from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type -from datacontract.export.sql_type_converter import convert_to_sql_type -from datacontract.model.data_contract_specification import DataContractSpecification, Model - - -class SqlExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> str: - server_type = _determine_sql_server_type( - data_contract, - sql_server_type, - ) - return to_sql_ddl(data_contract, server_type, export_args.get("server")) - - -class SqlQueryExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> str: - model_name, model_value = _check_models_for_export(data_contract, model, self.export_format) - server_type = _determine_sql_server_type(data_contract, sql_server_type, export_args.get("server")) - return to_sql_query( - data_contract, - model_name, - model_value, - server_type, - ) - - -def to_sql_query( - data_contract_spec: DataContractSpecification, model_name: str, model_value: Model, server_type: str = "snowflake" -) -> str: - if data_contract_spec is None: - return "" - if data_contract_spec.models is None or len(data_contract_spec.models) == 0: - return "" - - result = "" - result += f"-- Data Contract: 
{data_contract_spec.id}\n" - result += f"-- SQL Dialect: {server_type}\n" - result += _to_sql_query(model_name, model_value, server_type) - - return result - - -def _to_sql_query(model_name, model_value, server_type) -> str: - columns = [] - for field_name, field in model_value.fields.items(): - # TODO escape SQL reserved key words, probably dependent on server type - columns.append(field_name) - - result = "select\n" - current_column_index = 1 - number_of_columns = len(columns) - for column in columns: - result += f" {column}" - if current_column_index < number_of_columns: - result += "," - result += "\n" - current_column_index += 1 - result += f"from {model_name}\n" - return result - - -def to_sql_ddl( - data_contract_spec: DataContractSpecification, server_type: str = "snowflake", server: str = None -) -> str: - if data_contract_spec is None: - return "" - if data_contract_spec.models is None or len(data_contract_spec.models) == 0: - return "" - - table_prefix = "" - - if server is None: - servers = data_contract_spec.servers - else: - servers = {server: data_contract_spec.servers[server]} - - for server_name, server in iter(servers.items()): - if server.type == "snowflake": - server_type = "snowflake" - break - if server.type == "postgres": - server_type = "postgres" - break - if server.type == "databricks": - server_type = "databricks" - if server.catalog is not None and server.schema_ is not None: - table_prefix = server.catalog + "." + server.schema_ + "." - break - if server.type == server_type: - break - - result = "" - result += f"-- Data Contract: {data_contract_spec.id}\n" - result += f"-- SQL Dialect: {server_type}\n" - - for model_name, model in iter(data_contract_spec.models.items()): - result += _to_sql_table(table_prefix + model_name, model, server_type) - - return result.strip() - - -def _to_sql_table(model_name, model, server_type="snowflake"): - if server_type == "databricks": - # Databricks recommends to use the CREATE OR REPLACE statement for unity managed tables - # https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html - result = f"CREATE OR REPLACE TABLE {model_name} (\n" - else: - result = f"CREATE TABLE {model_name} (\n" - fields = len(model.fields) - current_field_index = 1 - for field_name, field in iter(model.fields.items()): - type = convert_to_sql_type(field, server_type) - result += f" {field_name} {type}" - if field.required: - result += " not null" - if field.primaryKey or field.primary: - result += " primary key" - if server_type == "databricks" and field.description is not None: - result += f' COMMENT "{_escape(field.description)}"' - if server_type == "snowflake" and field.description is not None: - result += f" COMMENT '{_escape(field.description)}'" - if current_field_index < fields: - result += "," - result += "\n" - current_field_index += 1 - result += ")" - if server_type == "databricks" and model.description is not None: - result += f' COMMENT "{_escape(model.description)}"' - if server_type == "snowflake" and model.description is not None: - result += f" COMMENT='{_escape(model.description)}'" - result += ";\n" - return result - - -def _escape(text: str | None) -> str | None: - if text is None: - return None - return text.replace('"', '\\"') diff --git a/datacontract/export/sql_exporter.py b/datacontract/export/sql_exporter.py new file mode 100644 index 000000000..5252d27c2 --- /dev/null +++ b/datacontract/export/sql_exporter.py @@ -0,0 +1,206 @@ +from open_data_contract_standard.model import 
OpenDataContractStandard, SchemaObject + +from datacontract.export.exporter import Exporter, _check_schema_name_for_export, _determine_sql_server_type +from datacontract.export.sql_type_converter import convert_to_sql_type + + +class SqlExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> str: + server_type = _determine_sql_server_type( + data_contract, + sql_server_type, + ) + return to_sql_ddl(data_contract, server_type, export_args.get("server")) + + +class SqlQueryExporter(Exporter): + def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> str: + model_name, model_value = _check_schema_name_for_export(data_contract, schema_name, self.export_format) + server_type = _determine_sql_server_type(data_contract, sql_server_type, export_args.get("server")) + return to_sql_query( + data_contract, + model_name, + model_value, + server_type, + ) + + +def to_sql_query( + data_contract: OpenDataContractStandard, model_name: str, model_value: SchemaObject, server_type: str = "snowflake" +) -> str: + if data_contract is None: + return "" + if data_contract.schema_ is None or len(data_contract.schema_) == 0: + return "" + + result = "" + result += f"-- Data Contract: {data_contract.id}\n" + result += f"-- SQL Dialect: {server_type}\n" + result += _to_sql_query(model_name, model_value, server_type) + + return result + + +def _to_sql_query(model_name: str, model_value: SchemaObject, server_type: str) -> str: + columns = [] + if model_value.properties: + for prop in model_value.properties: + # TODO escape SQL reserved key words, probably dependent on server type + columns.append(prop.name) + + result = "select\n" + current_column_index = 1 + number_of_columns = len(columns) + for column in columns: + result += f" {column}" + if current_column_index < number_of_columns: + result += "," + result += "\n" + current_column_index += 1 + result += f"from {model_name}\n" + return result + + +def to_sql_ddl(data_contract: OpenDataContractStandard, server_type: str = "snowflake", server: str = None) -> str: + if data_contract is None: + return "" + if data_contract.schema_ is None or len(data_contract.schema_) == 0: + return "" + + table_prefix = "" + + # Get servers list + servers = data_contract.servers or [] + if server is not None: + # Filter to just the requested server + servers = [s for s in servers if s.server == server] + + for srv in servers: + if srv.type == "snowflake": + server_type = "snowflake" + table_prefix = srv.database + "." + srv.schema_ + "." + break + if srv.type == "postgres": + server_type = "postgres" + break + if srv.type == "databricks": + server_type = "databricks" + if srv.catalog is not None and srv.schema_ is not None: + table_prefix = srv.catalog + "." + srv.schema_ + "." 
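
A small sketch of what the query exporter defined above produces (not part of this diff; the contract is illustrative and built with model_validate for brevity). The to_sql_ddl function, continued below, emits the matching CREATE TABLE statement.

from open_data_contract_standard.model import OpenDataContractStandard

from datacontract.export.sql_exporter import to_sql_query

contract = OpenDataContractStandard.model_validate({
    "apiVersion": "v3.0.2",
    "kind": "DataContract",
    "id": "orders",
    "version": "1.0.0",
    "status": "active",
    "schema": [
        {
            "name": "orders",
            "physicalType": "table",
            "properties": [
                {"name": "order_id", "logicalType": "string", "physicalType": "varchar", "required": True},
                {"name": "amount", "logicalType": "number", "physicalType": "decimal(10,2)"},
            ],
        }
    ],
})

print(to_sql_query(contract, "orders", contract.schema_[0], server_type="snowflake"))
# Roughly:
# -- Data Contract: orders
# -- SQL Dialect: snowflake
# select
#   order_id,
#   amount
# from orders
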
+            break
+        if srv.type == server_type:
+            break
+
+    result = ""
+    result += f"-- Data Contract: {data_contract.id}\n"
+    result += f"-- SQL Dialect: {server_type}\n"
+
+    for schema_obj in data_contract.schema_:
+        if schema_obj.physicalType == "table" or schema_obj.physicalType is None:
+            result += _to_sql_table(table_prefix + schema_obj.name, schema_obj, server_type)
+        elif schema_obj.physicalType == "view":
+            result += _to_sql_view(table_prefix + schema_obj.name, schema_obj, server_type)
+
+    return result.strip()
+
+
+def _to_sql_table(model_name: str, model: SchemaObject, server_type: str = "snowflake") -> str:
+    if server_type == "databricks":
+        # Databricks recommends using the CREATE OR REPLACE statement for unity managed tables
+        # https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html
+        result = f"CREATE OR REPLACE TABLE {model_name} (\n"
+    else:
+        result = f"CREATE TABLE {model_name} (\n"
+
+    properties = model.properties or []
+    fields = len(properties)
+    current_field_index = 1
+
+    def _get_sorted_primary_keys(props: list) -> list:
+        pk_list = []
+        for p in props:
+            if p.primaryKey is True:
+                pk_list.append(p)
+
+        pk_list.sort(key=lambda p: p.primaryKeyPosition if p.primaryKeyPosition is not None else -1)
+        return pk_list
+
+    pks = _get_sorted_primary_keys(properties)
+
+    for prop in properties:
+        type_str = convert_to_sql_type(prop, server_type)
+        result += f" {prop.name} {type_str}"
+        if prop.required:
+            result += " not null"
+        if (prop.primaryKey and (prop.primaryKeyPosition == -1 or prop.primaryKeyPosition is None)) or (pks and len(pks) == 1 and prop.primaryKey):
+            # the last position is the finest grain, a.k.a. the surrogate key
+            result += " primary key"
+        if server_type == "databricks" and prop.description is not None:
+            result += f' COMMENT "{_escape(prop.description)}"'
+        if server_type == "snowflake" and prop.description is not None:
+            result += f" COMMENT '{_escape(prop.description)}'"
+        if current_field_index < fields:
+            result += ","
+        result += "\n"
+        current_field_index += 1
+
+    # Composite primary keys: add a dedicated PRIMARY KEY constraint for dialects other than Databricks and Snowflake
+    if server_type not in ["databricks", "snowflake"] and pks and len(pks) > 1:
+        result += f" , CONSTRAINT PK_{model.name} PRIMARY KEY({','.join([pk.name for pk in pks])})"
+    # Composite primary keys in Snowflake are modeled with a UNIQUE constraint
+    if server_type == "snowflake" and pks and len(pks) > 1:
+        result += f" , UNIQUE({','.join([pk.name for pk in pks if pk.primaryKeyPosition is not None and pk.primaryKeyPosition >= 0])})"
+
+    result += ")"
+    if server_type == "databricks" and model.description is not None:
+        result += f' COMMENT "{_escape(model.description)}"'
+    if server_type == "snowflake" and model.description is not None:
+        result += f" COMMENT='{_escape(model.description)}'"
+    result += ";\n"
+    return result
+
+
+def _escape(text: str | None) -> str | None:
+    if text is None:
+        return None
+    return text.replace('"', '\\"')
+
+
+def _to_sql_view(model_name: str, model: SchemaObject, server_type: str = "snowflake") -> str:
+    if server_type in ("databricks", "snowflake", "postgres"):
+        # Databricks recommends using the CREATE OR REPLACE statement for unity managed views
+        # https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-ddl-create-view
+        # For Snowflake and Postgres, CREATE OR REPLACE VIEW is also supported
+        result = f"CREATE OR REPLACE VIEW {model_name} (\n"
+    else:
+        result = f"CREATE VIEW {model_name} (\n"
+
+    properties = model.properties or []
+    fields = len(properties)
+    current_field_index = 1
+    select_statement = "\nSELECT\n"
+    transformSourceObjects = set()
+
+    for prop in properties:
+        if prop.transformSourceObjects:
+            transformSourceObjects.update(prop.transformSourceObjects)
+
+        result += f" {prop.name}"
+        select_statement += f" {prop.name},\n"
+
+        if server_type == "databricks" and prop.description is not None:
+            result += f' COMMENT "{_escape(prop.description)}"'
+        if server_type == "snowflake" and prop.description is not None and prop.description != "":
+            result += f" COMMENT '{_escape(prop.description)}'"
+
+        if current_field_index < fields:
+            result += ","
+        result += "\n"
+        current_field_index += 1
+    result += ")"
+    if server_type == "databricks" and model.description is not None:
+        result += f' COMMENT "{_escape(model.description)}"'
+    if server_type == "snowflake" and model.description is not None and model.description != "":
+        result += f" COMMENT='{_escape(model.description)}'"
+    result += f" AS {select_statement.strip()[:-1]}\n FROM {' JOIN '.join(sorted(transformSourceObjects))};\n"
+    return result
diff --git a/datacontract/export/sql_type_converter.py b/datacontract/export/sql_type_converter.py
index db1688372..f29b110e9 100644
--- a/datacontract/export/sql_type_converter.py
+++ b/datacontract/export/sql_type_converter.py
@@ -1,10 +1,119 @@
-from datacontract.export.bigquery_converter import map_type_to_bigquery
-from datacontract.model.data_contract_specification import Field
+from typing import Any, Dict, Optional, Protocol, Union
+
+from open_data_contract_standard.model import SchemaProperty
+
+
+class FieldLike(Protocol):
+    """Protocol for field-like objects (DCS Field or PropertyAdapter)."""
+    type: Optional[str]
+    config: Optional[Dict[str, Any]]
+    precision: Optional[int]
+    scale: Optional[int]
+    format: Optional[str]
+    items: Optional["FieldLike"]
+    fields: Dict[str, "FieldLike"]
+
+
+def _get_type(field: Union[SchemaProperty, FieldLike]) -> Optional[str]:
+    """Get the type from a field, handling both ODCS and DCS.
Prefers physicalType for accuracy.""" + if field and isinstance(field, SchemaProperty): + # Prefer physicalType for accurate type mapping + if field.physicalType: + return field.physicalType + return field.logicalType + if field and field.type: + return field.type + + return "string" + + +def _get_config(field: Union[SchemaProperty, FieldLike]) -> Optional[Dict[str, Any]]: + """Get the config from a field, handling both ODCS and DCS.""" + if isinstance(field, SchemaProperty): + config = field.model_dump() + if field.customProperties: + config.update({cp.property: cp.value for cp in field.customProperties}) + return config + return None -def convert_to_sql_type(field: Field, server_type: str) -> str: - if field.config and "physicalType" in field.config: - return field.config["physicalType"] +def _get_config_value(field: Union[SchemaProperty, FieldLike], key: str) -> Optional[Any]: + """Get a config value from a field.""" + config = _get_config(field) + if config is None: + return None + return config.get(key) + + +def _get_precision(field: Union[SchemaProperty, FieldLike]) -> Optional[int]: + """Get precision from a field.""" + if isinstance(field, SchemaProperty): + if field.logicalTypeOptions and field.logicalTypeOptions.get("precision"): + return field.logicalTypeOptions.get("precision") + # Also check customProperties + val = _get_config_value(field, "precision") + if val: + return int(val) + return None + return field.precision + + +def _get_scale(field: Union[SchemaProperty, FieldLike]) -> Optional[int]: + """Get scale from a field.""" + if isinstance(field, SchemaProperty): + if field.logicalTypeOptions and field.logicalTypeOptions.get("scale"): + return field.logicalTypeOptions.get("scale") + # Also check customProperties + val = _get_config_value(field, "scale") + if val: + return int(val) + return None + return field.scale + + +def _get_format(field: Union[SchemaProperty, FieldLike]) -> Optional[str]: + """Get format from a field.""" + if isinstance(field, SchemaProperty): + if field.logicalTypeOptions: + return field.logicalTypeOptions.get("format") + return None + return field.format + + +def _get_items(field: Union[SchemaProperty, FieldLike]) -> Optional[Union[SchemaProperty, FieldLike]]: + """Get items from an array field.""" + if isinstance(field, SchemaProperty): + return field.items + return field.items + + +def _get_nested_fields(field: Union[SchemaProperty, FieldLike]) -> Dict[str, Union[SchemaProperty, FieldLike]]: + """Get nested fields from an object field.""" + if isinstance(field, SchemaProperty): + if field.properties is None: + return {} + return {p.name: p for p in field.properties} + return field.fields if field.fields else {} + + +def convert_to_sql_type(field: Union[SchemaProperty, FieldLike], server_type: str) -> str: + physical_type = _get_config_value(field, "physicalType") + if physical_type and physical_type.lower() not in ['array', 'object', 'record', 'struct'] : + return physical_type + elif physical_type and physical_type.lower() == 'array': + items = _get_items(field) + if items: + item_type = convert_to_sql_type(items, server_type) + return f"ARRAY<{item_type}>" + return "TEXT[]" + elif physical_type and physical_type.lower() in ['object', 'record', 'struct']: + structure_field = "STRUCT<" + field_strings = [] + for fieldKey, fieldValue in _get_nested_fields(field).items(): + field_strings.append(f"{fieldKey}:{convert_to_sql_type(fieldValue, server_type)}") + structure_field += ", ".join(field_strings) + structure_field += ">" + return structure_field 
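
To make the resolution order concrete: an explicit physicalType is passed through unchanged, otherwise the per-dialect converters below apply. A short sketch, not part of this diff, with illustrative values:

from open_data_contract_standard.model import SchemaProperty

from datacontract.export.sql_type_converter import convert_to_sql_type

# physicalType wins and is passed through unchanged
amount = SchemaProperty(name="amount", logicalType="number", physicalType="NUMBER(10,2)")
assert convert_to_sql_type(amount, "snowflake") == "NUMBER(10,2)"

# without a physicalType, the per-dialect converters below are used
order_id = SchemaProperty(name="order_id", logicalType="string")
assert convert_to_sql_type(order_id, "postgres") == "text"
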
if server_type == "snowflake": return convert_to_snowflake(field) @@ -22,17 +131,20 @@ def convert_to_sql_type(field: Field, server_type: str) -> str: return convert_type_to_bigquery(field) elif server_type == "trino": return convert_type_to_trino(field) + elif server_type == "oracle": + return convert_type_to_oracle(field) - return field.type + return _get_type(field) # snowflake data types: # https://docs.snowflake.com/en/sql-reference/data-types.html -def convert_to_snowflake(field: Field) -> None | str: - if field.config and "snowflakeType" in field.config: - return field.config["snowflakeType"] +def convert_to_snowflake(field: Union[SchemaProperty, FieldLike]) -> None | str: + snowflake_type = _get_config_value(field, "snowflakeType") + if snowflake_type: + return snowflake_type - type = field.type + type = _get_type(field) # currently optimized for snowflake # LEARNING: data contract has no direct support for CHAR,CHARACTER # LEARNING: data contract has no support for "date-time", "datetime", "time" @@ -71,15 +183,17 @@ def convert_to_snowflake(field: Field) -> None | str: # https://www.postgresql.org/docs/current/datatype.html # Using the name whenever possible -def convert_type_to_postgres(field: Field) -> None | str: - if field.config and "postgresType" in field.config: - return field.config["postgresType"] +def convert_type_to_postgres(field: Union[SchemaProperty, FieldLike]) -> None | str: + postgres_type = _get_config_value(field, "postgresType") + if postgres_type: + return postgres_type - type = field.type + type = _get_type(field) if type is None: return None + format = _get_format(field) if type.lower() in ["string", "varchar", "text"]: - if field.format == "uuid": + if format == "uuid": return "uuid" return "text" # STRING does not exist, TEXT and VARCHAR are all the same in postrges if type.lower() in ["timestamp", "timestamp_tz"]: @@ -110,16 +224,20 @@ def convert_type_to_postgres(field: Field) -> None | str: if type.lower() in ["bytes"]: return "bytea" if type.lower() in ["array"]: - return convert_to_sql_type(field.items, "postgres") + "[]" + items = _get_items(field) + if items: + return convert_to_sql_type(items, "postgres") + "[]" + return "text[]" return None # dataframe data types: # https://spark.apache.org/docs/latest/sql-ref-datatypes.html -def convert_to_dataframe(field: Field) -> None | str: - if field.config and "dataframeType" in field.config: - return field.config["dataframeType"] - type = field.type +def convert_to_dataframe(field: Union[SchemaProperty, FieldLike]) -> None | str: + dataframe_type = _get_config_value(field, "dataframeType") + if dataframe_type: + return dataframe_type + type = _get_type(field) if type is None: return None if type.lower() in ["string", "varchar", "text"]: @@ -133,8 +251,11 @@ def convert_to_dataframe(field: Field) -> None | str: if type.lower() in ["time"]: return "STRING" if type.lower() in ["number", "decimal", "numeric"]: - # precision and scale not supported by data contract - return "DECIMAL" + precision = _get_precision(field) + scale = _get_scale(field) + precision = precision if precision is not None else 38 + scale = scale if scale is not None else 0 + return f"DECIMAL({precision},{scale})" if type.lower() in ["float"]: return "FLOAT" if type.lower() in ["double"]: @@ -147,28 +268,32 @@ def convert_to_dataframe(field: Field) -> None | str: return "BOOLEAN" if type.lower() in ["object", "record", "struct"]: nested_fields = [] - for nested_field_name, nested_field in field.fields.items(): + for nested_field_name, 
nested_field in _get_nested_fields(field).items(): nested_field_type = convert_to_dataframe(nested_field) nested_fields.append(f"{nested_field_name}:{nested_field_type}") return f"STRUCT<{','.join(nested_fields)}>" if type.lower() in ["bytes"]: return "BINARY" if type.lower() in ["array"]: - item_type = convert_to_dataframe(field.items) - return f"ARRAY<{item_type}>" + items = _get_items(field) + if items: + item_type = convert_to_dataframe(items) + return f"ARRAY<{item_type}>" + return "ARRAY" return None # databricks data types: # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html -def convert_to_databricks(field: Field) -> None | str: - type = field.type +def convert_to_databricks(field: Union[SchemaProperty, FieldLike]) -> None | str: + type = _get_type(field) + databricks_type = _get_config_value(field, "databricksType") if ( - field.config - and "databricksType" in field.config + databricks_type + and type and type.lower() not in ["array", "object", "record", "struct"] ): - return field.config["databricksType"] + return databricks_type if type is None: return None if type.lower() in ["string", "varchar", "text"]: @@ -182,8 +307,11 @@ def convert_to_databricks(field: Field) -> None | str: if type.lower() in ["time"]: return "STRING" if type.lower() in ["number", "decimal", "numeric"]: - # precision and scale not supported by data contract - return "DECIMAL" + precision = _get_precision(field) + scale = _get_scale(field) + precision = precision if precision is not None else 38 + scale = scale if scale is not None else 0 + return f"DECIMAL({precision},{scale})" if type.lower() in ["float"]: return "FLOAT" if type.lower() in ["double"]: @@ -196,36 +324,40 @@ def convert_to_databricks(field: Field) -> None | str: return "BOOLEAN" if type.lower() in ["object", "record", "struct"]: nested_fields = [] - for nested_field_name, nested_field in field.fields.items(): + for nested_field_name, nested_field in _get_nested_fields(field).items(): nested_field_type = convert_to_databricks(nested_field) nested_fields.append(f"{nested_field_name}:{nested_field_type}") return f"STRUCT<{','.join(nested_fields)}>" if type.lower() in ["bytes"]: return "BINARY" if type.lower() in ["array"]: - item_type = convert_to_databricks(field.items) - return f"ARRAY<{item_type}>" + items = _get_items(field) + if items: + item_type = convert_to_databricks(items) + return f"ARRAY<{item_type}>" + return "ARRAY" if type.lower() in ["variant"]: return "VARIANT" return None -def convert_to_duckdb(field: Field) -> None | str: +def convert_to_duckdb(field: Union[SchemaProperty, FieldLike]) -> None | str: """ Convert a data contract field to the corresponding DuckDB SQL type. Parameters: - field (Field): The data contract field to convert. + field: The data contract field to convert (SchemaProperty or FieldLike). Returns: str: The corresponding DuckDB SQL type. 
""" # Check - if field is None or field.type is None: + type = _get_type(field) + if field is None or type is None: return None # Get - type_lower = field.type.lower() + type_lower = type.lower() # Prepare type_mapping = { @@ -257,20 +389,28 @@ def convert_to_duckdb(field: Field) -> None | str: # convert decimal numbers with precision and scale if type_lower == "decimal" or type_lower == "number" or type_lower == "numeric": - return f"DECIMAL({field.precision},{field.scale})" + precision = _get_precision(field) + scale = _get_scale(field) + return f"DECIMAL({precision},{scale})" # Check list and map if type_lower == "list" or type_lower == "array": - item_type = convert_to_duckdb(field.items) - return f"{item_type}[]" + items = _get_items(field) + if items: + item_type = convert_to_duckdb(items) + return f"{item_type}[]" + return "VARCHAR[]" if type_lower == "map": - key_type = convert_to_duckdb(field.keys) - value_type = convert_to_duckdb(field.values) + # For ODCS, we need to get key/value types from customProperties + keys = _get_config_value(field, "mapKeys") + values = _get_config_value(field, "mapValues") + key_type = keys if keys else "VARCHAR" + value_type = values if values else "VARCHAR" return f"MAP({key_type}, {value_type})" if type_lower == "struct" or type_lower == "object" or type_lower == "record": structure_field = "STRUCT(" field_strings = [] - for fieldKey, fieldValue in field.fields.items(): + for fieldKey, fieldValue in _get_nested_fields(field).items(): field_strings.append(f"{fieldKey} {convert_to_duckdb(fieldValue)}") structure_field += ", ".join(field_strings) structure_field += ")" @@ -280,25 +420,27 @@ def convert_to_duckdb(field: Field) -> None | str: return None -def convert_type_to_sqlserver(field: Field) -> None | str: +def convert_type_to_sqlserver(field: Union[SchemaProperty, FieldLike]) -> None | str: """Convert from supported datacontract types to equivalent sqlserver types""" - field_type = field.type + field_type = _get_type(field) if not field_type: return None # If provided sql-server config type, prefer it over default mapping - if sqlserver_type := get_type_config(field, "sqlserverType"): + sqlserver_type = _get_config_value(field, "sqlserverType") + if sqlserver_type: return sqlserver_type field_type = field_type.lower() + format = _get_format(field) if field_type in ["string", "varchar", "text"]: - if field.format == "uuid": + if format == "uuid": return "uniqueidentifier" return "varchar" if field_type in ["timestamp", "timestamp_tz"]: return "datetimeoffset" if field_type in ["timestamp_ntz"]: - if field.format == "datetime": + if format == "datetime": return "datetime" return "datetime2" if field_type in ["date"]: @@ -329,25 +471,31 @@ def convert_type_to_sqlserver(field: Field) -> None | str: return None -def convert_type_to_bigquery(field: Field) -> None | str: +def convert_type_to_bigquery(field: Union[SchemaProperty, FieldLike]) -> None | str: """Convert from supported datacontract types to equivalent bigquery types""" + # Import here to avoid circular import + from datacontract.export.bigquery_exporter import map_type_to_bigquery # BigQuery exporter cannot be used for complex types, as the exporter has different syntax than SodaCL - field_type = field.type + field_type = _get_type(field) if not field_type: return None - if field.config and "bigqueryType" in field.config: - return field.config["bigqueryType"] + bigquery_type = _get_config_value(field, "bigqueryType") + if bigquery_type: + return bigquery_type if field_type.lower() in 
["array"]: - item_type = convert_type_to_bigquery(field.items) - return f"ARRAY<{item_type}>" + items = _get_items(field) + if items: + item_type = convert_type_to_bigquery(items) + return f"ARRAY<{item_type}>" + return "ARRAY" if field_type.lower() in ["object", "record", "struct"]: nested_fields = [] - for nested_field_name, nested_field in field.fields.items(): + for nested_field_name, nested_field in _get_nested_fields(field).items(): nested_field_type = convert_type_to_bigquery(nested_field) nested_fields.append(f"{nested_field_name} {nested_field_type}") return f"STRUCT<{', '.join(nested_fields)}>" @@ -355,19 +503,16 @@ def convert_type_to_bigquery(field: Field) -> None | str: return map_type_to_bigquery(field) -def get_type_config(field: Field, config_attr: str) -> dict[str, str] | None: - """Retrieve type configuration if provided in datacontract.""" - if not field.config: - return None - return field.config.get(config_attr, None) - - -def convert_type_to_trino(field: Field) -> None | str: +def convert_type_to_trino(field: Union[SchemaProperty, FieldLike]) -> None | str: """Convert from supported datacontract types to equivalent trino types""" - if field.config and "trinoType" in field.config: - return field.config["trinoType"] + trino_type = _get_config_value(field, "trinoType") + if trino_type: + return trino_type - field_type = field.type.lower() + field_type = _get_type(field) + if not field_type: + return None + field_type = field_type.lower() if field_type in ["string", "text", "varchar"]: return "varchar" # tinyint, smallint not supported by data contract @@ -388,3 +533,94 @@ def convert_type_to_trino(field: Field) -> None | str: return "varbinary" if field_type in ["object", "record", "struct"]: return "json" + return None + + +def convert_type_to_impala(field: Union[SchemaProperty, FieldLike]) -> None | str: + """Convert from supported data contract types to equivalent Impala types. + + Used as a fallback when `physicalType` is not present. + """ + # Allow an explicit override via config/customProperties + impala_type = _get_config_value(field, "impalaType") + if impala_type: + return impala_type + + field_type = _get_type(field) + if not field_type: + return None + + t = field_type.lower() + + # String-like + if t in ["string", "varchar", "text"]: + return "STRING" + + # Numeric / decimal + if t in ["number", "decimal", "numeric"]: + precision = _get_precision(field) or 38 + scale = _get_scale(field) or 0 + return f"DECIMAL({precision},{scale})" + + if t == "float": + return "FLOAT" + if t == "double": + return "DOUBLE" + + # Integers + if t in ["integer", "int"]: + return "INT" + if t in ["long", "bigint"]: + return "BIGINT" + + # Boolean + if t == "boolean": + return "BOOLEAN" + + # Temporal – Impala has a single TIMESTAMP type + if t in ["timestamp", "timestamp_ntz", "timestamp_tz"]: + return "TIMESTAMP" + if t == "date": + return "DATE" + # No dedicated TIME type in Impala → store as string + if t == "time": + return "STRING" + + # Binary + if t in ["bytes", "binary"]: + return "BINARY" + + # For complex / JSON-like types we currently do not emit a type check + # (returning None means no "has type" check is generated) + return None + +def convert_type_to_oracle(schema_property: SchemaProperty) -> None | str: + """Convert ODCS logical types to Oracle types. + + Uses physicalType if set, otherwise maps ODCS logical types to Oracle equivalents. 
+ Reference: https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Data-Types.html + """ + if schema_property.physicalType: + return schema_property.physicalType + + if not schema_property.logicalType: + return None + + logical_type = schema_property.logicalType + + # ODCS logical type mappings + mapping = { + "string": "NVARCHAR2", + "number": "NUMBER", + "integer": "NUMBER", + "float": "BINARY_FLOAT", + "double": "BINARY_DOUBLE", + "boolean": "CHAR", + "date": "DATE", + "timestamp": "TIMESTAMP(6) WITH TIME ZONE", + "bytes": "RAW", + "object": "CLOB", + "array": "CLOB", + } + + return mapping.get(logical_type) \ No newline at end of file diff --git a/datacontract/export/sqlalchemy_converter.py b/datacontract/export/sqlalchemy_converter.py deleted file mode 100644 index 71d116aaa..000000000 --- a/datacontract/export/sqlalchemy_converter.py +++ /dev/null @@ -1,170 +0,0 @@ -import ast -import typing - -import datacontract.model.data_contract_specification as spec -from datacontract.export.exporter import Exporter, _determine_sql_server_type - - -class SQLAlchemyExporter(Exporter): - def export( - self, data_contract: spec.DataContractSpecification, model, server, sql_server_type, export_args - ) -> dict: - sql_server_type = _determine_sql_server_type(data_contract, sql_server_type, server) - return to_sqlalchemy_model_str(data_contract, sql_server_type, server) - - -DECLARATIVE_BASE = "Base" - - -def to_sqlalchemy_model_str(contract: spec.DataContractSpecification, sql_server_type: str = "", server=None) -> str: - server_obj = contract.servers.get(server) - classdefs = [ - generate_model_class(model_name, model, server_obj, sql_server_type) - for (model_name, model) in contract.models.items() - ] - documentation = ( - [ast.Expr(ast.Constant(contract.info.description))] if (contract.info and contract.info.description) else [] - ) - - declarative_base = ast.ClassDef( - name=DECLARATIVE_BASE, - bases=[ast.Name(id="DeclarativeBase", ctx=ast.Load())], - body=[ast.Pass()], - keywords=[], - decorator_list=[], - ) - - databricks_timestamp = ast.ImportFrom( - module="databricks.sqlalchemy", names=[ast.alias("TIMESTAMP"), ast.alias("TIMESTAMP_NTZ")] - ) - timestamp = ast.ImportFrom(module="sqlalchemy", names=[ast.alias(name="TIMESTAMP")]) - result = ast.Module( - body=[ - ast.ImportFrom(module="sqlalchemy.orm", names=[ast.alias(name="DeclarativeBase")]), - ast.ImportFrom( - module="sqlalchemy", - names=[ - ast.alias("Column"), - ast.alias("Date"), - ast.alias("Integer"), - ast.alias("Numeric"), - ast.alias("String"), - ast.alias("Text"), - ast.alias("VARCHAR"), - ast.alias("BigInteger"), - ast.alias("Float"), - ast.alias("Double"), - ast.alias("Boolean"), - ast.alias("Date"), - ast.alias("ARRAY"), - ast.alias("LargeBinary"), - ], - ), - databricks_timestamp if sql_server_type == "databricks" else timestamp, - *documentation, - declarative_base, - *classdefs, - ], - type_ignores=[], - ) - return ast.unparse(result) - - -def Call(name, *args, **kwargs) -> ast.Call: - return ast.Call( - ast.Name(name), - args=[v for v in args], - keywords=[ast.keyword(arg=f"{k}", value=ast.Constant(v)) for (k, v) in kwargs.items()], - ) - - -def Column(predicate, **kwargs) -> ast.Call: - return Call("Column", predicate, **kwargs) - - -def sqlalchemy_primitive(field: spec.Field): - sqlalchemy_name = { - "string": Call("String", ast.Constant(field.maxLength)), - "text": Call("Text", ast.Constant(field.maxLength)), - "varchar": Call("VARCHAR", ast.Constant(field.maxLength)), - "number": Call("Numeric", 
ast.Constant(field.precision), ast.Constant(field.scale)), - "decimal": Call("Numeric", ast.Constant(field.precision), ast.Constant(field.scale)), - "numeric": Call("Numeric", ast.Constant(field.precision), ast.Constant(field.scale)), - "int": ast.Name("Integer"), - "integer": ast.Name("Integer"), - "long": ast.Name("BigInteger"), - "bigint": ast.Name("BigInteger"), - "float": ast.Name("Float"), - "double": ast.Name("Double"), - "boolean": ast.Name("Boolean"), - "timestamp": ast.Name("TIMESTAMP"), - "timestamp_tz": Call("TIMESTAMP", ast.Constant(True)), - "timestamp_ntz": ast.Name("TIMESTAMP_NTZ"), - "date": ast.Name("Date"), - "bytes": Call("LargeBinary", ast.Constant(field.maxLength)), - } - return sqlalchemy_name.get(field.type) - - -def constant_field_value(field_name: str, field: spec.Field) -> tuple[ast.Call, typing.Optional[ast.ClassDef]]: - new_type = sqlalchemy_primitive(field) - match field.type: - case "array": - new_type = Call("ARRAY", sqlalchemy_primitive(field.items)) - if new_type is None: - raise RuntimeError(f"Unsupported field type {field.type}.") - - return Column( - new_type, nullable=not field.required, comment=field.description, primary_key=field.primaryKey or field.primary - ), None - - -def column_assignment(field_name: str, field: spec.Field) -> tuple[ast.Call, typing.Optional[ast.ClassDef]]: - return constant_field_value(field_name, field) - - -def is_simple_field(field: spec.Field) -> bool: - return field.type not in set(["object", "record", "struct"]) - - -def field_definitions(fields: dict[str, spec.Field]) -> tuple[list[ast.Expr], list[ast.ClassDef]]: - annotations: list[ast.Expr] = [] - classes: list[typing.Any] = [] - for field_name, field in fields.items(): - (ann, new_class) = column_assignment(field_name, field) - annotations.append(ast.Assign(targets=[ast.Name(id=field_name, ctx=ast.Store())], value=ann, lineno=0)) - return (annotations, classes) - - -def generate_model_class( - name: str, model_definition: spec.Model, server=None, sql_server_type: str = "" -) -> ast.ClassDef: - (field_assignments, nested_classes) = field_definitions(model_definition.fields) - documentation = [ast.Expr(ast.Constant(model_definition.description))] if model_definition.description else [] - - schema = None if server is None else server.schema_ - table_name = ast.Constant(name) - if sql_server_type == "databricks": - table_name = ast.Constant(name.lower()) - - result = ast.ClassDef( - name=name.capitalize(), - bases=[ast.Name(id=DECLARATIVE_BASE, ctx=ast.Load())], - body=[ - *documentation, - ast.Assign(targets=[ast.Name("__tablename__")], value=table_name, lineno=0), - ast.Assign( - targets=[ast.Name("__table_args__")], - value=ast.Dict( - keys=[ast.Constant("comment"), ast.Constant("schema")], - values=[ast.Constant(model_definition.description), ast.Constant(schema)], - ), - lineno=0, - ), - *nested_classes, - *field_assignments, - ], - keywords=[], - decorator_list=[], - ) - return result diff --git a/datacontract/export/sqlalchemy_exporter.py b/datacontract/export/sqlalchemy_exporter.py new file mode 100644 index 000000000..8fba2f487 --- /dev/null +++ b/datacontract/export/sqlalchemy_exporter.py @@ -0,0 +1,216 @@ +import ast +import typing +from typing import List, Optional + +from open_data_contract_standard.model import OpenDataContractStandard, SchemaObject, SchemaProperty, Server + +from datacontract.export.exporter import Exporter, _determine_sql_server_type + + +class SQLAlchemyExporter(Exporter): + def export( + self, data_contract: OpenDataContractStandard, 
schema_name, server, sql_server_type, export_args + ) -> dict: + sql_server_type = _determine_sql_server_type(data_contract, sql_server_type, server) + return to_sqlalchemy_model_str(data_contract, sql_server_type, server) + + +DECLARATIVE_BASE = "Base" + + +def _get_server_by_name(data_contract: OpenDataContractStandard, name: str) -> Optional[Server]: + """Get a server by name.""" + if data_contract.servers is None: + return None + return next((s for s in data_contract.servers if s.server == name), None) + + +def _get_type(prop: SchemaProperty) -> Optional[str]: + """Get the type from a schema property.""" + if prop.logicalType: + return prop.logicalType + if prop.physicalType: + return prop.physicalType + return None + + +def _get_logical_type_option(prop: SchemaProperty, key: str): + """Get a logical type option value.""" + if prop.logicalTypeOptions is None: + return None + return prop.logicalTypeOptions.get(key) + + +def to_sqlalchemy_model_str( + odcs: OpenDataContractStandard, sql_server_type: str = "", server=None +) -> str: + server_obj = _get_server_by_name(odcs, server) if server else None + classdefs = [] + if odcs.schema_: + for schema_obj in odcs.schema_: + classdefs.append(generate_model_class(schema_obj.name, schema_obj, server_obj, sql_server_type)) + + description_str = None + if odcs.description: + if hasattr(odcs.description, "purpose"): + description_str = odcs.description.purpose + documentation = [ast.Expr(ast.Constant(description_str))] if description_str else [] + + declarative_base = ast.ClassDef( + name=DECLARATIVE_BASE, + bases=[ast.Name(id="DeclarativeBase", ctx=ast.Load())], + body=[ast.Pass()], + keywords=[], + decorator_list=[], + ) + + databricks_timestamp = ast.ImportFrom( + module="databricks.sqlalchemy", names=[ast.alias("TIMESTAMP"), ast.alias("TIMESTAMP_NTZ")] + ) + timestamp = ast.ImportFrom(module="sqlalchemy", names=[ast.alias(name="TIMESTAMP")]) + result = ast.Module( + body=[ + ast.ImportFrom(module="sqlalchemy.orm", names=[ast.alias(name="DeclarativeBase")]), + ast.ImportFrom( + module="sqlalchemy", + names=[ + ast.alias("Column"), + ast.alias("Date"), + ast.alias("Integer"), + ast.alias("Numeric"), + ast.alias("String"), + ast.alias("Text"), + ast.alias("VARCHAR"), + ast.alias("BigInteger"), + ast.alias("Float"), + ast.alias("Double"), + ast.alias("Boolean"), + ast.alias("Date"), + ast.alias("ARRAY"), + ast.alias("LargeBinary"), + ], + ), + databricks_timestamp if sql_server_type == "databricks" else timestamp, + *documentation, + declarative_base, + *classdefs, + ], + type_ignores=[], + ) + return ast.unparse(result) + + +def Call(name, *args, **kwargs) -> ast.Call: + return ast.Call( + ast.Name(name), + args=[v for v in args], + keywords=[ast.keyword(arg=f"{k}", value=ast.Constant(v)) for (k, v) in kwargs.items()], + ) + + +def Column(predicate, **kwargs) -> ast.Call: + return Call("Column", predicate, **kwargs) + + +def sqlalchemy_primitive(prop: SchemaProperty): + prop_type = _get_type(prop) + max_length = _get_logical_type_option(prop, "maxLength") + precision = _get_logical_type_option(prop, "precision") + scale = _get_logical_type_option(prop, "scale") + + if prop_type is None: + return None + + prop_type_lower = prop_type.lower() + + sqlalchemy_name = { + "string": Call("String", ast.Constant(max_length)), + "text": Call("Text", ast.Constant(max_length)), + "varchar": Call("VARCHAR", ast.Constant(max_length)), + "number": Call("Numeric", ast.Constant(precision), ast.Constant(scale)), + "decimal": Call("Numeric", ast.Constant(precision), 
ast.Constant(scale)), + "numeric": Call("Numeric", ast.Constant(precision), ast.Constant(scale)), + "int": ast.Name("Integer"), + "integer": ast.Name("Integer"), + "long": ast.Name("BigInteger"), + "bigint": ast.Name("BigInteger"), + "float": ast.Name("Float"), + "double": ast.Name("Double"), + "boolean": ast.Name("Boolean"), + "timestamp": ast.Name("TIMESTAMP"), + "timestamp_tz": Call("TIMESTAMP", ast.Constant(True)), + "timestamp_ntz": ast.Name("TIMESTAMP_NTZ"), + "date": ast.Name("Date"), + "bytes": Call("LargeBinary", ast.Constant(max_length)), + } + return sqlalchemy_name.get(prop_type_lower) + + +def constant_field_value(field_name: str, prop: SchemaProperty) -> tuple[ast.Call, typing.Optional[ast.ClassDef]]: + new_type = sqlalchemy_primitive(prop) + prop_type = _get_type(prop) + + if prop_type and prop_type.lower() == "array": + if prop.items: + new_type = Call("ARRAY", sqlalchemy_primitive(prop.items)) + else: + new_type = Call("ARRAY", ast.Name("String")) + + if new_type is None: + raise RuntimeError(f"Unsupported field type {prop_type}.") + + return Column( + new_type, nullable=not prop.required, comment=prop.description, primary_key=prop.primaryKey if prop.primaryKey else None + ), None + + +def column_assignment(field_name: str, prop: SchemaProperty) -> tuple[ast.Call, typing.Optional[ast.ClassDef]]: + return constant_field_value(field_name, prop) + + +def is_simple_field(prop: SchemaProperty) -> bool: + prop_type = _get_type(prop) or "" + return prop_type.lower() not in {"object", "record", "struct"} + + +def field_definitions(properties: List[SchemaProperty]) -> tuple[list[ast.Expr], list[ast.ClassDef]]: + annotations: list[ast.Expr] = [] + classes: list[typing.Any] = [] + for prop in properties: + (ann, new_class) = column_assignment(prop.name, prop) + annotations.append(ast.Assign(targets=[ast.Name(id=prop.name, ctx=ast.Store())], value=ann, lineno=0)) + return (annotations, classes) + + +def generate_model_class( + name: str, schema_obj: SchemaObject, server: Optional[Server] = None, sql_server_type: str = "" +) -> ast.ClassDef: + (field_assignments, nested_classes) = field_definitions(schema_obj.properties or []) + documentation = [ast.Expr(ast.Constant(schema_obj.description))] if schema_obj.description else [] + + schema = None if server is None else server.schema_ + table_name = ast.Constant(name) + if sql_server_type == "databricks": + table_name = ast.Constant(name.lower()) + + result = ast.ClassDef( + name=name.capitalize(), + bases=[ast.Name(id=DECLARATIVE_BASE, ctx=ast.Load())], + body=[ + *documentation, + ast.Assign(targets=[ast.Name("__tablename__")], value=table_name, lineno=0), + ast.Assign( + targets=[ast.Name("__table_args__")], + value=ast.Dict( + keys=[ast.Constant("comment"), ast.Constant("schema")], + values=[ast.Constant(schema_obj.description), ast.Constant(schema)], + ), + lineno=0, + ), + *nested_classes, + *field_assignments, + ], + keywords=[], + decorator_list=[], + ) + return result diff --git a/datacontract/export/terraform_converter.py b/datacontract/export/terraform_converter.py deleted file mode 100644 index 850442d77..000000000 --- a/datacontract/export/terraform_converter.py +++ /dev/null @@ -1,76 +0,0 @@ -import re - -from datacontract.export.exporter import Exporter -from datacontract.model.data_contract_specification import DataContractSpecification, Server - - -class TerraformExporter(Exporter): - def export(self, data_contract, model, server, sql_server_type, export_args) -> dict: - return to_terraform(data_contract) - - -def 
to_terraform(data_contract_spec: DataContractSpecification, server_id: str = None) -> str: - if data_contract_spec is None: - return "" - if data_contract_spec.servers is None or len(data_contract_spec.servers) == 0: - return "" - - result = "" - for server_name, server in iter(data_contract_spec.servers.items()): - if server_id is not None and server_name != server_id: - continue - result = server_to_terraform_resource(data_contract_spec, result, server, server_name) - - return result.strip() - - -def server_to_terraform_resource(data_contract_spec, result, server: Server, server_name): - tag_data_contract = data_contract_spec.id - tag_name = data_contract_spec.info.title - tag_server = server_name - bucket_name = extract_bucket_name(server) - resource_id = f"{data_contract_spec.id}_{server_name}" - data_product_id = server.dataProductId - - if data_product_id is not None: - result += f""" -resource "aws_s3_bucket" "{resource_id}" {{ - bucket = "{bucket_name}" - - tags = {{ - Name = "{tag_name}" - DataContract = "{tag_data_contract}" - Server = "{tag_server}" - DataProduct = "{data_product_id}" - }} -}} - -""" - else: - result += f""" -resource "aws_s3_bucket" "{resource_id}" {{ - bucket = "{bucket_name}" - - tags = {{ - Name = "{tag_name}" - DataContract = "{tag_data_contract}" - Server = "{tag_server}" - }} -}} - -""" - return result - - -def extract_bucket_name(server) -> str | None: - if server.type == "s3": - s3_url = server.location - # Regular expression to match the S3 bucket name - match = re.search(r"s3://([^/]+)/", s3_url) - if match: - # Return the first group (bucket name) - return match.group(1) - else: - return "" - - return "" diff --git a/datacontract/imports/avro_importer.py b/datacontract/imports/avro_importer.py index da045380d..f433622cc 100644 --- a/datacontract/imports/avro_importer.py +++ b/datacontract/imports/avro_importer.py @@ -1,49 +1,42 @@ -from typing import Dict, List +from typing import List import avro.schema +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, +) from datacontract.model.exceptions import DataContractException +# Avro logical type to ODCS logical type mapping +LOGICAL_TYPE_MAPPING = { + "decimal": "number", + "date": "date", + "time-millis": "string", + "time-micros": "string", + "timestamp-millis": "date", + "timestamp-micros": "date", + "local-timestamp-millis": "date", + "local-timestamp-micros": "date", + "duration": "string", + "uuid": "string", +} + class AvroImporter(Importer): """Class to import Avro Schema file""" def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - """ - Import Avro schema from a source file. - - Args: - data_contract_specification: The data contract specification to update. - source: The path to the Avro schema file. - import_args: Additional import arguments. - - Returns: - The updated data contract specification. - """ - return import_avro(data_contract_specification, source) - - -def import_avro(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification: - """ - Import an Avro schema from a file and update the data contract specification. 
+ self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_avro(source) - Args: - data_contract_specification: The data contract specification to update. - source: The path to the Avro schema file. - - Returns: - DataContractSpecification: The updated data contract specification. - - Raises: - DataContractException: If there's an error parsing the Avro schema. - """ - if data_contract_specification.models is None: - data_contract_specification.models = {} +def import_avro(source: str) -> OpenDataContractStandard: + """Import an Avro schema from a file.""" try: with open(source, "r") as file: avro_schema = avro.schema.parse(file.read()) @@ -55,197 +48,236 @@ def import_avro(data_contract_specification: DataContractSpecification, source: engine="datacontract", original_exception=e, ) - # type record is being used for both the table and the object types in data contract - # -> CONSTRAINT: one table per .avsc input, all nested records are interpreted as objects - fields = import_record_fields(avro_schema.fields) - data_contract_specification.models[avro_schema.name] = Model( - fields=fields, - ) + odcs = create_odcs() + odcs.schema_ = [] - if avro_schema.get_prop("doc") is not None: - data_contract_specification.models[avro_schema.name].description = avro_schema.get_prop("doc") - - if avro_schema.get_prop("namespace") is not None: - data_contract_specification.models[avro_schema.name].namespace = avro_schema.get_prop("namespace") + properties = import_record_fields(avro_schema.fields) - return data_contract_specification + schema_obj = create_schema_object( + name=avro_schema.name, + physical_type="record", + description=avro_schema.get_prop("doc"), + properties=properties, + ) + # Add namespace as custom property if present + if avro_schema.get_prop("namespace") is not None: + from open_data_contract_standard.model import CustomProperty + schema_obj.customProperties = [ + CustomProperty(property="namespace", value=avro_schema.get_prop("namespace")) + ] -def handle_config_avro_custom_properties(field: avro.schema.Field, imported_field: Field) -> None: - """ - Handle custom Avro properties and add them to the imported field's config. + odcs.schema_.append(schema_obj) - Args: - field: The Avro field. - imported_field: The imported field to update. - """ - if field.get_prop("logicalType") is not None: - if imported_field.config is None: - imported_field.config = {} - imported_field.config["avroLogicalType"] = field.get_prop("logicalType") + return odcs - if field.default is not None: - if imported_field.config is None: - imported_field.config = {} - imported_field.config["avroDefault"] = field.default +def import_record_fields(record_fields: List[avro.schema.Field]) -> List[SchemaProperty]: + """Import Avro record fields and convert them to ODCS properties.""" + properties = [] -LOGICAL_TYPE_MAPPING = { - "decimal": "decimal", - "date": "date", - "time-millis": "time", - "time-micros": "time", - "timestamp-millis": "timestamp_tz", - "timestamp-micros": "timestamp_tz", - "local-timestamp-micros": "timestamp_ntz", - "local-timestamp-millis": "timestamp_ntz", - "duration": "string", - "uuid": "string", -} + for field in record_fields: + prop = import_avro_field(field) + if prop: + properties.append(prop) + return properties -def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]: - """ - Import Avro record fields and convert them to data contract fields. - Args: - record_fields: List of Avro record fields. 
+def import_avro_field(field: avro.schema.Field) -> SchemaProperty: + """Import a single Avro field as an ODCS SchemaProperty.""" + custom_props = {} - Returns: - A dictionary of imported fields. - """ - imported_fields = {} - for field in record_fields: - imported_field = Field() - imported_field.required = True - imported_field.description = field.doc - - handle_config_avro_custom_properties(field, imported_field) - - # Determine field type and handle nested structures - if field.type.type == "record": - imported_field.type = "object" - imported_field.description = field.type.doc - imported_field.fields = import_record_fields(field.type.fields) - elif field.type.type == "union": - imported_field.required = False - # Check for enum in union first, since it needs special handling - enum_schema = get_enum_from_union_field(field) - if enum_schema: - imported_field.type = "string" - imported_field.enum = enum_schema.symbols - imported_field.title = enum_schema.name - if not imported_field.config: - imported_field.config = {} - imported_field.config["avroType"] = "enum" - else: - type = import_type_of_optional_field(field) - imported_field.type = type - if type == "record": - imported_field.fields = import_record_fields(get_record_from_union_field(field).fields) - elif type == "array": - imported_field.type = "array" - imported_field.items = import_avro_array_items(get_array_from_union_field(field)) - elif field.type.type == "array": - imported_field.type = "array" - imported_field.items = import_avro_array_items(field.type) - elif field.type.type == "map": - imported_field.type = "map" - imported_field.values = import_avro_map_values(field.type) - elif field.type.type == "enum": - imported_field.type = "string" - imported_field.enum = field.type.symbols - imported_field.title = field.type.name - if not imported_field.config: - imported_field.config = {} - imported_field.config["avroType"] = "enum" + # Handle Avro custom properties + if field.get_prop("logicalType") is not None: + custom_props["avroLogicalType"] = field.get_prop("logicalType") + if field.default is not None: + custom_props["avroDefault"] = str(field.default) + + # Determine type and nested structures + if field.type.type == "record": + nested_properties = import_record_fields(field.type.fields) + prop = create_property( + name=field.name, + logical_type="object", + physical_type="record", + description=field.type.doc or field.doc, + required=True, + properties=nested_properties, + custom_properties=custom_props if custom_props else None, + ) + elif field.type.type == "union": + # Union types indicate optional fields (null + type) + enum_schema = get_enum_from_union_field(field) + if enum_schema: + prop = create_property( + name=field.name, + logical_type="string", + physical_type="enum", + description=field.doc, + required=False, + custom_properties={**custom_props, "avroType": "enum"} if custom_props else {"avroType": "enum"}, + ) else: - logical_type = field.type.get_prop("logicalType") - if logical_type in LOGICAL_TYPE_MAPPING: - imported_field.type = LOGICAL_TYPE_MAPPING[logical_type] - if logical_type == "decimal": - imported_field.precision = field.type.precision - imported_field.scale = field.type.scale + logical_type, physical_type = import_type_of_optional_field(field) + if logical_type == "object": + record_schema = get_record_from_union_field(field) + nested_properties = import_record_fields(record_schema.fields) if record_schema else [] + prop = create_property( + name=field.name, + logical_type="object", + 
physical_type="record", + description=field.doc, + required=False, + properties=nested_properties, + custom_properties=custom_props if custom_props else None, + ) + elif logical_type == "array": + array_schema = get_array_from_union_field(field) + items_prop = import_avro_array_items(array_schema) if array_schema else None + prop = create_property( + name=field.name, + logical_type="array", + physical_type="array", + description=field.doc, + required=False, + items=items_prop, + custom_properties=custom_props if custom_props else None, + ) else: - imported_field.type = map_type_from_avro(field.type.type) - imported_fields[field.name] = imported_field - - return imported_fields - - -def import_avro_array_items(array_schema: avro.schema.ArraySchema) -> Field: - """ - Import Avro array items and convert them to a data contract field. + prop = create_property( + name=field.name, + logical_type=logical_type, + physical_type=physical_type, + description=field.doc, + required=False, + custom_properties=custom_props if custom_props else None, + ) + elif field.type.type == "array": + items_prop = import_avro_array_items(field.type) + prop = create_property( + name=field.name, + logical_type="array", + physical_type="array", + description=field.doc, + required=True, + items=items_prop, + custom_properties=custom_props if custom_props else None, + ) + elif field.type.type == "map": + prop = create_property( + name=field.name, + logical_type="object", + physical_type="map", + description=field.doc, + required=True, + custom_properties={**custom_props, "avroType": "map"} if custom_props else {"avroType": "map"}, + ) + elif field.type.type == "enum": + prop = create_property( + name=field.name, + logical_type="string", + physical_type="enum", + description=field.doc, + required=True, + custom_properties={**custom_props, "avroType": "enum", "avroSymbols": field.type.symbols} if custom_props else {"avroType": "enum", "avroSymbols": field.type.symbols}, + ) + else: + # Primitive types + avro_logical_type = field.type.get_prop("logicalType") + if avro_logical_type in LOGICAL_TYPE_MAPPING: + logical_type = LOGICAL_TYPE_MAPPING[avro_logical_type] + precision = getattr(field.type, 'precision', None) + scale = getattr(field.type, 'scale', None) + prop = create_property( + name=field.name, + logical_type=logical_type, + physical_type=field.type.type, + description=field.doc, + required=True, + precision=precision, + scale=scale, + custom_properties=custom_props if custom_props else None, + ) + else: + logical_type = map_type_from_avro(field.type.type) + prop = create_property( + name=field.name, + logical_type=logical_type, + physical_type=field.type.type, + description=field.doc, + required=True, + custom_properties=custom_props if custom_props else None, + ) - Args: - array_schema: The Avro array schema. + return prop - Returns: - Field: The imported field representing the array items. 
- """ - items = Field() - for prop in array_schema.other_props: - items.__setattr__(prop, array_schema.other_props[prop]) +def import_avro_array_items(array_schema: avro.schema.ArraySchema) -> SchemaProperty: + """Import Avro array items as an ODCS SchemaProperty.""" if array_schema.items.type == "record": - items.type = "object" - items.fields = import_record_fields(array_schema.items.fields) + nested_properties = import_record_fields(array_schema.items.fields) + return create_property( + name="items", + logical_type="object", + physical_type="record", + properties=nested_properties, + ) elif array_schema.items.type == "array": - items.type = "array" - items.items = import_avro_array_items(array_schema.items) - else: # primitive type - items.type = map_type_from_avro(array_schema.items.type) - - return items - - -def import_avro_map_values(map_schema: avro.schema.MapSchema) -> Field: - """ - Import Avro map values and convert them to a data contract field. - - Args: - map_schema: The Avro map schema. + items_prop = import_avro_array_items(array_schema.items) + return create_property( + name="items", + logical_type="array", + physical_type="array", + items=items_prop, + ) + else: + logical_type = map_type_from_avro(array_schema.items.type) + return create_property( + name="items", + logical_type=logical_type, + physical_type=array_schema.items.type, + ) - Returns: - Field: The imported field representing the map values. - """ - values = Field() - for prop in map_schema.other_props: - values.__setattr__(prop, map_schema.other_props[prop]) +def import_avro_map_values(map_schema: avro.schema.MapSchema) -> SchemaProperty: + """Import Avro map values as an ODCS SchemaProperty.""" if map_schema.values.type == "record": - values.type = "object" - values.fields = import_record_fields(map_schema.values.fields) + nested_properties = import_record_fields(map_schema.values.fields) + return create_property( + name="values", + logical_type="object", + physical_type="record", + properties=nested_properties, + ) elif map_schema.values.type == "array": - values.type = "array" - values.items = import_avro_array_items(map_schema.values) - else: # primitive type - values.type = map_type_from_avro(map_schema.values.type) - - return values - - -def import_type_of_optional_field(field: avro.schema.Field) -> str: - """ - Determine the type of optional field in an Avro union. - - Args: - field: The Avro field with a union type. + items_prop = import_avro_array_items(map_schema.values) + return create_property( + name="values", + logical_type="array", + physical_type="array", + items=items_prop, + ) + else: + logical_type = map_type_from_avro(map_schema.values.type) + return create_property( + name="values", + logical_type=logical_type, + physical_type=map_schema.values.type, + ) - Returns: - str: The mapped type of the non-null field in the union. - Raises: - DataContractException: If no non-null type is found in the union. 
- """ +def import_type_of_optional_field(field: avro.schema.Field) -> tuple[str, str]: + """Determine the type of optional field in an Avro union.""" for field_type in field.type.schemas: if field_type.type != "null": - logical_type = field_type.get_prop("logicalType") - if logical_type and logical_type in LOGICAL_TYPE_MAPPING: - return LOGICAL_TYPE_MAPPING[logical_type] + avro_logical_type = field_type.get_prop("logicalType") + if avro_logical_type and avro_logical_type in LOGICAL_TYPE_MAPPING: + return LOGICAL_TYPE_MAPPING[avro_logical_type], field_type.type else: - return map_type_from_avro(field_type.type) + return map_type_from_avro(field_type.type), field_type.type + raise DataContractException( type="schema", result="failed", @@ -256,15 +288,7 @@ def import_type_of_optional_field(field: avro.schema.Field) -> str: def get_record_from_union_field(field: avro.schema.Field) -> avro.schema.RecordSchema | None: - """ - Get the record schema from a union field. - - Args: - field: The Avro field with a union type. - - Returns: - The record schema if found, None otherwise. - """ + """Get the record schema from a union field.""" for field_type in field.type.schemas: if field_type.type == "record": return field_type @@ -272,15 +296,7 @@ def get_record_from_union_field(field: avro.schema.Field) -> avro.schema.RecordS def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySchema | None: - """ - Get the array schema from a union field. - - Args: - field: The Avro field with a union type. - - Returns: - The array schema if found, None otherwise. - """ + """Get the array schema from a union field.""" for field_type in field.type.schemas: if field_type.type == "array": return field_type @@ -288,15 +304,7 @@ def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySch def get_enum_from_union_field(field: avro.schema.Field) -> avro.schema.EnumSchema | None: - """ - Get the enum schema from a union field. - - Args: - field: The Avro field with a union type. - - Returns: - The enum schema if found, None otherwise. - """ + """Get the enum schema from a union field.""" for field_type in field.type.schemas: if field_type.type == "enum": return field_type @@ -304,48 +312,30 @@ def get_enum_from_union_field(field: avro.schema.Field) -> avro.schema.EnumSchem def map_type_from_avro(avro_type_str: str) -> str: - """ - Map Avro type strings to data contract type strings. - - Args: - avro_type_str (str): The Avro type string. - - Returns: - str: The corresponding data contract type string. - - Raises: - DataContractException: If the Avro type is unsupported. 
- """ - # TODO: ambiguous mapping in the export - if avro_type_str == "null": - return "null" - elif avro_type_str == "string": - return "string" - elif avro_type_str == "bytes": - return "binary" - elif avro_type_str == "double": - return "double" - elif avro_type_str == "float": - return "float" - elif avro_type_str == "int": - return "int" - elif avro_type_str == "long": - return "long" - elif avro_type_str == "boolean": - return "boolean" - elif avro_type_str == "record": - return "record" - elif avro_type_str == "array": - return "array" - elif avro_type_str == "map": - return "map" - elif avro_type_str == "enum": - return "string" - else: - raise DataContractException( - type="schema", - result="failed", - name="Map avro type to data contract type", - reason=f"Unsupported type {avro_type_str} in avro schema.", - engine="datacontract", - ) + """Map Avro type strings to ODCS logical type strings.""" + type_mapping = { + "null": "string", # null type maps to string as placeholder + "string": "string", + "bytes": "array", + "double": "number", + "float": "number", + "int": "integer", + "long": "integer", + "boolean": "boolean", + "record": "object", + "array": "array", + "map": "object", + "enum": "string", + "fixed": "array", + } + + if avro_type_str in type_mapping: + return type_mapping[avro_type_str] + + raise DataContractException( + type="schema", + result="failed", + name="Map avro type to data contract type", + reason=f"Unsupported type {avro_type_str} in avro schema.", + engine="datacontract", + ) diff --git a/datacontract/imports/bigquery_importer.py b/datacontract/imports/bigquery_importer.py index e07904c91..007935eaa 100644 --- a/datacontract/imports/bigquery_importer.py +++ b/datacontract/imports/bigquery_importer.py @@ -2,30 +2,32 @@ import logging from typing import List +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty + from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, +) from datacontract.model.exceptions import DataContractException class BigQueryImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: + self, source: str, import_args: dict + ) -> OpenDataContractStandard: if source is not None: - data_contract_specification = import_bigquery_from_json(data_contract_specification, source) + return import_bigquery_from_json(source) else: - data_contract_specification = import_bigquery_from_api( - data_contract_specification, + return import_bigquery_from_api( import_args.get("bigquery_table"), import_args.get("bigquery_project"), import_args.get("bigquery_dataset"), ) - return data_contract_specification -def import_bigquery_from_json( - data_contract_specification: DataContractSpecification, source: str -) -> DataContractSpecification: +def import_bigquery_from_json(source: str) -> OpenDataContractStandard: try: with open(source, "r") as file: bigquery_schema = json.loads(file.read()) @@ -37,15 +39,14 @@ def import_bigquery_from_json( engine="datacontract", original_exception=e, ) - return convert_bigquery_schema(data_contract_specification, bigquery_schema) + return convert_bigquery_schema(bigquery_schema) def import_bigquery_from_api( - data_contract_specification: DataContractSpecification, bigquery_tables: 
List[str], bigquery_project: str, bigquery_dataset: str, -) -> DataContractSpecification: +) -> OpenDataContractStandard: try: from google.cloud import bigquery except ImportError as e: @@ -63,6 +64,9 @@ def import_bigquery_from_api( if bigquery_tables is None: bigquery_tables = fetch_table_names(client, bigquery_dataset) + odcs = create_odcs() + odcs.schema_ = [] + for table in bigquery_tables: try: api_table = client.get_table("{}.{}.{}".format(bigquery_project, bigquery_dataset, table)) @@ -82,13 +86,14 @@ def import_bigquery_from_api( type="request", result="failed", name="Query bigtable Schema from API", - reason=f"Table {table} bnot found on bigtable schema Project {bigquery_project}, dataset {bigquery_dataset}.", + reason=f"Table {table} not found on bigtable schema Project {bigquery_project}, dataset {bigquery_dataset}.", engine="datacontract", ) - convert_bigquery_schema(data_contract_specification, api_table.to_api_repr()) + schema_obj = convert_bigquery_table_to_schema(api_table.to_api_repr()) + odcs.schema_.append(schema_obj) - return data_contract_specification + return odcs def fetch_table_names(client, dataset: str) -> List[str]: @@ -100,118 +105,196 @@ def fetch_table_names(client, dataset: str) -> List[str]: return table_names -def convert_bigquery_schema( - data_contract_specification: DataContractSpecification, bigquery_schema: dict -) -> DataContractSpecification: - if data_contract_specification.models is None: - data_contract_specification.models = {} +def convert_bigquery_schema(bigquery_schema: dict) -> OpenDataContractStandard: + """Convert a BigQuery schema to ODCS format.""" + odcs = create_odcs() + odcs.schema_ = [convert_bigquery_table_to_schema(bigquery_schema)] + return odcs + - fields = import_table_fields(bigquery_schema.get("schema").get("fields")) +def convert_bigquery_table_to_schema(bigquery_schema: dict): + """Convert a BigQuery table definition to an ODCS SchemaObject.""" + properties = import_table_fields(bigquery_schema.get("schema", {}).get("fields", [])) - # Looking at actual export data, I guess this is always set and friendlyName isn't, though I couldn't say - # what exactly leads to friendlyName being set - table_id = bigquery_schema.get("tableReference").get("tableId") + table_id = bigquery_schema.get("tableReference", {}).get("tableId", "unknown") + description = bigquery_schema.get("description") + title = bigquery_schema.get("friendlyName") + table_type = map_bigquery_type(bigquery_schema.get("type", "TABLE")) - data_contract_specification.models[table_id] = Model( - fields=fields, type=map_bigquery_type(bigquery_schema.get("type")) + schema_obj = create_schema_object( + name=table_id, + physical_type=table_type, + description=description, + properties=properties, ) - # Copy the description, if it exists - if bigquery_schema.get("description") is not None: - data_contract_specification.models[table_id].description = bigquery_schema.get("description") + if title: + schema_obj.businessName = title - # Set the title from friendlyName if it exists - if bigquery_schema.get("friendlyName") is not None: - data_contract_specification.models[table_id].title = bigquery_schema.get("friendlyName") + return schema_obj - return data_contract_specification +def import_table_fields(table_fields) -> List[SchemaProperty]: + """Import BigQuery table fields as ODCS SchemaProperties.""" + properties = [] -def import_table_fields(table_fields): - imported_fields = {} for field in table_fields: field_name = field.get("name") - imported_fields[field_name] = 
Field() - imported_fields[field_name].required = field.get("mode") == "REQUIRED" - imported_fields[field_name].description = field.get("description") - - if field.get("type") == "RECORD": - imported_fields[field_name].type = "object" - imported_fields[field_name].fields = import_table_fields(field.get("fields")) - elif field.get("type") == "STRUCT": - imported_fields[field_name].type = "struct" - imported_fields[field_name].fields = import_table_fields(field.get("fields")) - elif field.get("type") == "RANGE": - # This is a range of date/datetime/timestamp but multiple values - # So we map it to an array - imported_fields[field_name].type = "array" - imported_fields[field_name].items = Field( - type=map_type_from_bigquery(field["rangeElementType"].get("type")) + required = field.get("mode") == "REQUIRED" + description = field.get("description") + field_type = field.get("type") + repeated = field.get("mode") == "REPEATED" + + if field_type == "RECORD": + nested_properties = import_table_fields(field.get("fields", [])) + if repeated: + items_prop = create_property( + name="items", + logical_type="object", + physical_type="RECORD", + properties=nested_properties, + ) + prop = create_property( + name=field_name, + logical_type="array", + description=description, + required=None, + items=items_prop, + ) + else: + prop = create_property( + name=field_name, + logical_type="object", + physical_type="RECORD", + description=description, + required=required if required else None, + properties=nested_properties, + ) + elif field_type == "STRUCT": + nested_properties = import_table_fields(field.get("fields", [])) + if repeated: + items_prop = create_property( + name="items", + logical_type="object", + physical_type="STRUCT", + properties=nested_properties, + ) + prop = create_property( + name=field_name, + logical_type="array", + description=description, + required=None, + items=items_prop, + ) + else: + prop = create_property( + name=field_name, + logical_type="object", + physical_type="STRUCT", + description=description, + required=required if required else None, + properties=nested_properties, + ) + elif field_type == "RANGE": + # Range of date/datetime/timestamp - multiple values, map to array + items_prop = create_property( + name="items", + logical_type=map_type_from_bigquery(field.get("rangeElementType", {}).get("type", "STRING")), + physical_type=field.get("rangeElementType", {}).get("type", "STRING"), ) - else: # primitive type - imported_fields[field_name].type = map_type_from_bigquery(field.get("type")) - - if field.get("type") == "STRING": - # in bigquery both string and bytes have maxLength but in the datacontracts - # spec it is only valid for strings - if field.get("maxLength") is not None: - imported_fields[field_name].maxLength = int(field.get("maxLength")) - - if field.get("type") == "NUMERIC" or field.get("type") == "BIGNUMERIC": - if field.get("precision") is not None: - imported_fields[field_name].precision = int(field.get("precision")) - - if field.get("scale") is not None: - imported_fields[field_name].scale = int(field.get("scale")) - - return imported_fields - - -def map_type_from_bigquery(bigquery_type_str: str): - if bigquery_type_str == "STRING": - return "string" - elif bigquery_type_str == "BYTES": - return "bytes" - elif bigquery_type_str == "INTEGER": - return "int" - elif bigquery_type_str == "INT64": - return "bigint" - elif bigquery_type_str == "FLOAT": - return "float" - elif bigquery_type_str == "FLOAT64": - return "double" - elif bigquery_type_str == "BOOLEAN" or 
bigquery_type_str == "BOOL": - return "boolean" - elif bigquery_type_str == "TIMESTAMP": - return "timestamp" - elif bigquery_type_str == "DATE": - return "date" - elif bigquery_type_str == "TIME": - return "timestamp_ntz" - elif bigquery_type_str == "DATETIME": - return "timestamp" - elif bigquery_type_str == "NUMERIC": - return "numeric" - elif bigquery_type_str == "BIGNUMERIC": - return "double" - elif bigquery_type_str == "GEOGRAPHY": - return "object" - elif bigquery_type_str == "JSON": - return "object" - else: - raise DataContractException( - type="schema", - result="failed", - name="Map bigquery type to data contract type", - reason=f"Unsupported type {bigquery_type_str} in bigquery json definition.", - engine="datacontract", - ) + prop = create_property( + name=field_name, + logical_type="array", + physical_type="RANGE", + description=description, + required=required if required else None, + items=items_prop, + ) + else: + logical_type = map_type_from_bigquery(field_type) + max_length = None + precision = None + scale = None + + if field_type == "STRING" and field.get("maxLength") is not None: + max_length = int(field.get("maxLength")) + + if field_type in ("NUMERIC", "BIGNUMERIC"): + if field.get("precision") is not None: + precision = int(field.get("precision")) + if field.get("scale") is not None: + scale = int(field.get("scale")) + + if repeated: + items_prop = create_property( + name="items", + logical_type=logical_type, + physical_type=field_type, + max_length=max_length, + precision=precision, + scale=scale, + ) + prop = create_property( + name=field_name, + logical_type="array", + description=description, + required=None, + items=items_prop, + ) + else: + prop = create_property( + name=field_name, + logical_type=logical_type, + physical_type=field_type, + description=description, + required=required if required else None, + max_length=max_length, + precision=precision, + scale=scale, + ) + properties.append(prop) + + return properties + + +def map_type_from_bigquery(bigquery_type_str: str) -> str: + """Map BigQuery type to ODCS logical type.""" + type_mapping = { + "STRING": "string", + "BYTES": "array", + "INTEGER": "integer", + "INT64": "integer", + "FLOAT": "number", + "FLOAT64": "number", + "BOOLEAN": "boolean", + "BOOL": "boolean", + "TIMESTAMP": "date", + "DATE": "date", + "TIME": "date", + "DATETIME": "date", + "NUMERIC": "number", + "BIGNUMERIC": "number", + "GEOGRAPHY": "object", + "JSON": "object", + } + + if bigquery_type_str in type_mapping: + return type_mapping[bigquery_type_str] + + raise DataContractException( + type="schema", + result="failed", + name="Map bigquery type to data contract type", + reason=f"Unsupported type {bigquery_type_str} in bigquery json definition.", + engine="datacontract", + ) def map_bigquery_type(bigquery_type: str) -> str: - if bigquery_type == "TABLE" or bigquery_type == "EXTERNAL" or bigquery_type == "SNAPSHOT": + """Map BigQuery table type to ODCS physical type.""" + if bigquery_type in ("TABLE", "EXTERNAL", "SNAPSHOT"): return "table" - elif bigquery_type == "VIEW" or bigquery_type == "MATERIALIZED_VIEW": + elif bigquery_type in ("VIEW", "MATERIALIZED_VIEW"): return "view" else: logger = logging.getLogger(__name__) diff --git a/datacontract/imports/csv_importer.py b/datacontract/imports/csv_importer.py index f58d2fc34..f1191d57c 100644 --- a/datacontract/imports/csv_importer.py +++ b/datacontract/imports/csv_importer.py @@ -1,22 +1,33 @@ import os -from typing import Any, Dict, List +from typing import Dict, List -import 
duckdb +from open_data_contract_standard.model import OpenDataContractStandard from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, + create_server, +) class CsvImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_csv(data_contract_specification, source) + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_csv(source) def import_csv( - data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False -) -> DataContractSpecification: + source: str, include_examples: bool = False +) -> OpenDataContractStandard: + """Import a CSV file and create an ODCS data contract.""" + try: + import duckdb + except ImportError: + raise ImportError("duckdb is required for CSV import. Install with: pip install datacontract-cli[duckdb]") + # use the file name as table name table_name = os.path.splitext(os.path.basename(source))[0] @@ -28,8 +39,7 @@ def import_csv( dialect = con.sql(f"SELECT * FROM sniff_csv('{source}', sample_size = 1000);").fetchnumpy() tbl = con.table(table_name) - if data_contract_specification.servers is None: - data_contract_specification.servers = {} + odcs = create_odcs() delimiter = None if dialect is None else dialect["Delimiter"][0] @@ -38,9 +48,19 @@ def import_csv( else: dc_types = [map_type_from_duckdb(str(x)) for x in tbl.dtypes] - data_contract_specification.servers["production"] = Server( - type="local", path=source, format="csv", delimiter=delimiter - ) + # Add server + odcs.servers = [ + create_server( + name="production", + server_type="local", + path=source, + format="csv", + ) + ] + # Set delimiter as custom property on server if needed + if delimiter: + from open_data_contract_standard.model import CustomProperty + odcs.servers[0].customProperties = [CustomProperty(property="delimiter", value=delimiter)] rowcount = tbl.shape[0] @@ -74,70 +94,89 @@ def import_csv( elif res[2] == res[0]: formats[field_name] = "uuid" - fields = {} + properties = [] for i in range(tbl.shape[1]): field_name = tbl.columns[i] dc_type = dc_types[i] - ## specifying "integer" rather than "bigint" looks nicer + # specifying "integer" rather than "bigint" looks nicer if ( - dc_type == "bigint" + dc_type == "integer" and tallies[("max", field_name)] <= 2147483647 and tallies[("min", field_name)] >= -2147483648 ): dc_type = "integer" - field: Dict[str, Any] = {"type": dc_type, "format": formats.get(field_name, None)} + format_val = formats.get(field_name, None) + required = tallies[("count", field_name)] == rowcount - if tallies[("count", field_name)] == rowcount: - field["required"] = True - if dc_type not in ["boolean", "bytes"]: - distinct_values = tbl.count(f'DISTINCT "{field_name}"').fetchone()[0] # type: ignore + unique = None + if dc_type not in ["boolean", "array"]: + distinct_values = tbl.count(f'DISTINCT "{field_name}"').fetchone()[0] if distinct_values > 0 and distinct_values == tallies[("count", field_name)]: - field["unique"] = True - s = samples.get(field_name, None) - if s is not None: - field["examples"] = s - if dc_type in ["integer", "bigint", "float", "double"]: - field["minimum"] = tallies[("min", field_name)] - field["maximum"] = tallies[("max", field_name)] - - fields[field_name] = 
field - - model_examples = None - if include_examples: - model_examples = con.sql(f"""SELECT DISTINCT * FROM "{table_name}" USING SAMPLE 5 ROWS;""").fetchall() - - data_contract_specification.models[table_name] = Model( - type="table", description="Generated model of " + source, fields=fields, examples=model_examples + unique = True + + examples = samples.get(field_name, None) + + minimum = None + maximum = None + if dc_type in ["integer", "number"]: + minimum = tallies[("min", field_name)] + maximum = tallies[("max", field_name)] + + # Build custom properties for format if present + custom_props = {} + if format_val: + custom_props["format"] = format_val + + prop = create_property( + name=field_name, + logical_type=dc_type, + physical_type=str(tbl.dtypes[i]), + required=required if required else None, + unique=unique, + examples=examples, + minimum=minimum, + maximum=maximum, + custom_properties=custom_props if custom_props else None, + ) + properties.append(prop) + + schema_obj = create_schema_object( + name=table_name, + physical_type="table", + description="Generated model of " + source, + properties=properties, ) - return data_contract_specification + odcs.schema_ = [schema_obj] + + return odcs _duck_db_types = { "BOOLEAN": "boolean", - "BLOB": "bytes", + "BLOB": "array", "TINYINT": "integer", "SMALLINT": "integer", "INTEGER": "integer", - "BIGINT": "bigint", + "BIGINT": "integer", "UTINYINT": "integer", "USMALLINT": "integer", "UINTEGER": "integer", - "UBIGINT": "bigint", - "FLOAT": "float", - "DOUBLE": "double", + "UBIGINT": "integer", + "FLOAT": "number", + "DOUBLE": "number", "VARCHAR": "string", - "TIMESTAMP": "timestamp", + "TIMESTAMP": "date", "DATE": "date", - # TODO: Add support for NULL } -def map_type_from_duckdb(sql_type: None | str): +def map_type_from_duckdb(sql_type: None | str) -> str: + """Map DuckDB type to ODCS logical type.""" if sql_type is None: - return None + return "string" sql_type_normed = sql_type.upper().strip() return _duck_db_types.get(sql_type_normed, "string") diff --git a/datacontract/imports/dbml_importer.py b/datacontract/imports/dbml_importer.py index 6ee3864f7..70568a9de 100644 --- a/datacontract/imports/dbml_importer.py +++ b/datacontract/imports/dbml_importer.py @@ -1,33 +1,35 @@ from typing import List +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from pydbml import Database, PyDBML from pyparsing import ParseException from datacontract.imports.importer import Importer +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, +) from datacontract.imports.sql_importer import map_type_from_sql -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model from datacontract.model.exceptions import DataContractException class DBMLImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - data_contract_specification = import_dbml_from_source( - data_contract_specification, + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_dbml_from_source( source, import_args.get("dbml_schema"), import_args.get("dbml_table"), ) - return data_contract_specification def import_dbml_from_source( - data_contract_specification: DataContractSpecification, source: str, import_schemas: List[str], import_tables: List[str], -) -> DataContractSpecification: +) -> OpenDataContractStandard: try: with 
open(source, "r") as file: dbml_schema = PyDBML(file) @@ -40,65 +42,92 @@ def import_dbml_from_source( original_exception=e, ) - return convert_dbml(data_contract_specification, dbml_schema, import_schemas, import_tables) + return convert_dbml(dbml_schema, import_schemas, import_tables) def convert_dbml( - data_contract_specification: DataContractSpecification, dbml_schema: Database, import_schemas: List[str], import_tables: List[str], -) -> DataContractSpecification: +) -> OpenDataContractStandard: + odcs = create_odcs() + if dbml_schema.project is not None: - data_contract_specification.info.title = dbml_schema.project.name + odcs.name = dbml_schema.project.name - if data_contract_specification.models is None: - data_contract_specification.models = {} + odcs.schema_ = [] for table in dbml_schema.tables: schema_name = table.schema table_name = table.name - # Skip if import schemas or table names are defined - # and the current table doesn't match - # if empty no filtering is done + # Skip if import schemas or table names are defined and current table doesn't match if import_schemas and schema_name not in import_schemas: continue if import_tables and table_name not in import_tables: continue - fields = import_table_fields(table, dbml_schema.refs) + properties = import_table_fields(table, dbml_schema.refs) - data_contract_specification.models[table_name] = Model( - fields=fields, namespace=schema_name, description=table.note.text + schema_obj = create_schema_object( + name=table_name, + physical_type="table", + description=table.note.text if table.note else None, + properties=properties, ) - return data_contract_specification + # Store namespace as custom property + if schema_name: + from open_data_contract_standard.model import CustomProperty + schema_obj.customProperties = [CustomProperty(property="namespace", value=schema_name)] + + odcs.schema_.append(schema_obj) + + return odcs + +def import_table_fields(table, references) -> List[SchemaProperty]: + """Import DBML table fields as ODCS SchemaProperties.""" + properties = [] + pk_position = 1 -def import_table_fields(table, references) -> dict[str, Field]: - imported_fields = {} for field in table.columns: field_name = field.name - imported_fields[field_name] = Field() - imported_fields[field_name].required = field.not_null - imported_fields[field_name].description = field.note.text - imported_fields[field_name].primaryKey = field.pk - imported_fields[field_name].unique = field.unique - # This is an assumption, that these might be valid SQL Types, since - # DBML doesn't really enforce anything other than 'no spaces' in column types - imported_fields[field_name].type = map_type_from_sql(field.type) + required = field.not_null + description = field.note.text if field.note else None + is_primary_key = field.pk + is_unique = field.unique + logical_type = map_type_from_sql(field.type) ref = get_reference(field, references) - if ref is not None: - imported_fields[field_name].references = ref - return imported_fields + custom_props = {} + if ref: + custom_props["references"] = ref + + prop = create_property( + name=field_name, + logical_type=logical_type if logical_type else "string", + physical_type=field.type, + description=description, + required=required if required else None, + primary_key=is_primary_key if is_primary_key else None, + primary_key_position=pk_position if is_primary_key else None, + unique=is_unique if is_unique else None, + custom_properties=custom_props if custom_props else None, + ) + + if is_primary_key: + pk_position += 
1 + + properties.append(prop) + + return properties def get_reference(field, references): - result = None + """Get the reference for a field if it exists.""" for ref in references: ref_table_name = ref.col1[0].table.name ref_col_name = ref.col1[0].name @@ -106,7 +135,6 @@ def get_reference(field, references): field_name = field.name if ref_table_name == field_table_name and ref_col_name == field_name: - result = f"{ref.col2[0].table.name}.{ref.col2[0].name}" - return result + return f"{ref.col2[0].table.name}.{ref.col2[0].name}" - return result + return None diff --git a/datacontract/imports/dbt_importer.py b/datacontract/imports/dbt_importer.py index 2214eb77d..9a11a5aad 100644 --- a/datacontract/imports/dbt_importer.py +++ b/datacontract/imports/dbt_importer.py @@ -1,27 +1,22 @@ import json -from typing import TypedDict +from typing import List, TypedDict from dbt.artifacts.resources.v1.components import ColumnInfo from dbt.contracts.graph.manifest import Manifest from dbt.contracts.graph.nodes import GenericTestNode, ManifestNode, ModelNode from dbt_common.contracts.constraints import ConstraintType +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from datacontract.imports.bigquery_importer import map_type_from_bigquery from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, +) class DBTImportArgs(TypedDict, total=False): - """ - A dictionary representing arguments for importing DBT models. - Makes the DBT Importer more customizable by allowing for flexible filtering - of models and their properties, through wrapping or extending. - - Attributes: - dbt_models: The keys of models to be used in contract. All as default. - resource_types: Nodes listed in resource_types are kept while importing. model as default. - """ - dbt_nodes: list[str] resource_types: list[str] @@ -29,13 +24,11 @@ class DBTImportArgs(TypedDict, total=False): class DbtManifestImporter(Importer): def import_source( self, - data_contract_specification: DataContractSpecification, source: str, import_args: DBTImportArgs, - ) -> DataContractSpecification: + ) -> OpenDataContractStandard: manifest = read_dbt_manifest(manifest_path=source) return import_dbt_manifest( - data_contract_specification=data_contract_specification, manifest=manifest, dbt_nodes=import_args.get("dbt_model", []), resource_types=import_args.get("resource_types", ["model"]), @@ -92,26 +85,25 @@ def _get_references(manifest: Manifest, node: ManifestNode) -> dict[str, str]: def import_dbt_manifest( - data_contract_specification: DataContractSpecification, manifest: Manifest, dbt_nodes: list[str], resource_types: list[str], -) -> DataContractSpecification: - """ - Extracts all relevant information from the manifest, - and puts it in a data contract specification. 
- """ - data_contract_specification.info.title = manifest.metadata.project_name - data_contract_specification.info.dbt_version = manifest.metadata.dbt_version +) -> OpenDataContractStandard: + """Extracts all relevant information from the manifest into an ODCS data contract.""" + odcs = create_odcs() + odcs.name = manifest.metadata.project_name + + # Store dbt version as custom property + from open_data_contract_standard.model import CustomProperty + odcs.customProperties = [CustomProperty(property="dbt_version", value=manifest.metadata.dbt_version)] + adapter_type = manifest.metadata.adapter_type - data_contract_specification.models = data_contract_specification.models or {} + odcs.schema_ = [] + for node in manifest.nodes.values(): - # Only intressted in processing models. if node.resource_type not in resource_types: continue - # To allow args stored in dbt_models to filter relevant models. - # If dbt_models is empty, use all models. if dbt_nodes and node.name not in dbt_nodes: continue @@ -123,30 +115,82 @@ def import_dbt_manifest( if len(primary_keys) == 1: primary_key = primary_keys[0] - dc_model = Model( + properties = create_fields( + manifest, + model_unique_id=model_unique_id, + columns=node.columns, + primary_key_name=primary_key, + references=references, + adapter_type=adapter_type, + ) + + schema_obj = create_schema_object( + name=node.name, + physical_type="table", description=node.description, - tags=node.tags, - fields=create_fields( - manifest, - model_unique_id=model_unique_id, - columns=node.columns, - primary_key_name=primary_key, - references=references, - adapter_type=adapter_type, - ), + properties=properties, ) + + # Add tags as custom property + if node.tags: + if schema_obj.customProperties is None: + schema_obj.customProperties = [] + schema_obj.customProperties.append(CustomProperty(property="tags", value=",".join(node.tags))) + + # Handle composite primary key if len(primary_keys) > 1: - dc_model.primaryKey = primary_keys + if schema_obj.customProperties is None: + schema_obj.customProperties = [] + schema_obj.customProperties.append(CustomProperty(property="primaryKey", value=",".join(primary_keys))) - data_contract_specification.models[node.name] = dc_model + odcs.schema_.append(schema_obj) - return data_contract_specification + return odcs def convert_data_type_by_adapter_type(data_type: str, adapter_type: str) -> str: if adapter_type == "bigquery": return map_type_from_bigquery(data_type) - return data_type + return map_dbt_type_to_odcs(data_type) + + +def map_dbt_type_to_odcs(data_type: str) -> str: + """Map dbt data type to ODCS logical type.""" + if not data_type: + return "string" + + data_type_lower = data_type.lower() + + type_mapping = { + "string": "string", + "varchar": "string", + "text": "string", + "char": "string", + "int": "integer", + "integer": "integer", + "bigint": "integer", + "smallint": "integer", + "float": "number", + "double": "number", + "decimal": "number", + "numeric": "number", + "boolean": "boolean", + "bool": "boolean", + "date": "date", + "datetime": "date", + "timestamp": "date", + "time": "string", + "array": "array", + "object": "object", + "struct": "object", + "json": "object", + } + + for key, value in type_mapping.items(): + if data_type_lower.startswith(key): + return value + + return "string" def create_fields( @@ -156,12 +200,12 @@ def create_fields( primary_key_name: str, references: dict[str, str], adapter_type: str, -) -> dict[str, Field]: - fields = { - column.name: create_field(manifest, model_unique_id, column, 
primary_key_name, references, adapter_type) +) -> List[SchemaProperty]: + """Create ODCS SchemaProperties from dbt columns.""" + return [ + create_field(manifest, model_unique_id, column, primary_key_name, references, adapter_type) for column in columns.values() - } - return fields + ] def get_column_tests(manifest: Manifest, model_name: str, column_name: str) -> list[dict[str, str]]: @@ -204,13 +248,9 @@ def create_field( primary_key_name: str, references: dict[str, str], adapter_type: str, -) -> Field: - column_type = convert_data_type_by_adapter_type(column.data_type, adapter_type) if column.data_type else "" - field = Field( - description=column.description, - type=column_type, - tags=column.tags, - ) +) -> SchemaProperty: + """Create an ODCS SchemaProperty from a dbt column.""" + column_type = convert_data_type_by_adapter_type(column.data_type, adapter_type) if column.data_type else "string" all_tests = get_column_tests(manifest, model_unique_id, column.name) @@ -219,22 +259,30 @@ def create_field( required = True if [test for test in all_tests if test["test_type"] == "not_null"]: required = True - if required: - field.required = required unique = False if any(constraint.type == ConstraintType.unique for constraint in column.constraints): unique = True if [test for test in all_tests if test["test_type"] == "unique"]: unique = True - if unique: - field.unique = unique - if column.name == primary_key_name: - field.primaryKey = True + is_primary_key = column.name == primary_key_name + custom_props = {} references_key = f"{manifest.nodes[model_unique_id].name}.{column.name}" if references_key in references: - field.references = references[references_key] - - return field + custom_props["references"] = references[references_key] + if column.tags: + custom_props["tags"] = ",".join(column.tags) + + return create_property( + name=column.name, + logical_type=column_type, + physical_type=column.data_type, + description=column.description, + required=required if required else None, + unique=unique if unique else None, + primary_key=is_primary_key if is_primary_key else None, + primary_key_position=1 if is_primary_key else None, + custom_properties=custom_props if custom_props else None, + ) diff --git a/datacontract/imports/dcs_importer.py b/datacontract/imports/dcs_importer.py new file mode 100644 index 000000000..45d9ed2d1 --- /dev/null +++ b/datacontract/imports/dcs_importer.py @@ -0,0 +1,807 @@ +"""DCS Importer - Converts Data Contract Specification (DCS) to ODCS format.""" + +import json +import logging +from typing import Any, Dict, List, Optional + +from datacontract_specification.model import DataContractSpecification, Field, Model +from datacontract_specification.model import Server as DCSServer +from open_data_contract_standard.model import ( + CustomProperty, + DataQuality, + Description, + OpenDataContractStandard, + Relationship, + SchemaObject, + SchemaProperty, + ServiceLevelAgreementProperty, + Team, +) +from open_data_contract_standard.model import ( + Server as ODCSServer, +) + +from datacontract.export.sql_type_converter import convert_to_sql_type +from datacontract.imports.importer import Importer + +logger = logging.getLogger(__name__) + + +class DcsImporter(Importer): + """Importer for Data Contract Specification (DCS) format.""" + + def import_source(self, source: str, import_args: dict) -> OpenDataContractStandard: + import yaml + + from datacontract.lint.resources import read_resource + + source_str = read_resource(source) + dcs_dict = yaml.safe_load(source_str) + dcs = 
parse_dcs_from_dict(dcs_dict) + return convert_dcs_to_odcs(dcs) + + +def parse_dcs_from_dict(dcs_dict: dict) -> DataContractSpecification: + """Parse a DCS dictionary into a DataContractSpecification object.""" + return DataContractSpecification(**dcs_dict) + + +def convert_dcs_to_odcs(dcs: DataContractSpecification) -> OpenDataContractStandard: + """Convert a DCS data contract to ODCS format.""" + odcs = OpenDataContractStandard( + id=dcs.id, + kind="DataContract", + apiVersion="v3.1.0", + ) + + serverType = "local" + + # Convert basic info + if dcs.info: + odcs.name = dcs.info.title + odcs.version = dcs.info.version + if dcs.info.description: + odcs.description = Description(purpose=dcs.info.description) + if dcs.info.owner: + team = Team(name=dcs.info.owner) + # Add contact info to customProperties if available + if dcs.info.contact: + contact_props = [] + if dcs.info.contact.name: + contact_props.append(CustomProperty(property="contactName", value=dcs.info.contact.name)) + if dcs.info.contact.url: + contact_props.append(CustomProperty(property="contactUrl", value=dcs.info.contact.url)) + if dcs.info.contact.email: + contact_props.append(CustomProperty(property="contactEmail", value=dcs.info.contact.email)) + if contact_props: + team.customProperties = contact_props + odcs.team = team + + # Convert status + if dcs.info and dcs.info.status: + odcs.status = dcs.info.status + + # Convert servers + if dcs.servers: + odcs.servers, serverType = _convert_servers(dcs.servers) + + # Convert models to schema + if dcs.models: + odcs.schema_ = _convert_models_to_schema(dcs.models, dcs.definitions, serverType) + + # Convert service levels to SLA properties + if dcs.servicelevels: + odcs.slaProperties = _convert_servicelevels(dcs.servicelevels) + + # Convert tags + if dcs.tags: + odcs.tags = dcs.tags + + # Convert links to authoritativeDefinitions + if dcs.links: + from open_data_contract_standard.model import AuthoritativeDefinition + + odcs.authoritativeDefinitions = [ + AuthoritativeDefinition(type=key, url=value) for key, value in dcs.links.items() + ] + + # Convert terms to Description fields + if dcs.terms: + if odcs.description is None: + odcs.description = Description() + if dcs.terms.usage: + odcs.description.usage = dcs.terms.usage + if dcs.terms.limitations: + odcs.description.limitations = dcs.terms.limitations + # Convert policies to authoritativeDefinitions + if dcs.terms.policies: + from open_data_contract_standard.model import AuthoritativeDefinition + + policy_defs = [ + AuthoritativeDefinition( + type=p.name, description=getattr(p, "description", None), url=getattr(p, "url", None) + ) + for p in dcs.terms.policies + ] + if odcs.authoritativeDefinitions: + odcs.authoritativeDefinitions.extend(policy_defs) + else: + odcs.authoritativeDefinitions = policy_defs + # Store billing, noticePeriod in customProperties + desc_custom_props = odcs.description.customProperties or [] + if dcs.terms.billing: + desc_custom_props.append(CustomProperty(property="billing", value=dcs.terms.billing)) + if dcs.terms.noticePeriod: + desc_custom_props.append(CustomProperty(property="noticePeriod", value=dcs.terms.noticePeriod)) + if desc_custom_props: + odcs.description.customProperties = desc_custom_props + + return odcs + + +def _convert_servers(dcs_servers: Dict[str, DCSServer]) -> List[ODCSServer]: + """Convert DCS servers dict to ODCS servers list.""" + servers = [] + for server_name, dcs_server in dcs_servers.items(): + odcs_server = ODCSServer( + server=server_name, + type=dcs_server.type, + ) + + # 
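A minimal sketch of the top-level DCS-to-ODCS conversion above, assuming the DCS pydantic model accepts nested dicts for `info` (values are illustrative, not taken from a real contract):

```python
from datacontract_specification.model import DataContractSpecification
from datacontract.imports.dcs_importer import convert_dcs_to_odcs

dcs = DataContractSpecification(
    dataContractSpecification="1.2.1",
    id="urn:datacontract:checkout:orders-latest",
    info={"title": "Orders Latest", "version": "1.0.0", "description": "Latest orders", "owner": "checkout"},
)
odcs = convert_dcs_to_odcs(dcs)
print(odcs.apiVersion, odcs.kind)   # v3.1.0 DataContract
print(odcs.name, odcs.version)      # Orders Latest 1.0.0
print(odcs.description.purpose)     # Latest orders (info.description becomes the purpose)
```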
Copy common attributes + if dcs_server.environment: + odcs_server.environment = dcs_server.environment + if dcs_server.account: + odcs_server.account = dcs_server.account + if dcs_server.database: + odcs_server.database = dcs_server.database + if dcs_server.schema_: + odcs_server.schema_ = dcs_server.schema_ + if dcs_server.format: + odcs_server.format = dcs_server.format + if dcs_server.project: + odcs_server.project = dcs_server.project + if dcs_server.dataset: + odcs_server.dataset = dcs_server.dataset + if dcs_server.path: + odcs_server.path = dcs_server.path + if dcs_server.delimiter: + odcs_server.delimiter = dcs_server.delimiter + if dcs_server.endpointUrl: + odcs_server.endpointUrl = dcs_server.endpointUrl + if dcs_server.location: + odcs_server.location = dcs_server.location + if dcs_server.host: + odcs_server.host = dcs_server.host + if dcs_server.port: + odcs_server.port = dcs_server.port + if dcs_server.catalog: + odcs_server.catalog = dcs_server.catalog + if dcs_server.description: + odcs_server.description = dcs_server.description + if dcs_server.roles: + from open_data_contract_standard.model import Role as ODCSRole + + odcs_server.roles = [ODCSRole(role=r.name, description=r.description) for r in dcs_server.roles] + if dcs_server.topic: + # Store topic in customProperties since ODCS Server doesn't have a topic field + if odcs_server.customProperties is None: + odcs_server.customProperties = [] + odcs_server.customProperties.append(CustomProperty(property="topic", value=dcs_server.topic)) + if getattr(dcs_server, "http_path", None): + # Store http_path in customProperties since ODCS Server doesn't have it + if odcs_server.customProperties is None: + odcs_server.customProperties = [] + odcs_server.customProperties.append(CustomProperty(property="http_path", value=dcs_server.http_path)) + if getattr(dcs_server, "driver", None): + # Store driver in customProperties since ODCS Server doesn't have it + if odcs_server.customProperties is None: + odcs_server.customProperties = [] + odcs_server.customProperties.append(CustomProperty(property="driver", value=dcs_server.driver)) + if getattr(dcs_server, "service_name", None): + odcs_server.serviceName = dcs_server.service_name + + servers.append(odcs_server) + + return servers, odcs_server.type + + +def _convert_models_to_schema( + models: Dict[str, Model], definitions: Dict[str, Field] = None, serverType: str = None +) -> List[SchemaObject]: + """Convert DCS models dict to ODCS schema list.""" + schema = [] + serverType = "" + for model_name, model in models.items(): + if model.type in [ + "snowflake", + "postgres", + "dataframe", + "databricks", + "local", + "s3", + "sqlserver", + "bigquery", + "trino", + "oracle", + None, + ]: + physical_type = convert_to_sql_type(model.type, serverType) + else: + physical_type = model.type + + schema_obj = SchemaObject( + name=model_name, + physicalType=physical_type, + description=model.description, + ) + + # Convert config.*Table to physicalName + if model.config: + physical_name = _get_physical_name_from_config(model.config) + if physical_name: + schema_obj.physicalName = physical_name + + # Store namespace in customProperties for Avro export + if hasattr(model, "namespace") and model.namespace: + if schema_obj.customProperties is None: + schema_obj.customProperties = [] + schema_obj.customProperties.append(CustomProperty(property="namespace", value=model.namespace)) + + # Convert fields to properties + # Pass model-level primaryKey list to set primaryKey and primaryKeyPosition on fields + 
model_primary_keys = model.primaryKey if hasattr(model, "primaryKey") and model.primaryKey else [] + if model.fields: + schema_obj.properties = _convert_fields_to_properties(model.fields, model_primary_keys, definitions) + + # Convert quality rules + if model.quality: + schema_obj.quality = _convert_quality_list(model.quality) + + schema.append(schema_obj) + + return schema + + +def _get_physical_name_from_config(config: Dict[str, Any]) -> Optional[str]: + """Extract physical table name from DCS model config.""" + # Check for server-specific table name config keys + table_config_keys = [ + "postgresTable", + "databricksTable", + "snowflakeTable", + "sqlserverTable", + "bigqueryTable", + "redshiftTable", + "oracleTable", + "mysqlTable", + ] + for key in table_config_keys: + if key in config and config[key]: + return config[key] + return None + + +def _convert_fields_to_properties( + fields: Dict[str, Field], model_primary_keys: List[str] = None, definitions: Dict[str, Field] = None +) -> List[SchemaProperty]: + """Convert DCS fields dict to ODCS properties list.""" + model_primary_keys = model_primary_keys or [] + properties = [] + for field_name, field in fields.items(): + # Determine primaryKeyPosition from model-level primaryKey list + primary_key_position = None + if field_name in model_primary_keys: + primary_key_position = model_primary_keys.index(field_name) + 1 + prop = _convert_field_to_property(field_name, field, primary_key_position, definitions) + properties.append(prop) + return properties + + +def _resolve_field_ref(field: Field, definitions: Dict[str, Field]) -> Field: + """Resolve a field's $ref and merge with field properties.""" + if not field.ref: + return field + + ref_path = field.ref + resolved_data = None + + # Handle file:// references + if ref_path.startswith("file://"): + resolved_data = _resolve_file_ref(ref_path) + # Handle #/definitions/ references + elif ref_path.startswith("#/") and definitions: + resolved_data = _resolve_local_ref(ref_path, definitions) + + if resolved_data is None: + return field + + # Create merged field: resolved values as base, field values override + merged_data = {} + for attr in Field.model_fields.keys(): + resolved_value = resolved_data.get(attr) + field_value = getattr(field, attr, None) + # Field value takes precedence if set (not None and not empty for collections) + if field_value is not None: + if isinstance(field_value, (list, dict)): + if field_value: # Non-empty collection + merged_data[attr] = field_value + elif resolved_value: + merged_data[attr] = resolved_value + else: + merged_data[attr] = field_value + elif resolved_value is not None: + merged_data[attr] = resolved_value + # Clear ref to avoid infinite recursion + merged_data["ref"] = None + return Field(**merged_data) + + +def _resolve_file_ref(ref_path: str) -> Optional[Dict[str, Any]]: + """Resolve a file:// reference, optionally with a JSON pointer path.""" + from urllib.parse import urlparse + + import yaml + + # Split file path and JSON pointer + if "#" in ref_path: + file_url, pointer = ref_path.split("#", 1) + else: + file_url, pointer = ref_path, "" + + try: + # Parse the file:// URL to get the path + parsed = urlparse(file_url) + if parsed.scheme == "file": + file_path = parsed.path + else: + # Not a file:// URL, can't handle + logger.warning(f"Unsupported URL scheme in reference: {ref_path}") + return None + + # Read the file + with open(file_path, "r") as f: + content = f.read() + + data = yaml.safe_load(content) + + if pointer: + # Navigate the JSON pointer 
path + data = _navigate_path(data, pointer) + + return data if isinstance(data, dict) else None + except Exception as e: + logger.warning(f"Failed to resolve file reference {ref_path}: {e}") + return None + + +def _resolve_local_ref(ref_path: str, definitions: Dict[str, Field]) -> Optional[Dict[str, Any]]: + """Resolve a local #/ reference within definitions.""" + if not ref_path.startswith("#/definitions/"): + return None + + # Remove the #/definitions/ prefix + path_after_definitions = ref_path[len("#/definitions/") :] + + # Check for simple case: #/definitions/name + if "/" not in path_after_definitions: + if path_after_definitions in definitions: + definition = definitions[path_after_definitions] + return {attr: getattr(definition, attr, None) for attr in Field.model_fields.keys()} + return None + + # Complex case: #/definitions/name/fields/field_name + parts = path_after_definitions.split("/") + def_name = parts[0] + + if def_name not in definitions: + return None + + definition = definitions[def_name] + + # Navigate remaining path + remaining_path = "/" + "/".join(parts[1:]) + data = {attr: getattr(definition, attr, None) for attr in Field.model_fields.keys()} + # Convert fields dict to nested structure for navigation + if definition.fields: + data["fields"] = {name: _field_to_dict(f) for name, f in definition.fields.items()} + + return _navigate_path(data, remaining_path) + + +def _field_to_dict(field: Field) -> Dict[str, Any]: + """Convert a Field object to a dictionary.""" + return {attr: getattr(field, attr, None) for attr in Field.model_fields.keys()} + + +def _navigate_path(data: Any, path: str) -> Optional[Dict[str, Any]]: + """Navigate a JSON pointer-like path within data.""" + if not path or path == "/": + return data if isinstance(data, dict) else None + + # Remove leading slash and split + parts = path.lstrip("/").split("/") + + current = data + for part in parts: + if isinstance(current, dict) and part in current: + current = current[part] + else: + return None + + return current if isinstance(current, dict) else None + + +def _convert_field_to_property( + field_name: str, field: Field, primary_key_position: int = None, definitions: Dict[str, Field] = None +) -> SchemaProperty: + """Convert a DCS field to an ODCS property.""" + # Resolve $ref if present + field = _resolve_field_ref(field, definitions) + + prop = SchemaProperty(name=field_name) + + # Preserve original type as physicalType and convert to logicalType + if field.type: + prop.physicalType = field.type + prop.logicalType = _convert_type_to_logical_type(field.type) + + # Copy direct attributes + if field.description: + prop.description = field.description + if field.required is not None: + prop.required = field.required + if field.unique is not None: + prop.unique = field.unique + # Set primaryKey from field-level or model-level primaryKey + if field.primaryKey is not None: + prop.primaryKey = field.primaryKey + elif primary_key_position is not None: + prop.primaryKey = True + prop.primaryKeyPosition = primary_key_position + if field.title: + prop.businessName = field.title + if field.classification: + prop.classification = field.classification + if field.tags: + prop.tags = field.tags + if field.examples is not None: + prop.examples = field.examples + if field.example is not None: + prop.examples = [field.example] + # Convert constraints to logicalTypeOptions + logical_type_options = {} + if field.minLength is not None: + logical_type_options["minLength"] = field.minLength + if field.maxLength is not None: + 
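The pointer-navigation helper above only walks dict keys and returns `None` for anything else; a quick illustration of its behavior:

```python
from datacontract.imports.dcs_importer import _navigate_path

definition = {"fields": {"order_id": {"type": "string", "required": True}}}
print(_navigate_path(definition, "/fields/order_id"))  # {'type': 'string', 'required': True}
print(_navigate_path(definition, "/fields/missing"))   # None
print(_navigate_path(definition, "/"))                 # the whole dict is returned unchanged
```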
logical_type_options["maxLength"] = field.maxLength + if field.pattern: + logical_type_options["pattern"] = field.pattern + if field.minimum is not None: + logical_type_options["minimum"] = field.minimum + if field.maximum is not None: + logical_type_options["maximum"] = field.maximum + if field.exclusiveMinimum is not None: + logical_type_options["exclusiveMinimum"] = field.exclusiveMinimum + if field.exclusiveMaximum is not None: + logical_type_options["exclusiveMaximum"] = field.exclusiveMaximum + if field.format: + logical_type_options["format"] = field.format + + if logical_type_options: + prop.logicalTypeOptions = logical_type_options + + # Convert config to customProperties + custom_properties = [] + # Handle enum as quality rule (invalidValues with validValues, mustBe: 0) + quality_rules = [] + if field.enum: + quality_rules.append( + DataQuality( + type="library", + metric="invalidValues", + arguments={"validValues": field.enum}, + mustBe=0, + ) + ) + if field.pii is not None: + custom_properties.append(CustomProperty(property="pii", value=str(field.pii))) + if field.precision is not None: + custom_properties.append(CustomProperty(property="precision", value=str(field.precision))) + if field.scale is not None: + custom_properties.append(CustomProperty(property="scale", value=str(field.scale))) + if field.config: + # Server-specific type overrides physicalType + server_type_keys = [ + "oracleType", + "snowflakeType", + "postgresType", + "bigqueryType", + "databricksType", + "sqlserverType", + "trinoType", + "physicalType", + ] + for key in server_type_keys: + if key in field.config: + prop.physicalType = field.config[key] + break + + for key, value in field.config.items(): + # Use JSON serialization for lists and dicts to preserve structure + if isinstance(value, (list, dict)): + custom_properties.append(CustomProperty(property=key, value=json.dumps(value))) + else: + custom_properties.append(CustomProperty(property=key, value=str(value))) + + # Convert references to relationships + if field.references: + prop.relationships = [Relationship(type="foreignKey", to=field.references)] + + # Convert nested fields (for object types) + if field.fields: + prop.properties = _convert_fields_to_properties(field.fields, None, definitions) + + # Convert items (for array types) + if field.items: + prop.items = _convert_field_to_property("item", field.items, None, definitions) + + # Convert keys/values (for map types) - store types in customProperties + if field.keys or field.values: + if field.keys and field.keys.type: + custom_properties.append( + CustomProperty(property="mapKeyType", value=_convert_type_to_logical_type(field.keys.type)) + ) + if field.values and field.values.type: + custom_properties.append( + CustomProperty(property="mapValueType", value=_convert_type_to_logical_type(field.values.type)) + ) + # For map with struct values, store the value fields in properties + if field.values.fields: + prop.properties = _convert_fields_to_properties(field.values.fields, None, definitions) + + # Set customProperties after all have been added + if custom_properties: + prop.customProperties = custom_properties + + # Convert quality rules (merge enum quality rule with field-level quality) + if field.quality: + quality_rules.extend(_convert_quality_list(field.quality)) + if quality_rules: + prop.quality = quality_rules + + # Convert lineage + if field.lineage: + if hasattr(field.lineage, "inputFields") and field.lineage.inputFields: + prop.transformSourceObjects = [ + 
f"{f.namespace}.{f.name}.{f.field}" + if hasattr(f, "namespace") and f.namespace + else f"{f.name}.{f.field}" + for f in field.lineage.inputFields + ] + if hasattr(field.lineage, "transformationDescription") and field.lineage.transformationDescription: + prop.transformDescription = field.lineage.transformationDescription + if hasattr(field.lineage, "transformationType") and field.lineage.transformationType: + prop.transformLogic = field.lineage.transformationType + + return prop + + +def _convert_type_to_logical_type(dcs_type: str) -> str: + """Convert DCS type to ODCS logical type.""" + if dcs_type is None: + return "string" + + t = dcs_type.lower() + + # Map DCS types to ODCS logical types + type_mapping = { + "string": "string", + "text": "string", + "varchar": "string", + "char": "string", + "integer": "integer", + "int": "integer", + "long": "integer", + "bigint": "integer", + "float": "number", + "double": "number", + "decimal": "number", + "numeric": "number", + "number": "number", + "boolean": "boolean", + "bool": "boolean", + "timestamp": "timestamp", + "timestamp_tz": "timestamp", + "timestamp_ntz": "timestamp", + "date": "date", + "time": "string", # not supported in ODCS + "datetime": "timestamp", + "array": "array", + "object": "object", + "record": "object", + "struct": "object", + "map": "object", + "bytes": "string", # not supported in ODCS + "binary": "string", # not supported in ODCS + "null": "string", # not supported in ODCS + } + + return type_mapping.get(t, t) + + +def _convert_quality_list(quality_list: list) -> List[DataQuality]: + """Convert DCS quality list to ODCS DataQuality list.""" + if not quality_list: + return [] + + result = [] + for q in quality_list: + if q is None: + continue + dq = DataQuality(type=getattr(q, "type", None)) + if hasattr(q, "description") and q.description: + dq.description = q.description + if hasattr(q, "query") and q.query: + dq.query = q.query + if hasattr(q, "metric") and q.metric: + dq.metric = q.metric + if hasattr(q, "mustBe") and q.mustBe is not None: + dq.mustBe = q.mustBe + if hasattr(q, "mustNotBe") and q.mustNotBe is not None: + dq.mustNotBe = q.mustNotBe + if hasattr(q, "mustBeGreaterThan") and q.mustBeGreaterThan is not None: + dq.mustBeGreaterThan = q.mustBeGreaterThan + if hasattr(q, "mustBeGreaterOrEqualTo") and q.mustBeGreaterOrEqualTo is not None: + dq.mustBeGreaterOrEqualTo = q.mustBeGreaterOrEqualTo + if hasattr(q, "mustBeGreaterThanOrEqualTo") and q.mustBeGreaterThanOrEqualTo is not None: + dq.mustBeGreaterOrEqualTo = q.mustBeGreaterThanOrEqualTo + if hasattr(q, "mustBeLessThan") and q.mustBeLessThan is not None: + dq.mustBeLessThan = q.mustBeLessThan + if hasattr(q, "mustBeLessOrEqualTo") and q.mustBeLessOrEqualTo is not None: + dq.mustBeLessOrEqualTo = q.mustBeLessOrEqualTo + if hasattr(q, "mustBeLessThanOrEqualTo") and q.mustBeLessThanOrEqualTo is not None: + dq.mustBeLessOrEqualTo = q.mustBeLessThanOrEqualTo + if hasattr(q, "mustBeBetween") and q.mustBeBetween is not None: + dq.mustBeBetween = q.mustBeBetween + if hasattr(q, "mustNotBeBetween") and q.mustNotBeBetween is not None: + dq.mustNotBeBetween = q.mustNotBeBetween + if hasattr(q, "engine") and q.engine: + dq.engine = q.engine + if hasattr(q, "implementation") and q.implementation: + dq.implementation = q.implementation + + result.append(dq) + + return result + + +def _convert_servicelevels(servicelevels: Any) -> List[ServiceLevelAgreementProperty]: + """Convert DCS service levels to ODCS SLA properties.""" + sla_properties = [] + + if 
hasattr(servicelevels, "availability") and servicelevels.availability: + sla_properties.append( + ServiceLevelAgreementProperty( + property="generalAvailability", + value=servicelevels.availability.description + if hasattr(servicelevels.availability, "description") + else str(servicelevels.availability), + ) + ) + + if hasattr(servicelevels, "retention") and servicelevels.retention: + retention = servicelevels.retention + period = retention.period if hasattr(retention, "period") else str(retention) + element = retention.timestampField if hasattr(retention, "timestampField") else None + sla_properties.append( + ServiceLevelAgreementProperty( + property="retention", + value=period, + element=element, + ) + ) + + if hasattr(servicelevels, "freshness") and servicelevels.freshness: + freshness = servicelevels.freshness + if ( + hasattr(freshness, "threshold") + and freshness.threshold + and hasattr(freshness, "timestampField") + and freshness.timestampField + ): + value, unit = _parse_iso8601_duration(freshness.threshold) + if value is not None and unit is not None: + sla_properties.append( + ServiceLevelAgreementProperty( + property="freshness", + value=value, + unit=unit, + element=freshness.timestampField, + ) + ) + + if hasattr(servicelevels, "latency") and servicelevels.latency: + latency = servicelevels.latency + if hasattr(latency, "threshold") and latency.threshold: + value, unit = _parse_iso8601_duration(latency.threshold) + if value is not None and unit is not None: + element = None + if hasattr(latency, "sourceTimestampField") and latency.sourceTimestampField: + element = latency.sourceTimestampField + sla_properties.append( + ServiceLevelAgreementProperty( + property="latency", + value=value, + unit=unit, + element=element, + ) + ) + + if hasattr(servicelevels, "frequency") and servicelevels.frequency: + frequency = servicelevels.frequency + freq_value = ( + frequency.interval + if hasattr(frequency, "interval") and frequency.interval + else (frequency.cron if hasattr(frequency, "cron") else None) + ) + if freq_value: + sla_properties.append( + ServiceLevelAgreementProperty( + property="frequency", + value=freq_value, + ) + ) + + if hasattr(servicelevels, "support") and servicelevels.support: + support = servicelevels.support + support_value = ( + support.time + if hasattr(support, "time") and support.time + else (support.description if hasattr(support, "description") else None) + ) + if support_value: + sla_properties.append( + ServiceLevelAgreementProperty( + property="support", + value=support_value, + ) + ) + + if hasattr(servicelevels, "backup") and servicelevels.backup: + backup = servicelevels.backup + backup_value = ( + backup.interval + if hasattr(backup, "interval") and backup.interval + else (backup.cron if hasattr(backup, "cron") else None) + ) + if backup_value: + sla_properties.append( + ServiceLevelAgreementProperty( + property="backup", + value=backup_value, + ) + ) + + return sla_properties + + +def _parse_iso8601_duration(duration: str) -> tuple: + """Parse ISO 8601 duration (e.g., PT1H, P1D) to value and unit.""" + import re + + if not duration: + return None, None + + # Remove P and T prefixes + duration = duration.upper().replace("P", "").replace("T", "") + + # Match patterns like 1H, 30M, 1D + match = re.match(r"(\d+)([DHMS])", duration) + if match: + value = int(match.group(1)) + unit_char = match.group(2) + unit_map = {"D": "d", "H": "h", "M": "m", "S": "s"} + return value, unit_map.get(unit_char, "d") + + return None, None diff --git 
a/datacontract/imports/excel_importer.py b/datacontract/imports/excel_importer.py index 8502a2b0d..df18bae00 100644 --- a/datacontract/imports/excel_importer.py +++ b/datacontract/imports/excel_importer.py @@ -16,15 +16,13 @@ ServiceLevelAgreementProperty, Support, Team, + TeamMember, ) from openpyxl.cell.cell import Cell from openpyxl.workbook.workbook import Workbook from openpyxl.worksheet.worksheet import Worksheet from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import ( - DataContractSpecification, -) from datacontract.model.exceptions import DataContractException logger = logging.getLogger(__name__) @@ -33,10 +31,9 @@ class ExcelImporter(Importer): def import_source( self, - data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str, import_args: dict, - ) -> DataContractSpecification | OpenDataContractStandard: + ) -> OpenDataContractStandard: return import_excel_as_odcs(source) @@ -540,7 +537,7 @@ def import_team(workbook: Workbook) -> Optional[List[Team]]: if (not (username or name or role)) or row_idx == team_range[0] - 1: continue - team_member = Team( + team_member = TeamMember( username=username, name=name, description=get_cell_value(row, headers.get("description")), @@ -702,7 +699,7 @@ def import_servers(workbook) -> Optional[List[Server]]: elif server_type == "postgres": server.database = get_server_cell_value(workbook, sheet, "servers.postgres.database", index) server.host = get_server_cell_value(workbook, sheet, "servers.postgres.host", index) - server.port = get_server_cell_value(workbook, sheet, "servers.postgres.port", index) + server.port = parse_integer(get_server_cell_value(workbook, sheet, "servers.postgres.port", index)) server.schema_ = get_server_cell_value(workbook, sheet, "servers.postgres.schema", index) elif server_type == "s3": server.delimiter = get_server_cell_value(workbook, sheet, "servers.s3.delimiter", index) @@ -713,14 +710,18 @@ def import_servers(workbook) -> Optional[List[Server]]: server.account = get_server_cell_value(workbook, sheet, "servers.snowflake.account", index) server.database = get_server_cell_value(workbook, sheet, "servers.snowflake.database", index) server.host = get_server_cell_value(workbook, sheet, "servers.snowflake.host", index) - server.port = get_server_cell_value(workbook, sheet, "servers.snowflake.port", index) + server.port = parse_integer(get_server_cell_value(workbook, sheet, "servers.snowflake.port", index)) server.schema_ = get_server_cell_value(workbook, sheet, "servers.snowflake.schema", index) server.warehouse = get_server_cell_value(workbook, sheet, "servers.snowflake.warehouse", index) elif server_type == "sqlserver": server.database = get_server_cell_value(workbook, sheet, "servers.sqlserver.database", index) server.host = get_server_cell_value(workbook, sheet, "servers.sqlserver.host", index) - server.port = get_server_cell_value(workbook, sheet, "servers.sqlserver.port", index) + server.port = parse_integer(get_server_cell_value(workbook, sheet, "servers.sqlserver.port", index)) server.schema_ = get_server_cell_value(workbook, sheet, "servers.sqlserver.schema", index) + elif server_type == "oracle": + server.host = get_server_cell_value(workbook, sheet, "servers.oracle.host", index) + server.port = parse_integer(get_server_cell_value(workbook, sheet, "servers.oracle.port", index)) + server.serviceName = get_server_cell_value(workbook, sheet, "servers.oracle.servicename", index) else: # Custom server type - grab all 
possible fields server.account = get_server_cell_value(workbook, sheet, "servers.custom.account", index) @@ -733,12 +734,10 @@ def import_servers(workbook) -> Optional[List[Server]]: server.host = get_server_cell_value(workbook, sheet, "servers.custom.host", index) server.location = get_server_cell_value(workbook, sheet, "servers.custom.location", index) server.path = get_server_cell_value(workbook, sheet, "servers.custom.path", index) - server.port = get_server_cell_value(workbook, sheet, "servers.custom.port", index) + server.port = parse_integer(get_server_cell_value(workbook, sheet, "servers.custom.port", index)) server.project = get_server_cell_value(workbook, sheet, "servers.custom.project", index) server.schema_ = get_server_cell_value(workbook, sheet, "servers.custom.schema", index) server.stagingDir = get_server_cell_value(workbook, sheet, "servers.custom.stagingDir", index) - server.table = get_server_cell_value(workbook, sheet, "servers.custom.table", index) - server.view = get_server_cell_value(workbook, sheet, "servers.custom.view", index) server.warehouse = get_server_cell_value(workbook, sheet, "servers.custom.warehouse", index) server.region = get_server_cell_value(workbook, sheet, "servers.custom.region", index) server.regionName = get_server_cell_value(workbook, sheet, "servers.custom.regionName", index) @@ -985,9 +984,15 @@ def parse_threshold_values(threshold_operator: str, threshold_value: str) -> Dic # Single value for other operators try: # Try to parse as number + isFraction = "." in threshold_value if threshold_value.replace(".", "").replace("-", "").isdigit(): - value = Decimal(threshold_value) - threshold_dict[threshold_operator] = value + if isFraction: + value = float(threshold_value) + else: + value = int(threshold_value) + + threshold_dict[threshold_operator] = value + except (ValueError, TypeError) as e: logger.warning(f"Failed to parse threshold value: {threshold_value}, error: {e}") diff --git a/datacontract/imports/glue_importer.py b/datacontract/imports/glue_importer.py index b3063be08..11ccdc4cc 100644 --- a/datacontract/imports/glue_importer.py +++ b/datacontract/imports/glue_importer.py @@ -2,32 +2,26 @@ from typing import Dict, Generator, List import boto3 +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Field, - Model, - Server, +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, + create_server, ) class GlueImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_glue(data_contract_specification, source, import_args.get("glue_table")) + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_glue(source, import_args.get("glue_table")) def get_glue_database(database_name: str): - """Get the details Glue database. - - Args: - database_name (str): glue database to request. 
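The Excel threshold parsing change earlier in this diff now emits plain `int`/`float` values instead of `Decimal`; a standalone check of that branch's logic (re-stated here for clarity, not imported from the module):

```python
def parse_single_threshold(value: str):
    # Mirrors the updated branch: fractional strings become float, whole numbers become int.
    if value.replace(".", "").replace("-", "").isdigit():
        return float(value) if "." in value else int(value)
    return None

assert parse_single_threshold("100") == 100 and isinstance(parse_single_threshold("100"), int)
assert parse_single_threshold("99.5") == 99.5 and isinstance(parse_single_threshold("99.5"), float)
assert parse_single_threshold("n/a") is None
```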
- - Returns: - set: catalogid and locationUri - """ + """Get the details Glue database.""" glue = boto3.client("glue") try: response = glue.get_database(Name=database_name) @@ -35,7 +29,6 @@ def get_glue_database(database_name: str): print(f"Database not found {database_name}.") return (None, None) except Exception as e: - # todo catch all print(f"Error: {e}") return (None, None) @@ -46,31 +39,18 @@ def get_glue_database(database_name: str): def get_glue_tables(database_name: str) -> List[str]: - """Get the list of tables in a Glue database. - - Args: - database_name (str): Glue database to request. - - Returns: - List[str]: List of table names - """ + """Get the list of tables in a Glue database.""" glue = boto3.client("glue") - - # Set the paginator paginator = glue.get_paginator("get_tables") - - # Initialize an empty list to store the table names table_names = [] + try: - # Paginate through the tables for page in paginator.paginate(DatabaseName=database_name, PaginationConfig={"PageSize": 100}): - # Add the tables from the current page to the list table_names.extend([table["Name"] for table in page["TableList"] if "Name" in table]) except glue.exceptions.EntityNotFoundException: print(f"Database {database_name} not found.") return [] except Exception as e: - # todo catch all print(f"Error: {e}") return [] @@ -78,32 +58,20 @@ def get_glue_tables(database_name: str) -> List[str]: def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]: - """Get the schema of a Glue table. - - Args: - database_name (str): Glue database name. - table_name (str): Glue table name. - - Returns: - dict: Table schema - """ - + """Get the schema of a Glue table.""" glue = boto3.client("glue") - # Get the table schema try: response = glue.get_table(DatabaseName=database_name, Name=table_name) except glue.exceptions.EntityNotFoundException: print(f"Table {table_name} not found in database {database_name}.") return [] except Exception as e: - # todo catch all print(f"Error: {e}") return [] table_schema = response["Table"]["StorageDescriptor"]["Columns"] - # when using hive partition keys, the schema is stored in the PartitionKeys field if response["Table"].get("PartitionKeys") is not None: for pk in response["Table"]["PartitionKeys"]: table_schema.append( @@ -118,119 +86,124 @@ def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]: def import_glue( - data_contract_specification: DataContractSpecification, source: str, table_names: List[str], -) -> DataContractSpecification: - """Import the schema of a Glue database. - - Args: - data_contract_specification (DataContractSpecification): The data contract specification to update. - source (str): The name of the Glue database. - table_names (List[str]): List of table names to import. If None, all tables in the database are imported. - - Returns: - DataContractSpecification: The updated data contract specification. 
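A hedged usage sketch of the Glue importer as reworked here; it requires AWS credentials and an existing Glue database, and the database and table names below are made up:

```python
from datacontract.imports.glue_importer import GlueImporter

importer = GlueImporter("glue")
odcs = importer.import_source(source="sales_db", import_args={"glue_table": ["orders"]})

print(odcs.servers[0].type)  # "glue", with account/database/location taken from the Glue catalog
for schema_obj in odcs.schema_ or []:
    print(schema_obj.name, [(p.name, p.logicalType) for p in schema_obj.properties or []])
```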
- """ +) -> OpenDataContractStandard: + """Import the schema of a Glue database.""" catalogid, location_uri = get_glue_database(source) - # something went wrong if catalogid is None: - return data_contract_specification + return create_odcs() if table_names is None: table_names = get_glue_tables(source) - server_kwargs = {"type": "glue", "account": catalogid, "database": source} + odcs = create_odcs() - if location_uri: - server_kwargs["location"] = location_uri + # Create server + server = create_server( + name="production", + server_type="glue", + account=catalogid, + database=source, + location=location_uri, + ) + odcs.servers = [server] - data_contract_specification.servers = { - "production": Server(**server_kwargs), - } + odcs.schema_ = [] for table_name in table_names: - if data_contract_specification.models is None: - data_contract_specification.models = {} - table_schema = get_glue_table_schema(source, table_name) - fields = {} + properties = [] for column in table_schema: - field = create_typed_field(column["Type"]) + prop = create_typed_property(column["Name"], column["Type"]) - # hive partitions are required, but are not primary keys + # Hive partitions are required if column.get("Hive"): - field.required = True + prop.required = True - field.description = column.get("Comment") - fields[column["Name"]] = field + if column.get("Comment"): + prop.description = column.get("Comment") - data_contract_specification.models[table_name] = Model( - type="table", - fields=fields, - ) + properties.append(prop) - return data_contract_specification + schema_obj = create_schema_object( + name=table_name, + physical_type="table", + properties=properties, + ) + odcs.schema_.append(schema_obj) -def create_typed_field(dtype: str) -> Field: - """Create a typed field based on the given data type. + return odcs - Args: - dtype (str): The data type of the field. - Returns: - Field: The created field with the appropriate type. 
- """ - field = Field() +def create_typed_property(name: str, dtype: str) -> SchemaProperty: + """Create a typed SchemaProperty based on the given data type.""" dtype = dtype.strip().lower().replace(" ", "") - # Example: array + if dtype.startswith("array"): - field.type = "array" - field.items = create_typed_field(dtype[6:-1]) - # Example: struct + inner_type = dtype[6:-1] + items_prop = create_typed_property("items", inner_type) + return create_property( + name=name, + logical_type="array", + physical_type=dtype, + items=items_prop, + ) elif dtype.startswith("struct"): - field.type = "struct" + nested_props = [] for f in split_struct(dtype[7:-1]): - field_name, field_key = f.split(":", 1) - field.fields[field_name] = create_typed_field(field_key) - # Example: map + field_name, field_type = f.split(":", 1) + nested_props.append(create_typed_property(field_name, field_type)) + return create_property( + name=name, + logical_type="object", + physical_type="struct", + properties=nested_props, + ) elif dtype.startswith("map"): - field.type = "map" - map_match = re.match(r"map<(.+?),\s*(.+)>", dtype) - if map_match: - key_type = map_match.group(1) - value_type = map_match.group(2) - field.keys = create_typed_field(key_type) - field.values = create_typed_field(value_type) - # Example: decimal(38, 6) or decimal + return create_property( + name=name, + logical_type="object", + physical_type="map", + ) elif dtype.startswith("decimal"): - field.type = "decimal" + precision = None + scale = None decimal_match = re.match(r"decimal\((\d+),\s*(\d+)\)", dtype) - if decimal_match: # if precision specified - field.precision = int(decimal_match.group(1)) - field.scale = int(decimal_match.group(2)) - # Example: varchar(255) or varchar + if decimal_match: + precision = int(decimal_match.group(1)) + scale = int(decimal_match.group(2)) + return create_property( + name=name, + logical_type="number", + physical_type="decimal", + precision=precision, + scale=scale, + ) elif dtype.startswith("varchar"): - field.type = "varchar" + max_length = None if len(dtype) > 7: - field.maxLength = int(dtype[8:-1]) + max_length = int(dtype[8:-1]) + return create_property( + name=name, + logical_type="string", + physical_type="varchar", + max_length=max_length, + ) else: - field.type = map_type_from_sql(dtype) - return field + logical_type = map_glue_type_to_odcs(dtype) + return create_property( + name=name, + logical_type=logical_type, + physical_type=dtype, + ) def split_fields(s: str) -> Generator[str, None, None]: - """Split a string of fields considering nested structures. - - Args: - s (str): The string to split. - - Yields: - str: The next field in the string. - """ + """Split a string of fields considering nested structures.""" counter: int = 0 last: int = 0 for i, x in enumerate(s): @@ -245,39 +218,25 @@ def split_fields(s: str) -> Generator[str, None, None]: def split_struct(s: str) -> List[str]: - """Split a struct string into individual fields. - - Args: - s (str): The struct string to split. - - Returns: - List[str]: List of individual fields in the struct. - """ + """Split a struct string into individual fields.""" return list(split_fields(s=s)) -def map_type_from_sql(sql_type: str) -> str: - """Map an SQL type to a corresponding field type. - - Args: - sql_type (str): The SQL type to map. - - Returns: - str: The corresponding field type. 
- """ +def map_glue_type_to_odcs(sql_type: str) -> str: + """Map a Glue/SQL type to ODCS logical type.""" if sql_type is None: - return None + return "string" sql_type = sql_type.lower() type_mapping = { "string": "string", - "int": "int", - "bigint": "bigint", - "float": "float", - "double": "double", + "int": "integer", + "bigint": "integer", + "float": "number", + "double": "number", "boolean": "boolean", - "timestamp": "timestamp", + "timestamp": "date", "date": "date", } @@ -285,4 +244,4 @@ def map_type_from_sql(sql_type: str) -> str: if sql_type.startswith(prefix): return mapped_type - return "unknown" + return "string" diff --git a/datacontract/imports/iceberg_importer.py b/datacontract/imports/iceberg_importer.py index f63db25fa..2bfd3f95a 100644 --- a/datacontract/imports/iceberg_importer.py +++ b/datacontract/imports/iceberg_importer.py @@ -1,21 +1,24 @@ -from typing import Any, Dict +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from pydantic import ValidationError from pyiceberg import types as iceberg_types from pyiceberg.schema import Schema from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, +) from datacontract.model.exceptions import DataContractException class IcebergImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: + self, source: str, import_args: dict + ) -> OpenDataContractStandard: schema = load_and_validate_iceberg_schema(source) return import_iceberg( - data_contract_specification, schema, import_args.get("iceberg_table"), ) @@ -34,139 +37,139 @@ def load_and_validate_iceberg_schema(source: str) -> Schema: ) -def import_iceberg( - data_contract_specification: DataContractSpecification, schema: Schema, table_name: str -) -> DataContractSpecification: - if data_contract_specification.models is None: - data_contract_specification.models = {} - - model = Model(type="table", title=table_name) +def import_iceberg(schema: Schema, table_name: str) -> OpenDataContractStandard: + """Import an Iceberg schema and create an ODCS data contract.""" + odcs = create_odcs() # Iceberg identifier_fields aren't technically primary keys since Iceberg doesn't support primary keys, - # but they are close enough that we can probably treat them as primary keys on the conversion. - # ref: https://iceberg.apache.org/spec/#identifier-field-ids - # this code WILL NOT support finding nested primary key fields. + # but they are close enough that we can treat them as primary keys on the conversion. identifier_fields_ids = schema.identifier_field_ids + properties = [] + pk_position = 1 + for field in schema.fields: - model_field = _field_from_nested_field(field) + prop = _property_from_nested_field(field) if field.field_id in identifier_fields_ids: - model_field.primaryKey = True - - model.fields[field.name] = model_field - - data_contract_specification.models[table_name] = model - return data_contract_specification + prop.primaryKey = True + prop.primaryKeyPosition = pk_position + pk_position += 1 + properties.append(prop) -def _field_from_nested_field(nested_field: iceberg_types.NestedField) -> Field: - """ - Converts an Iceberg NestedField into a Field object for the data contract. 
- - Args: - nested_field: The Iceberg NestedField to convert. - - Returns: - Field: The generated Field object. - """ - field = Field( - title=nested_field.name, - required=nested_field.required, - config=build_field_config(nested_field), + schema_obj = create_schema_object( + name=table_name or "iceberg_table", + physical_type="table", + properties=properties, ) - if nested_field.doc is not None: - field.description = nested_field.doc - - return _type_from_iceberg_type(field, nested_field.field_type) - - -def _type_from_iceberg_type(field: Field, iceberg_type: iceberg_types.IcebergType) -> Field: - """ - Maps Iceberg data types to the Data Contract type system and updates the field. - - Args: - field: The Field object to update. - iceberg_type: The Iceberg data type to map. - - Returns: - Field: The updated Field object. - """ - field.type = _data_type_from_iceberg(iceberg_type) - - if field.type == "array": - field.items = _type_from_iceberg_type(Field(required=iceberg_type.element_required), iceberg_type.element_type) - - elif field.type == "map": - field.keys = _type_from_iceberg_type(Field(required=True), iceberg_type.key_type) - field.values = _type_from_iceberg_type(Field(required=iceberg_type.value_required), iceberg_type.value_type) - - elif field.type == "object": - field.fields = {nf.name: _field_from_nested_field(nf) for nf in iceberg_type.fields} - - return field - - -def build_field_config(iceberg_field: iceberg_types.NestedField) -> Dict[str, Any]: - config = {} - - if iceberg_field.field_id > 0: - config["icebergFieldId"] = iceberg_field.field_id + odcs.schema_ = [schema_obj] + return odcs + + +def _property_from_nested_field(nested_field: iceberg_types.NestedField) -> SchemaProperty: + """Converts an Iceberg NestedField into an ODCS SchemaProperty.""" + logical_type = _data_type_from_iceberg(nested_field.field_type) + + custom_props = {} + if nested_field.field_id > 0: + custom_props["icebergFieldId"] = nested_field.field_id + if nested_field.initial_default is not None: + custom_props["icebergInitialDefault"] = str(nested_field.initial_default) + if nested_field.write_default is not None: + custom_props["icebergWriteDefault"] = str(nested_field.write_default) + + nested_properties = None + items_prop = None + physical_type = str(nested_field.field_type) + + if logical_type == "array": + items_prop = _type_to_property("items", nested_field.field_type.element_type, nested_field.field_type.element_required) + elif isinstance(nested_field.field_type, iceberg_types.MapType): + # For map types, store key/value types in customProperties and use "map" as physicalType + physical_type = "map" + custom_props["mapKeyType"] = _data_type_from_iceberg(nested_field.field_type.key_type) + custom_props["mapValueType"] = _data_type_from_iceberg(nested_field.field_type.value_type) + custom_props["mapValueRequired"] = str(nested_field.field_type.value_required).lower() + # Handle nested maps in value type + if isinstance(nested_field.field_type.value_type, iceberg_types.MapType): + custom_props["mapValuePhysicalType"] = "map" + custom_props["mapNestedKeyType"] = _data_type_from_iceberg(nested_field.field_type.value_type.key_type) + custom_props["mapNestedValueType"] = _data_type_from_iceberg(nested_field.field_type.value_type.value_type) + custom_props["mapNestedValueRequired"] = str(nested_field.field_type.value_type.value_required).lower() + elif logical_type == "object" and hasattr(nested_field.field_type, "fields"): + nested_properties = [_property_from_nested_field(nf) for nf in 
nested_field.field_type.fields] + + return create_property( + name=nested_field.name, + logical_type=logical_type, + physical_type=physical_type, + description=nested_field.doc, + required=nested_field.required if nested_field.required else None, + properties=nested_properties, + items=items_prop, + custom_properties=custom_props if custom_props else None, + ) - if iceberg_field.initial_default is not None: - config["icebergInitialDefault"] = iceberg_field.initial_default - if iceberg_field.write_default is not None: - config["icebergWriteDefault"] = iceberg_field.write_default +def _type_to_property(name: str, iceberg_type: iceberg_types.IcebergType, required: bool = True) -> SchemaProperty: + """Convert an Iceberg type to an ODCS SchemaProperty.""" + logical_type = _data_type_from_iceberg(iceberg_type) - return config + nested_properties = None + items_prop = None + if logical_type == "array": + items_prop = _type_to_property("items", iceberg_type.element_type, iceberg_type.element_required) + elif logical_type == "object" and hasattr(iceberg_type, "fields"): + nested_properties = [_property_from_nested_field(nf) for nf in iceberg_type.fields] -def _data_type_from_iceberg(type: iceberg_types.IcebergType) -> str: - """ - Convert an Iceberg field type to a datacontract field type + return create_property( + name=name, + logical_type=logical_type, + physical_type=str(iceberg_type), + required=required if required else None, + properties=nested_properties, + items=items_prop, + ) - Args: - type: The Iceberg field type - Returns: - str: The datacontract field type - """ - if isinstance(type, iceberg_types.BooleanType): +def _data_type_from_iceberg(iceberg_type: iceberg_types.IcebergType) -> str: + """Convert an Iceberg field type to an ODCS logical type.""" + if isinstance(iceberg_type, iceberg_types.BooleanType): return "boolean" - if isinstance(type, iceberg_types.IntegerType): + if isinstance(iceberg_type, iceberg_types.IntegerType): return "integer" - if isinstance(type, iceberg_types.LongType): - return "long" - if isinstance(type, iceberg_types.FloatType): - return "float" - if isinstance(type, iceberg_types.DoubleType): - return "double" - if isinstance(type, iceberg_types.DecimalType): - return "decimal" - if isinstance(type, iceberg_types.DateType): + if isinstance(iceberg_type, iceberg_types.LongType): + return "integer" + if isinstance(iceberg_type, iceberg_types.FloatType): + return "number" + if isinstance(iceberg_type, iceberg_types.DoubleType): + return "number" + if isinstance(iceberg_type, iceberg_types.DecimalType): + return "number" + if isinstance(iceberg_type, iceberg_types.DateType): return "date" - if isinstance(type, iceberg_types.TimeType): - # there isn't a great mapping for the iceberg type "time", just map to string for now + if isinstance(iceberg_type, iceberg_types.TimeType): return "string" - if isinstance(type, iceberg_types.TimestampType): - return "timestamp_ntz" - if isinstance(type, iceberg_types.TimestamptzType): - return "timestamp_tz" - if isinstance(type, iceberg_types.StringType): + if isinstance(iceberg_type, iceberg_types.TimestampType): + return "date" + if isinstance(iceberg_type, iceberg_types.TimestamptzType): + return "date" + if isinstance(iceberg_type, iceberg_types.StringType): return "string" - if isinstance(type, iceberg_types.UUIDType): + if isinstance(iceberg_type, iceberg_types.UUIDType): return "string" - if isinstance(type, iceberg_types.BinaryType): - return "bytes" - if isinstance(type, iceberg_types.FixedType): - return "bytes" - if 
isinstance(type, iceberg_types.MapType): - return "map" - if isinstance(type, iceberg_types.ListType): + if isinstance(iceberg_type, iceberg_types.BinaryType): + return "array" + if isinstance(iceberg_type, iceberg_types.FixedType): + return "array" + if isinstance(iceberg_type, iceberg_types.MapType): + return "object" + if isinstance(iceberg_type, iceberg_types.ListType): return "array" - if isinstance(type, iceberg_types.StructType): + if isinstance(iceberg_type, iceberg_types.StructType): return "object" - raise ValueError(f"Unknown Iceberg type: {type}") + raise ValueError(f"Unknown Iceberg type: {iceberg_type}") diff --git a/datacontract/imports/importer.py b/datacontract/imports/importer.py index 61f1d2179..24961fb8c 100644 --- a/datacontract/imports/importer.py +++ b/datacontract/imports/importer.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod from enum import Enum -from datacontract_specification.model import DataContractSpecification from open_data_contract_standard.model import OpenDataContractStandard @@ -12,10 +11,13 @@ def __init__(self, import_format) -> None: @abstractmethod def import_source( self, - data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str, import_args: dict, - ) -> DataContractSpecification | OpenDataContractStandard: + ) -> OpenDataContractStandard: + """Import a data contract from a source. + + All importers now return OpenDataContractStandard (ODCS) format. + """ pass @@ -40,12 +42,3 @@ class ImportFormat(str, Enum): @classmethod def get_supported_formats(cls): return list(map(lambda c: c.value, cls)) - - -class Spec(str, Enum): - datacontract_specification = "datacontract_specification" - odcs = "odcs" - - @classmethod - def get_supported_types(cls): - return list(map(lambda c: c.value, cls)) diff --git a/datacontract/imports/json_importer.py b/datacontract/imports/json_importer.py index 5bbfd65cc..219a910d3 100644 --- a/datacontract/imports/json_importer.py +++ b/datacontract/imports/json_importer.py @@ -3,15 +3,22 @@ import re from typing import Any, Dict, List, Optional, Tuple +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty + from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, + create_server, +) class JsonImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_json(data_contract_specification, source) + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_json(source) def is_ndjson(file_path: str) -> bool: @@ -29,15 +36,12 @@ def is_ndjson(file_path: str) -> bool: return False -def import_json( - data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False -) -> DataContractSpecification: - # use the file name as base model name +def import_json(source: str, include_examples: bool = False) -> OpenDataContractStandard: + """Import a JSON file and create an ODCS data contract.""" base_model_name = os.path.splitext(os.path.basename(source))[0] - # check if file is newline-delimited JSON + # Check if file is newline-delimited JSON if is_ndjson(source): - # load NDJSON data json_data = [] with open(source, "r", encoding="utf-8") as file: for line in file: @@ -48,108 
+52,132 @@ def import_json( except json.JSONDecodeError: continue else: - # load regular JSON data with open(source, "r", encoding="utf-8") as file: json_data = json.load(file) - if data_contract_specification.servers is None: - data_contract_specification.servers = {} - - data_contract_specification.servers["production"] = Server(type="local", path=source, format="json") - - # initialisation - models = {} + odcs = create_odcs() + odcs.servers = [create_server(name="production", server_type="local", path=source, format="json")] if isinstance(json_data, list) and json_data: - # Array of items if all(isinstance(item, dict) for item in json_data[:5]): # Array of objects, as table - fields = {} + properties = [] + field_defs = {} for item in json_data[:20]: for key, value in item.items(): - field_def = generate_field_definition(value, key, base_model_name, models) - if key in fields: - fields[key] = merge_field_definitions(fields[key], field_def) + field_def = generate_field_definition(value, key) + if key in field_defs: + field_defs[key] = merge_field_definitions(field_defs[key], field_def) else: - fields[key] = field_def + field_defs[key] = field_def - models[base_model_name] = { - "type": "table", - "description": f"Generated from JSON array in {source}", - "fields": fields, - "examples": json_data[:3] if include_examples else None, - } + properties = [dict_to_property(name, field_def) for name, field_def in field_defs.items()] + + schema_obj = create_schema_object( + name=base_model_name, + physical_type="table", + description=f"Generated from JSON array in {source}", + properties=properties, + ) else: # Simple array item_type, item_format = infer_array_type(json_data[:20]) - models[base_model_name] = { - "type": "array", - "description": f"Generated from JSON array in {source}", - "items": {"type": item_type, "format": item_format} if item_format else {"type": item_type}, - "examples": [json_data[:5]] if include_examples else None, - } + schema_obj = create_schema_object( + name=base_model_name, + physical_type="array", + description=f"Generated from JSON array in {source}", + ) elif isinstance(json_data, dict): - # Single object - fields = {} + properties = [] for key, value in json_data.items(): - fields[key] = generate_field_definition(value, key, base_model_name, models) - - models[base_model_name] = { - "type": "object", - "description": f"Generated from JSON object in {source}", - "fields": fields, - "examples": [json_data] if include_examples else None, - } + field_def = generate_field_definition(value, key) + properties.append(dict_to_property(key, field_def)) + + schema_obj = create_schema_object( + name=base_model_name, + physical_type="object", + description=f"Generated from JSON object in {source}", + properties=properties, + ) else: - # Primitive value field_type, field_format = determine_type_and_format(json_data) - models[base_model_name] = { - "type": field_type, - "description": f"Generated from JSON primitive in {source}", - "format": field_format, - "examples": [json_data] if include_examples and field_type != "boolean" else None, - } - - for model_name, model_def in models.items(): - model_type = model_def.pop("type") - data_contract_specification.models[model_name] = Model(type=model_type, **model_def) - - return data_contract_specification - - -def generate_field_definition( - value: Any, field_name: str, parent_model: str, models: Dict[str, Dict[str, Any]] -) -> Dict[str, Any]: - """Generate a field definition for a JSON value, creating nested models.""" - + 
schema_obj = create_schema_object( + name=base_model_name, + physical_type=field_type, + description=f"Generated from JSON primitive in {source}", + ) + + odcs.schema_ = [schema_obj] + return odcs + + +def dict_to_property(name: str, field_def: Dict[str, Any]) -> SchemaProperty: + """Convert a field definition dict to an ODCS SchemaProperty.""" + logical_type = map_json_type_to_odcs(field_def.get("type", "string")) + + nested_properties = None + if field_def.get("type") == "object" and "fields" in field_def: + nested_properties = [dict_to_property(k, v) for k, v in field_def["fields"].items()] + + items_prop = None + if field_def.get("type") == "array" and "items" in field_def: + items_prop = dict_to_property("items", field_def["items"]) + + examples = field_def.get("examples") + + custom_props = {} + if field_def.get("format"): + custom_props["format"] = field_def.get("format") + + return create_property( + name=name, + logical_type=logical_type, + physical_type=field_def.get("type"), + examples=examples, + properties=nested_properties, + items=items_prop, + custom_properties=custom_props if custom_props else None, + ) + + +def map_json_type_to_odcs(json_type: str) -> str: + """Map JSON type to ODCS logical type.""" + type_mapping = { + "string": "string", + "integer": "integer", + "number": "number", + "boolean": "boolean", + "array": "array", + "object": "object", + "null": "string", + } + return type_mapping.get(json_type, "string") + + +def generate_field_definition(value: Any, field_name: str) -> Dict[str, Any]: + """Generate a field definition for a JSON value.""" if isinstance(value, dict): - # Handle object fields fields = {} for key, nested_value in value.items(): - fields[key] = generate_field_definition(nested_value, key, parent_model, models) - + fields[key] = generate_field_definition(nested_value, key) return {"type": "object", "fields": fields} elif isinstance(value, list): - # Handle array fields if not value: return {"type": "array", "items": {"type": "string"}} if all(isinstance(item, dict) for item in value): - # Array of objects fields = {} for item in value: for key, nested_value in item.items(): - field_def = generate_field_definition(nested_value, key, parent_model, models) + field_def = generate_field_definition(nested_value, key) if key in fields: fields[key] = merge_field_definitions(fields[key], field_def) else: fields[key] = field_def - return {"type": "array", "items": {"type": "object", "fields": fields}} elif all(isinstance(item, list) for item in value): - # Array of arrays inner_type, inner_format = infer_array_type(value[0]) return { "type": "array", @@ -157,36 +185,24 @@ def generate_field_definition( "type": "array", "items": {"type": inner_type, "format": inner_format} if inner_format else {"type": inner_type}, }, - "examples": value[:5], # Include examples for nested arrays } - else: - # Array of simple or mixed types item_type, item_format = infer_array_type(value) items_def = {"type": item_type} if item_format: items_def["format"] = item_format - field_def = {"type": "array", "items": items_def} - - # Add examples if appropriate sample_values = [item for item in value[:5] if item is not None] if sample_values: field_def["examples"] = sample_values - return field_def - else: - # Handle primitive types field_type, field_format = determine_type_and_format(value) field_def = {"type": field_type} if field_format: field_def["format"] = field_format - - # Add examples if value is not None and field_type != "boolean": field_def["examples"] = [value] - 
return field_def @@ -195,11 +211,9 @@ def infer_array_type(array: List) -> Tuple[str, Optional[str]]: if not array: return "string", None - # if all items are dictionaries with the same structure if all(isinstance(item, dict) for item in array): return "object", None - # if all items are of the same primitive type non_null_items = [item for item in array if item is not None] if not non_null_items: return "null", None @@ -208,7 +222,6 @@ def infer_array_type(array: List) -> Tuple[str, Optional[str]]: types = {t for t, _ in types_and_formats} formats = {f for _, f in types_and_formats if f is not None} - # simplify type combinations if types == {"integer", "number"}: return "number", None if len(types) == 1: @@ -216,17 +229,15 @@ def infer_array_type(array: List) -> Tuple[str, Optional[str]]: format_name = next(iter(formats)) if len(formats) == 1 else None return type_name, format_name if all(t in {"string", "integer", "number", "boolean", "null"} for t in types): - # If all string values have the same format, keep it if len(formats) == 1 and "string" in types: return "string", next(iter(formats)) return "string", None - # Mixed types return "string", None def determine_type_and_format(value: Any) -> Tuple[str, Optional[str]]: - """determine the datacontract type and format for a JSON value.""" + """Determine the datacontract type and format for a JSON value.""" if value is None: return "null", None elif isinstance(value, bool): @@ -264,22 +275,16 @@ def merge_field_definitions(field1: Dict[str, Any], field2: Dict[str, Any]) -> D return field1 if field2.get("type") == "object" and field1.get("type") != "object": return field2 - # Handle type differences + if field1.get("type") != field2.get("type"): - type1, _ = field1.get("type", "string"), field1.get("format") - type2, _ = field2.get("type", "string"), field2.get("format") + type1 = field1.get("type", "string") + type2 = field2.get("type", "string") - if type1 == "integer" and type2 == "number" or type1 == "number" and type2 == "integer": - common_type = "number" - common_format = None + if (type1 == "integer" and type2 == "number") or (type1 == "number" and type2 == "integer"): + result["type"] = "number" elif "string" in [type1, type2]: - common_type = "string" - common_format = None - elif all(t in ["string", "integer", "number", "boolean", "null"] for t in [type1, type2]): - common_type = "string" - common_format = None + result["type"] = "string" elif type1 == "array" and type2 == "array": - # Handle mixed array types items1 = field1.get("items", {}) items2 = field2.get("items", {}) if items1.get("type") == "object" or items2.get("type") == "object": @@ -292,28 +297,21 @@ def merge_field_definitions(field1: Dict[str, Any], field2: Dict[str, Any]) -> D merged_items = merge_field_definitions(items1, items2) return {"type": "array", "items": merged_items} else: - common_type = "array" if "array" in [type1, type2] else "object" - common_format = None + result["type"] = "array" if "array" in [type1, type2] else "object" - result["type"] = common_type - if common_format: - result["format"] = common_format - elif "format" in result: + if "format" in result: del result["format"] - # Merge examples if "examples" in field2: if "examples" in result: combined = result["examples"] + [ex for ex in field2["examples"] if ex not in result["examples"]] - result["examples"] = combined[:5] # Limit to 5 examples + result["examples"] = combined[:5] else: result["examples"] = field2["examples"] - # Handle nested structures if result.get("type") == "array" 
and "items" in field1 and "items" in field2: result["items"] = merge_field_definitions(field1["items"], field2["items"]) elif result.get("type") == "object" and "fields" in field1 and "fields" in field2: - # Merge fields from both objects merged_fields = field1["fields"].copy() for key, field_def in field2["fields"].items(): if key in merged_fields: diff --git a/datacontract/imports/jsonschema_importer.py b/datacontract/imports/jsonschema_importer.py index eb93c2e83..032eefec2 100644 --- a/datacontract/imports/jsonschema_importer.py +++ b/datacontract/imports/jsonschema_importer.py @@ -1,46 +1,54 @@ import json +from typing import Any, Dict, List import fastjsonschema +from open_data_contract_standard.model import DataQuality, OpenDataContractStandard, SchemaProperty from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Field, Model +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, +) from datacontract.model.exceptions import DataContractException class JsonSchemaImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_jsonschema(data_contract_specification, source) + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_jsonschema(source) -def import_jsonschema(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification: - if data_contract_specification.models is None: - data_contract_specification.models = {} - +def import_jsonschema(source: str) -> OpenDataContractStandard: + """Import a JSON Schema and create an ODCS data contract.""" json_schema = load_and_validate_json_schema(source) title = json_schema.get("title", "default_model") description = json_schema.get("description") - type_ = json_schema.get("type") - properties = json_schema.get("properties", {}) + type_ = json_schema.get("type", "object") + json_properties = json_schema.get("properties", {}) required_properties = json_schema.get("required", []) - fields_kwargs = jsonschema_to_args(properties, required_properties) - fields = {name: Field(**kwargs) for name, kwargs in fields_kwargs.items()} + odcs = create_odcs(name=title) + + properties = jsonschema_to_properties(json_properties, required_properties) - model = Model(description=description, type=type_, title=title, fields=fields) - data_contract_specification.models[title] = model + schema_obj = create_schema_object( + name=title, + physical_type=type_, + description=description, + business_name=title, + properties=properties, + ) - definitions = json_schema.get("definitions", {}) - for name, schema in definitions.items(): - kwargs = schema_to_args(schema) - data_contract_specification.definitions[name] = Definition(name=name, **kwargs) + odcs.schema_ = [schema_obj] - return data_contract_specification + return odcs -def load_and_validate_json_schema(source): +def load_and_validate_json_schema(source: str) -> dict: + """Load and validate a JSON Schema file.""" try: with open(source, "r") as file: json_schema = json.loads(file.read()) @@ -67,80 +75,160 @@ def load_and_validate_json_schema(source): return json_schema -def jsonschema_to_args(properties, required_properties): - args = {} - for property, property_schema in properties.items(): - is_required = property in required_properties - args[property] = 
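# Illustrative sketch, not part of the patch: the JSON importer's field
# definitions now become ODCS SchemaProperty objects. The sample payload and
# field names are made up.
from datacontract.imports.json_importer import dict_to_property, generate_field_definition

field_def = generate_field_definition({"id": 1, "tags": ["a", "b"]}, "order")
prop = dict_to_property("order", field_def)
print(prop.logicalType)                      # object
print([p.name for p in prop.properties])     # ['id', 'tags']
print(prop.properties[1].items.logicalType)  # string (items of the 'tags' array)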
schema_to_args(property_schema, is_required) - - return args - - -def schema_to_args(property_schema, is_required: bool = None) -> dict: - direct_mappings = { - "title", - "description", - "format", - "pattern", - "enum", - "tags", - "pii", - "minLength", - "maxLength", - "minimum", - "exclusiveMinimum", - "maximum", - "exclusiveMaximum", - } - - field_kwargs = {key: value for key, value in property_schema.items() if key in direct_mappings} - - if is_required is not None: - field_kwargs["required"] = is_required - - property_type = determine_type(property_schema) - if property_type is not None: - field_kwargs["type"] = property_type - - if property_type == "array": - nested_item_type, nested_items = determine_nested_item_type(property_schema) - - if nested_items is not None: - field_kwargs["items"] = schema_to_args(nested_item_type) - - nested_properties = property_schema.get("properties") - if nested_properties is not None: - # recursive call for complex nested properties - required = property_schema.get("required", []) - field_kwargs["fields"] = jsonschema_to_args(nested_properties, required) - - return field_kwargs - - -def determine_nested_item_type(property_schema): - nested_items = property_schema.get("items") - nested_items_is_list = isinstance(nested_items, list) - if nested_items_is_list and len(nested_items) != 1: - raise DataContractException( - type="schema", - name="Parse json schema", - reason=f"Union types for arrays are currently not supported ({nested_items})", - engine="datacontract", +def jsonschema_to_properties( + json_properties: Dict[str, Any], required_properties: List[str] +) -> List[SchemaProperty]: + """Convert JSON Schema properties to ODCS SchemaProperty list.""" + properties = [] + + for prop_name, prop_schema in json_properties.items(): + is_required = prop_name in required_properties + prop = schema_to_property(prop_name, prop_schema, is_required) + properties.append(prop) + + return properties + + +def schema_to_property( + name: str, prop_schema: Dict[str, Any], is_required: bool = None +) -> SchemaProperty: + """Convert a JSON Schema property to an ODCS SchemaProperty.""" + # Determine the type + property_type = determine_type(prop_schema) + logical_type = map_jsonschema_type_to_odcs(property_type) + + # Extract common attributes + title = prop_schema.get("title") + description = prop_schema.get("description") + pattern = prop_schema.get("pattern") + min_length = prop_schema.get("minLength") + max_length = prop_schema.get("maxLength") + minimum = prop_schema.get("minimum") + maximum = prop_schema.get("maximum") + format_val = prop_schema.get("format") + + # Handle exclusiveMinimum/exclusiveMaximum (draft-04: boolean, draft-06+: number) + exclusive_minimum = None + exclusive_maximum = None + raw_exclusive_min = prop_schema.get("exclusiveMinimum") + raw_exclusive_max = prop_schema.get("exclusiveMaximum") + + if isinstance(raw_exclusive_min, bool): + # Draft-04: boolean, use minimum value as exclusive + if raw_exclusive_min and minimum is not None: + exclusive_minimum = minimum + minimum = None + elif raw_exclusive_min is not None: + # Draft-06+: number value + exclusive_minimum = raw_exclusive_min + + if isinstance(raw_exclusive_max, bool): + # Draft-04: boolean, use maximum value as exclusive + if raw_exclusive_max and maximum is not None: + exclusive_maximum = maximum + maximum = None + elif raw_exclusive_max is not None: + # Draft-06+: number value + exclusive_maximum = raw_exclusive_max + + # Handle enum as quality rule (invalidValues with validValues, 
mustBe: 0) + quality_rules = [] + enum_values = prop_schema.get("enum") + if enum_values: + quality_rules.append( + DataQuality( + type="library", + metric="invalidValues", + arguments={"validValues": enum_values}, + mustBe=0, + ) ) - if nested_items_is_list and len(nested_items) == 1: - nested_item_type = nested_items[0] - elif not nested_items_is_list and nested_items is not None: - nested_item_type = nested_items - return nested_item_type, nested_items - -def determine_type(property_schema): - property_type = property_schema.get("type") - type_is_list = isinstance(property_type, list) - if type_is_list: + # Build custom properties for attributes not directly mapped + custom_props = {} + if prop_schema.get("pii"): + custom_props["pii"] = prop_schema.get("pii") + + # Handle nested properties for objects + nested_properties = None + if property_type == "object": + nested_json_props = prop_schema.get("properties") + if nested_json_props: + nested_required = prop_schema.get("required", []) + nested_properties = jsonschema_to_properties(nested_json_props, nested_required) + + # Handle array items + items_prop = None + if property_type == "array": + nested_items = prop_schema.get("items") + if nested_items: + if isinstance(nested_items, list): + if len(nested_items) == 1: + items_prop = schema_to_property("items", nested_items[0]) + elif len(nested_items) > 1: + raise DataContractException( + type="schema", + name="Parse json schema", + reason=f"Union types for arrays are currently not supported ({nested_items})", + engine="datacontract", + ) + else: + items_prop = schema_to_property("items", nested_items) + + prop = create_property( + name=name, + logical_type=logical_type, + physical_type=property_type, + description=description, + required=is_required if is_required else None, + pattern=pattern, + min_length=min_length, + max_length=max_length, + minimum=minimum, + maximum=maximum, + exclusive_minimum=exclusive_minimum, + exclusive_maximum=exclusive_maximum, + format=format_val, + properties=nested_properties, + items=items_prop, + custom_properties=custom_props if custom_props else None, + ) + + # Set title as businessName if present + if title: + prop.businessName = title + + # Attach quality rules to property + if quality_rules: + prop.quality = quality_rules + + return prop + + +def determine_type(prop_schema: Dict[str, Any]) -> str: + """Determine the type from a JSON Schema property.""" + property_type = prop_schema.get("type") + + if isinstance(property_type, list): + # Handle union types like ["string", "null"] non_null_types = [t for t in property_type if t != "null"] if non_null_types: property_type = non_null_types[0] else: - property_type = None - return property_type + property_type = "string" + + return property_type or "string" + + +def map_jsonschema_type_to_odcs(json_type: str) -> str: + """Map JSON Schema type to ODCS logical type.""" + type_mapping = { + "string": "string", + "integer": "integer", + "number": "number", + "boolean": "boolean", + "array": "array", + "object": "object", + "null": "string", + } + return type_mapping.get(json_type, "string") diff --git a/datacontract/imports/odcs_helper.py b/datacontract/imports/odcs_helper.py new file mode 100644 index 000000000..5c01daf0a --- /dev/null +++ b/datacontract/imports/odcs_helper.py @@ -0,0 +1,306 @@ +"""Helper functions for creating ODCS (OpenDataContractStandard) objects.""" + +from typing import Any, Dict, List + +from open_data_contract_standard.model import ( + CustomProperty, + OpenDataContractStandard, + 
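# Illustrative sketch, not part of the patch: draft-04 boolean exclusiveMinimum
# and draft-06+ numeric exclusiveMinimum are normalised to the same ODCS
# logicalTypeOptions. The property schemas below are made up.
from datacontract.imports.jsonschema_importer import schema_to_property

draft04 = {"type": "integer", "minimum": 0, "exclusiveMinimum": True}
draft06 = {"type": "integer", "exclusiveMinimum": 0}

p4 = schema_to_property("age", draft04)
p6 = schema_to_property("age", draft06)
# Both carry exclusiveMinimum=0 in logicalTypeOptions and no plain minimum.
print(p4.logicalTypeOptions, p6.logicalTypeOptions)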
SchemaObject, + SchemaProperty, + Server, +) + + +def create_odcs( + id: str = None, + name: str = None, + version: str = "1.0.0", + status: str = "draft", +) -> OpenDataContractStandard: + """Create a new OpenDataContractStandard instance with default values.""" + return OpenDataContractStandard( + apiVersion="v3.1.0", + kind="DataContract", + id=id or "my-data-contract", + name=name or "My Data Contract", + version=version, + status=status, + ) + + +def create_schema_object( + name: str, + physical_type: str = "table", + description: str = None, + business_name: str = None, + properties: List[SchemaProperty] = None, +) -> SchemaObject: + """Create a SchemaObject (equivalent to DCS Model).""" + schema = SchemaObject( + name=name, + physicalName=name, + logicalType="object", + physicalType=physical_type, + ) + if description: + schema.description = description + if business_name: + schema.businessName = business_name + if properties: + schema.properties = properties + return schema + + +def create_property( + name: str, + logical_type: str, + physical_type: str = None, + description: str = None, + required: bool = None, + primary_key: bool = None, + primary_key_position: int = None, + unique: bool = None, + classification: str = None, + tags: List[str] = None, + examples: List[Any] = None, + min_length: int = None, + max_length: int = None, + pattern: str = None, + minimum: float = None, + maximum: float = None, + exclusive_minimum: float = None, + exclusive_maximum: float = None, + precision: int = None, + scale: int = None, + format: str = None, + properties: List["SchemaProperty"] = None, + items: "SchemaProperty" = None, + custom_properties: Dict[str, Any] = None, +) -> SchemaProperty: + """Create a SchemaProperty (equivalent to DCS Field).""" + prop = SchemaProperty(name=name) + prop.logicalType = logical_type + + if physical_type: + prop.physicalType = physical_type + if description: + prop.description = description + if required is not None: + prop.required = required + if primary_key: + prop.primaryKey = primary_key + prop.primaryKeyPosition = primary_key_position or 1 + if unique: + prop.unique = unique + if classification: + prop.classification = classification + if tags: + prop.tags = tags + if examples: + prop.examples = examples + if properties: + prop.properties = properties + if items: + prop.items = items + + # Logical type options + logical_type_options = {} + if min_length is not None: + logical_type_options["minLength"] = min_length + if max_length is not None: + logical_type_options["maxLength"] = max_length + if pattern: + logical_type_options["pattern"] = pattern + if minimum is not None: + logical_type_options["minimum"] = minimum + if maximum is not None: + logical_type_options["maximum"] = maximum + if exclusive_minimum is not None: + logical_type_options["exclusiveMinimum"] = exclusive_minimum + if exclusive_maximum is not None: + logical_type_options["exclusiveMaximum"] = exclusive_maximum + if precision is not None: + logical_type_options["precision"] = precision + if scale is not None: + logical_type_options["scale"] = scale + if format: + logical_type_options["format"] = format + if logical_type_options: + prop.logicalTypeOptions = logical_type_options + + # Custom properties + if custom_properties: + prop.customProperties = [ + CustomProperty(property=k, value=v) for k, v in custom_properties.items() + ] + + return prop + + +def create_server( + name: str, + server_type: str, + environment: str = None, + host: str = None, + port: int = None, + database: str = 
None, + schema: str = None, + account: str = None, + project: str = None, + dataset: str = None, + path: str = None, + location: str = None, + catalog: str = None, + topic: str = None, + format: str = None, +) -> Server: + """Create a Server object.""" + server = Server(server=name, type=server_type) + if environment: + server.environment = environment + if host: + server.host = host + if port: + server.port = port + if database: + server.database = database + if schema: + server.schema_ = schema + if account: + server.account = account + if project: + server.project = project + if dataset: + server.dataset = dataset + if path: + server.path = path + if location: + server.location = location + if catalog: + server.catalog = catalog + if topic: + server.topic = topic + if format: + server.format = format + return server + + +# Type mapping from various SQL dialects to ODCS logical types +SQL_TO_LOGICAL_TYPE = { + # String types + "varchar": "string", + "char": "string", + "nvarchar": "string", + "nchar": "string", + "text": "string", + "ntext": "string", + "string": "string", + "clob": "string", + "nclob": "string", + # Integer types + "int": "integer", + "integer": "integer", + "smallint": "integer", + "tinyint": "integer", + "mediumint": "integer", + "int2": "integer", + "int4": "integer", + "bigint": "integer", + "int8": "integer", + "long": "integer", + # Float types + "float": "number", + "real": "number", + "float4": "number", + "float8": "number", + "double": "number", + "double precision": "number", + # Decimal types + "decimal": "number", + "numeric": "number", + "number": "number", + # Boolean types + "boolean": "boolean", + "bool": "boolean", + "bit": "boolean", + # Date/time types + "date": "date", + "timestamp": "date", + "datetime": "date", + "datetime2": "date", + "timestamptz": "date", + "timestamp_tz": "date", + "timestamp_ntz": "date", + "time": "string", + # Binary types + "binary": "array", + "varbinary": "array", + "blob": "array", + "bytes": "array", + "bytea": "array", + # Complex types + "array": "array", + "object": "object", + "struct": "object", + "record": "object", + "map": "object", + "json": "object", + "jsonb": "object", + "variant": "object", +} + + +def map_sql_type_to_logical(sql_type: str) -> str: + """Map a SQL type string to an ODCS logical type.""" + if sql_type is None: + return "string" + + sql_type_lower = sql_type.lower().strip() + + # Handle parameterized types (e.g., VARCHAR(255), DECIMAL(10,2)) + base_type = sql_type_lower.split("(")[0].strip() + + return SQL_TO_LOGICAL_TYPE.get(base_type, "string") + + +# Type mapping from Avro to ODCS logical types +AVRO_TO_LOGICAL_TYPE = { + "null": None, + "string": "string", + "bytes": "array", + "int": "integer", + "long": "integer", + "float": "number", + "double": "number", + "boolean": "boolean", + "record": "object", + "array": "array", + "map": "object", + "enum": "string", + "fixed": "array", +} + + +def map_avro_type_to_logical(avro_type: str) -> str: + """Map an Avro type string to an ODCS logical type.""" + return AVRO_TO_LOGICAL_TYPE.get(avro_type, "string") + + +# Avro logical type mapping +AVRO_LOGICAL_TYPE_MAPPING = { + "decimal": "number", + "date": "date", + "time-millis": "string", + "time-micros": "string", + "timestamp-millis": "date", + "timestamp-micros": "date", + "local-timestamp-millis": "date", + "local-timestamp-micros": "date", + "duration": "string", + "uuid": "string", +} + + +def map_avro_logical_type(avro_logical_type: str) -> str: + """Map an Avro logical type to an ODCS 
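# Illustrative sketch, not part of the patch: building a minimal contract with
# the new odcs_helper functions. The contract id, schema and column are made up.
from datacontract.imports.odcs_helper import (
    create_odcs,
    create_property,
    create_schema_object,
    map_sql_type_to_logical,
)

odcs = create_odcs(id="urn:datacontract:checkout:orders", name="Orders")
order_id = create_property(
    name="order_id",
    logical_type=map_sql_type_to_logical("VARCHAR(36)"),  # -> "string"
    physical_type="VARCHAR(36)",
    required=True,
    primary_key=True,
)
odcs.schema_ = [create_schema_object("orders", physical_type="table", properties=[order_id])]
print(order_id.logicalType, order_id.primaryKeyPosition)  # string 1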
logical type.""" + return AVRO_LOGICAL_TYPE_MAPPING.get(avro_logical_type, "string") diff --git a/datacontract/imports/odcs_importer.py b/datacontract/imports/odcs_importer.py index 3e40ce090..a397ba1c5 100644 --- a/datacontract/imports/odcs_importer.py +++ b/datacontract/imports/odcs_importer.py @@ -1,24 +1,22 @@ import yaml +from open_data_contract_standard.model import OpenDataContractStandard from datacontract.imports.importer import Importer from datacontract.lint.resources import read_resource -from datacontract.model.data_contract_specification import ( - DataContractSpecification, -) from datacontract.model.exceptions import DataContractException class OdcsImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_odcs(data_contract_specification, source) + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_odcs(source) -def import_odcs(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification: +def import_odcs(source: str) -> OpenDataContractStandard: + """Import an ODCS file directly - since ODCS is now the internal format, this is simpler.""" try: - odcs_contract = yaml.safe_load(read_resource(source)) - + odcs_yaml = yaml.safe_load(read_resource(source)) except Exception as e: raise DataContractException( type="schema", @@ -28,10 +26,9 @@ def import_odcs(data_contract_specification: DataContractSpecification, source: original_exception=e, ) - odcs_kind = odcs_contract.get("kind") - odcs_api_version = odcs_contract.get("apiVersion") + odcs_kind = odcs_yaml.get("kind") + odcs_api_version = odcs_yaml.get("apiVersion") - # if odcs_kind is not DataContract throw exception if odcs_kind != "DataContract": raise DataContractException( type="schema", @@ -44,13 +41,12 @@ def import_odcs(data_contract_specification: DataContractSpecification, source: raise DataContractException( type="schema", name="Importing ODCS contract", - reason=f"Unsupported ODCS API version: {odcs_api_version}", + reason=f"Unsupported ODCS API version: {odcs_api_version}. 
Only v3.x is supported.", engine="datacontract", ) elif odcs_api_version.startswith("v3."): - from datacontract.imports.odcs_v3_importer import import_odcs_v3_as_dcs - - return import_odcs_v3_as_dcs(data_contract_specification, source) + # Parse directly as ODCS + return OpenDataContractStandard.model_validate(odcs_yaml) else: raise DataContractException( type="schema", diff --git a/datacontract/imports/odcs_v3_importer.py b/datacontract/imports/odcs_v3_importer.py deleted file mode 100644 index 80ee24904..000000000 --- a/datacontract/imports/odcs_v3_importer.py +++ /dev/null @@ -1,444 +0,0 @@ -import datetime -import logging -import re -from typing import Any, Dict, List -from venv import logger - -from datacontract_specification.model import Quality -from open_data_contract_standard.model import CustomProperty, OpenDataContractStandard, SchemaProperty - -from datacontract.imports.importer import Importer -from datacontract.lint.resources import read_resource -from datacontract.model.data_contract_specification import ( - DATACONTRACT_TYPES, - Availability, - DataContractSpecification, - Field, - Info, - Model, - Retention, - Server, - ServerRole, - ServiceLevel, - Terms, -) -from datacontract.model.exceptions import DataContractException - - -class OdcsImporter(Importer): - def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_odcs_v3_as_dcs(data_contract_specification, source) - - -def import_odcs_v3_as_dcs( - data_contract_specification: DataContractSpecification, source: str -) -> DataContractSpecification: - source_str = read_resource(source) - odcs = parse_odcs_v3_from_str(source_str) - return import_from_odcs(data_contract_specification, odcs) - - -def parse_odcs_v3_from_str(source_str): - try: - odcs = OpenDataContractStandard.from_string(source_str) - except Exception as e: - raise DataContractException( - type="schema", - name="Parse ODCS contract", - reason=f"Failed to parse odcs contract from {source_str}", - engine="datacontract", - original_exception=e, - ) - return odcs - - -def import_from_odcs(data_contract_specification: DataContractSpecification, odcs: OpenDataContractStandard): - data_contract_specification.id = odcs.id - data_contract_specification.info = import_info(odcs) - data_contract_specification.servers = import_servers(odcs) - data_contract_specification.terms = import_terms(odcs) - data_contract_specification.servicelevels = import_servicelevels(odcs) - data_contract_specification.models = import_models(odcs) - data_contract_specification.tags = import_tags(odcs) - return data_contract_specification - - -def import_info(odcs: Any) -> Info: - info = Info() - - info.title = odcs.name if odcs.name is not None else "" - - if odcs.version is not None: - info.version = odcs.version - - # odcs.description.purpose => datacontract.description - if odcs.description is not None and odcs.description.purpose is not None: - info.description = odcs.description.purpose - - # odcs.domain => datacontract.owner - owner = get_owner(odcs.customProperties) - if owner is not None: - info.owner = owner - - # add dataProduct as custom property - if odcs.dataProduct is not None: - info.dataProduct = odcs.dataProduct - - # add tenant as custom property - if odcs.tenant is not None: - info.tenant = odcs.tenant - - return info - - -def import_server_roles(roles: List[Dict]) -> List[ServerRole] | None: - if roles is None: - return None - result = [] - for role in roles: - 
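# Illustrative sketch, not part of the patch: with ODCS as the internal model,
# a v3.x contract is validated directly by pydantic instead of being converted
# to a DataContractSpecification first. The YAML below is a minimal made-up example.
import yaml
from open_data_contract_standard.model import OpenDataContractStandard

contract = OpenDataContractStandard.model_validate(yaml.safe_load("""
apiVersion: v3.0.0
kind: DataContract
id: orders
name: Orders
version: 1.0.0
status: active
"""))
print(contract.id, contract.apiVersion)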
server_role = ServerRole() - server_role.name = role.role - server_role.description = role.description - result.append(server_role) - - -def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None: - if odcs.servers is None: - return None - servers = {} - for odcs_server in odcs.servers: - server_name = odcs_server.server - if server_name is None: - logger.warning("Server name is missing, skipping server") - continue - - server = Server() - server.type = odcs_server.type - server.description = odcs_server.description - server.environment = odcs_server.environment - server.format = odcs_server.format - server.project = odcs_server.project - server.dataset = odcs_server.dataset - server.path = odcs_server.path - server.delimiter = odcs_server.delimiter - server.endpointUrl = odcs_server.endpointUrl - server.location = odcs_server.location - server.account = odcs_server.account - server.database = odcs_server.database - server.schema_ = odcs_server.schema_ - server.host = odcs_server.host - server.port = odcs_server.port - server.catalog = odcs_server.catalog - server.stagingDir = odcs_server.stagingDir - server.topic = getattr(odcs_server, "topic", None) - server.http_path = getattr(odcs_server, "http_path", None) - server.token = getattr(odcs_server, "token", None) - server.driver = getattr(odcs_server, "driver", None) - server.roles = import_server_roles(odcs_server.roles) - server.storageAccount = ( - to_azure_storage_account(odcs_server.location) - if server.type == "azure" and "://" in server.location - else None - ) - - servers[server_name] = server - return servers - - -def import_terms(odcs: Any) -> Terms | None: - if odcs.description is None: - return None - if odcs.description.usage is not None or odcs.description.limitations is not None or odcs.price is not None: - terms = Terms() - if odcs.description.usage is not None: - terms.usage = odcs.description.usage - if odcs.description.limitations is not None: - terms.limitations = odcs.description.limitations - if odcs.price is not None: - terms.billing = f"{odcs.price.priceAmount} {odcs.price.priceCurrency} / {odcs.price.priceUnit}" - - return terms - else: - return None - - -def import_servicelevels(odcs: Any) -> ServiceLevel: - # find the two properties we can map (based on the examples) - sla_properties = odcs.slaProperties if odcs.slaProperties is not None else [] - availability = next((p for p in sla_properties if p.property == "generalAvailability"), None) - retention = next((p for p in sla_properties if p.property == "retention"), None) - - if availability is not None or retention is not None: - servicelevel = ServiceLevel() - - if availability is not None: - value = availability.value - if isinstance(value, datetime.datetime): - value = value.isoformat() - servicelevel.availability = Availability(description=value) - - if retention is not None: - servicelevel.retention = Retention(period=f"{retention.value}{retention.unit}") - - return servicelevel - else: - return None - - -def get_server_type(odcs: OpenDataContractStandard) -> str | None: - servers = import_servers(odcs) - if servers is None or len(servers) == 0: - return None - # get first server from map - server = next(iter(servers.values())) - return server.type - - -def import_models(odcs: Any) -> Dict[str, Model]: - custom_type_mappings = get_custom_type_mappings(odcs.customProperties) - - odcs_schemas = odcs.schema_ if odcs.schema_ is not None else [] - result = {} - - for odcs_schema in odcs_schemas: - schema_name = odcs_schema.name - 
schema_physical_name = odcs_schema.physicalName - schema_description = odcs_schema.description if odcs_schema.description is not None else "" - model_name = schema_physical_name if schema_physical_name is not None else schema_name - model = Model(description=" ".join(schema_description.splitlines()) if schema_description else "", type="table") - model.fields = import_fields(odcs_schema.properties, custom_type_mappings, server_type=get_server_type(odcs)) - if odcs_schema.quality is not None: - model.quality = convert_quality_list(odcs_schema.quality) - model.title = schema_name - if odcs_schema.dataGranularityDescription is not None: - model.config = {"dataGranularityDescription": odcs_schema.dataGranularityDescription} - result[model_name] = model - - return result - - -def convert_quality_list(odcs_quality_list): - """Convert a list of ODCS DataQuality objects to datacontract Quality objects""" - quality_list = [] - - if odcs_quality_list is not None: - for odcs_quality in odcs_quality_list: - quality = Quality(type=odcs_quality.type) - - if odcs_quality.description is not None: - quality.description = odcs_quality.description - if odcs_quality.query is not None: - quality.query = odcs_quality.query - if odcs_quality.mustBe is not None: - quality.mustBe = odcs_quality.mustBe - if odcs_quality.mustNotBe is not None: - quality.mustNotBe = odcs_quality.mustNotBe - if odcs_quality.mustBeGreaterThan is not None: - quality.mustBeGreaterThan = odcs_quality.mustBeGreaterThan - if odcs_quality.mustBeGreaterOrEqualTo is not None: - quality.mustBeGreaterThanOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo - if odcs_quality.mustBeLessThan is not None: - quality.mustBeLessThan = odcs_quality.mustBeLessThan - if odcs_quality.mustBeLessOrEqualTo is not None: - quality.mustBeLessThanOrEqualTo = odcs_quality.mustBeLessOrEqualTo - if odcs_quality.mustBeBetween is not None: - quality.mustBeBetween = odcs_quality.mustBeBetween - if odcs_quality.mustNotBeBetween is not None: - quality.mustNotBeBetween = odcs_quality.mustNotBeBetween - if odcs_quality.engine is not None: - quality.engine = odcs_quality.engine - if odcs_quality.implementation is not None: - quality.implementation = odcs_quality.implementation - if odcs_quality.businessImpact is not None: - quality.model_extra["businessImpact"] = odcs_quality.businessImpact - if odcs_quality.dimension is not None: - quality.model_extra["dimension"] = odcs_quality.dimension - if odcs_quality.rule is not None: - quality.model_extra["rule"] = odcs_quality.rule - if odcs_quality.schedule is not None: - quality.model_extra["schedule"] = odcs_quality.schedule - if odcs_quality.scheduler is not None: - quality.model_extra["scheduler"] = odcs_quality.scheduler - if odcs_quality.severity is not None: - quality.model_extra["severity"] = odcs_quality.severity - if odcs_quality.method is not None: - quality.model_extra["method"] = odcs_quality.method - if odcs_quality.customProperties is not None: - quality.model_extra["customProperties"] = [] - for item in odcs_quality.customProperties: - quality.model_extra["customProperties"].append( - { - "property": item.property, - "value": item.value, - } - ) - - quality_list.append(quality) - - return quality_list - - -def import_field_config(odcs_property: SchemaProperty, server_type=None) -> Dict[str, Any]: - config = {} - if odcs_property.criticalDataElement is not None: - config["criticalDataElement"] = odcs_property.criticalDataElement - if odcs_property.encryptedName is not None: - config["encryptedName"] = 
odcs_property.encryptedName - if odcs_property.partitionKeyPosition is not None: - config["partitionKeyPosition"] = odcs_property.partitionKeyPosition - if odcs_property.partitioned is not None: - config["partitioned"] = odcs_property.partitioned - - if odcs_property.customProperties is not None: - for item in odcs_property.customProperties: - config[item.property] = item.value - - physical_type = odcs_property.physicalType - if physical_type is not None: - if server_type == "postgres" or server_type == "postgresql": - config["postgresType"] = physical_type - elif server_type == "bigquery": - config["bigqueryType"] = physical_type - elif server_type == "snowflake": - config["snowflakeType"] = physical_type - elif server_type == "redshift": - config["redshiftType"] = physical_type - elif server_type == "sqlserver": - config["sqlserverType"] = physical_type - elif server_type == "databricks": - config["databricksType"] = physical_type - else: - config["physicalType"] = physical_type - - return config - - -def has_composite_primary_key(odcs_properties: List[SchemaProperty]) -> bool: - primary_keys = [prop for prop in odcs_properties if prop.primaryKey is not None and prop.primaryKey] - return len(primary_keys) > 1 - - -def import_fields( - odcs_properties: List[SchemaProperty], custom_type_mappings: Dict[str, str], server_type -) -> Dict[str, Field]: - logger = logging.getLogger(__name__) - result = {} - - if odcs_properties is None: - return result - - for odcs_property in odcs_properties: - mapped_type = map_type(odcs_property.logicalType, custom_type_mappings) - if mapped_type is not None: - property_name = odcs_property.name - description = odcs_property.description if odcs_property.description is not None else None - field = Field( - description=" ".join(description.splitlines()) if description is not None else None, - type=mapped_type, - title=odcs_property.businessName, - required=odcs_property.required if odcs_property.required is not None else None, - primaryKey=odcs_property.primaryKey - if not has_composite_primary_key(odcs_properties) and odcs_property.primaryKey is not None - else False, - unique=odcs_property.unique if odcs_property.unique else None, - examples=odcs_property.examples if odcs_property.examples is not None else None, - classification=odcs_property.classification if odcs_property.classification is not None else None, - tags=odcs_property.tags if odcs_property.tags is not None else None, - quality=convert_quality_list(odcs_property.quality), - fields=import_fields(odcs_property.properties, custom_type_mappings, server_type) - if odcs_property.properties is not None - else {}, - config=import_field_config(odcs_property, server_type), - format=getattr(odcs_property, "format", None), - ) - # mapped_type is array - if field.type == "array" and odcs_property.items is not None: - # nested array object - if odcs_property.items.logicalType == "object": - field.items = Field( - type="object", - fields=import_fields(odcs_property.items.properties, custom_type_mappings, server_type), - ) - # array of simple type - elif odcs_property.items.logicalType is not None: - field.items = Field(type=odcs_property.items.logicalType) - - # enum from quality validValues as enum - if field.type == "string": - for q in field.quality: - if hasattr(q, "validValues"): - field.enum = q.validValues - - result[property_name] = field - else: - logger.info( - f"Can't map {odcs_property.name} to the Datacontract Mapping types, as there is no equivalent or special mapping. 
Consider introducing a customProperty 'dc_mapping_{odcs_property.logicalType}' that defines your expected type as the 'value'" - ) - - return result - - -def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None: - if odcs_type is None: - return None - t = odcs_type.lower() - if t in DATACONTRACT_TYPES: - return t - elif custom_mappings.get(t) is not None: - return custom_mappings.get(t) - else: - return None - - -def get_custom_type_mappings(odcs_custom_properties: List[CustomProperty]) -> Dict[str, str]: - result = {} - if odcs_custom_properties is not None: - for prop in odcs_custom_properties: - if prop.property.startswith("dc_mapping_"): - odcs_type_name = prop.property[11:] # Changed substring to slice - datacontract_type = prop.value - result[odcs_type_name] = datacontract_type - - return result - - -def get_owner(odcs_custom_properties: List[CustomProperty]) -> str | None: - if odcs_custom_properties is not None: - for prop in odcs_custom_properties: - if prop.property == "owner": - return prop.value - - return None - - -def import_tags(odcs: OpenDataContractStandard) -> List[str] | None: - if odcs.tags is None: - return None - return odcs.tags - - -def to_azure_storage_account(location: str) -> str | None: - """ - Converts a storage location string to extract the storage account name. - ODCS v3.0 has no explicit field for the storage account. It uses the location field, which is a URI. - - This function parses a storage location string to identify and return the - storage account name. It handles two primary patterns: - 1. Protocol://containerName@storageAccountName - 2. Protocol://storageAccountName - - :param location: The storage location string to parse, typically following - the format protocol://containerName@storageAccountName. or - protocol://storageAccountName. - :return: The extracted storage account name if found, otherwise None - """ - # to catch protocol://containerName@storageAccountName. pattern from location - match = re.search(r"(?<=@)([^.]*)", location, re.IGNORECASE) - if match: - return match.group() - else: - # to catch protocol://storageAccountName. 
pattern from location - match = re.search(r"(?<=//)(?!@)([^.]*)", location, re.IGNORECASE) - return match.group() if match else None diff --git a/datacontract/imports/parquet_importer.py b/datacontract/imports/parquet_importer.py index b19c79ab2..3265acda8 100644 --- a/datacontract/imports/parquet_importer.py +++ b/datacontract/imports/parquet_importer.py @@ -1,81 +1,96 @@ import os.path import pyarrow +from open_data_contract_standard.model import OpenDataContractStandard from pyarrow import parquet from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Field, - Model, +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, ) from datacontract.model.exceptions import DataContractException class ParquetImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_parquet(data_contract_specification, source) + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_parquet(source) -def import_parquet(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification: +def import_parquet(source: str) -> OpenDataContractStandard: + """Import a Parquet file and create an ODCS data contract.""" # use filename as schema name, remove .parquet suffix, avoid breaking the yaml output by replacing dots schema_name = os.path.basename(source).removesuffix(".parquet").replace(".", "_") - fields: dict[str, Field] = {} + properties = [] arrow_schema = parquet.read_schema(source) for field_name in arrow_schema.names: parquet_field = arrow_schema.field(field_name) - field = map_pyarrow_field_to_specification_field(parquet_field, "parquet") + prop = map_pyarrow_field_to_property(parquet_field, field_name) if not parquet_field.nullable: - field.required = True + prop.required = True - fields[field_name] = field + properties.append(prop) - data_contract_specification.models[schema_name] = Model(fields=fields) + odcs = create_odcs() + schema_obj = create_schema_object( + name=schema_name, + physical_type="parquet", + properties=properties, + ) + odcs.schema_ = [schema_obj] - return data_contract_specification + return odcs -def map_pyarrow_field_to_specification_field(pyarrow_field: pyarrow.Field, file_format: str) -> Field: +def map_pyarrow_field_to_property(pyarrow_field: pyarrow.Field, field_name: str): + """Map a PyArrow field to an ODCS SchemaProperty.""" if pyarrow.types.is_boolean(pyarrow_field.type): - return Field(type="boolean") + return create_property(name=field_name, logical_type="boolean", physical_type="BOOLEAN") if pyarrow.types.is_int32(pyarrow_field.type): - return Field(type="int") + return create_property(name=field_name, logical_type="integer", physical_type="INT32") if pyarrow.types.is_int64(pyarrow_field.type): - return Field(type="long") + return create_property(name=field_name, logical_type="integer", physical_type="INT64") if pyarrow.types.is_integer(pyarrow_field.type): - return Field(type="number") + return create_property(name=field_name, logical_type="integer", physical_type=str(pyarrow_field.type)) if pyarrow.types.is_float32(pyarrow_field.type): - return Field(type="float") + return create_property(name=field_name, logical_type="number", physical_type="FLOAT") if pyarrow.types.is_float64(pyarrow_field.type): - return Field(type="double") + return 
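# Illustrative sketch, not part of the patch: a PyArrow field becomes an ODCS
# property whose physicalType records the Arrow/Parquet type. The example field is made up.
import pyarrow as pa

from datacontract.imports.parquet_importer import map_pyarrow_field_to_property

field = pa.field("amount", pa.decimal128(10, 2), nullable=False)
prop = map_pyarrow_field_to_property(field, field.name)
print(prop.logicalType, prop.physicalType, prop.logicalTypeOptions)
# -> number DECIMAL with precision/scale in logicalTypeOptions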
create_property(name=field_name, logical_type="number", physical_type="DOUBLE") if pyarrow.types.is_decimal(pyarrow_field.type): - return Field(type="decimal", precision=pyarrow_field.type.precision, scale=pyarrow_field.type.scale) + return create_property( + name=field_name, + logical_type="number", + physical_type="DECIMAL", + precision=pyarrow_field.type.precision, + scale=pyarrow_field.type.scale, + ) if pyarrow.types.is_timestamp(pyarrow_field.type): - return Field(type="timestamp") + return create_property(name=field_name, logical_type="timestamp", physical_type="TIMESTAMP") if pyarrow.types.is_date(pyarrow_field.type): - return Field(type="date") + return create_property(name=field_name, logical_type="date", physical_type="DATE") if pyarrow.types.is_null(pyarrow_field.type): - return Field(type="null") + return create_property(name=field_name, logical_type="string", physical_type="NULL") if pyarrow.types.is_binary(pyarrow_field.type): - return Field(type="bytes") + return create_property(name=field_name, logical_type="array", physical_type="BINARY") if pyarrow.types.is_string(pyarrow_field.type): - return Field(type="string") + return create_property(name=field_name, logical_type="string", physical_type="STRING") if pyarrow.types.is_map(pyarrow_field.type) or pyarrow.types.is_dictionary(pyarrow_field.type): - return Field(type="map") + return create_property(name=field_name, logical_type="object", physical_type="MAP") if pyarrow.types.is_struct(pyarrow_field.type): - return Field(type="struct") + return create_property(name=field_name, logical_type="object", physical_type="STRUCT") if pyarrow.types.is_list(pyarrow_field.type): - return Field(type="array") + return create_property(name=field_name, logical_type="array", physical_type="LIST") raise DataContractException( type="schema", - name=f"Parse {file_format} schema", + name="Parse parquet schema", reason=f"{pyarrow_field.type} currently not supported.", engine="datacontract", ) diff --git a/datacontract/imports/protobuf_importer.py b/datacontract/imports/protobuf_importer.py index cea238898..c88b05446 100644 --- a/datacontract/imports/protobuf_importer.py +++ b/datacontract/imports/protobuf_importer.py @@ -1,40 +1,45 @@ import os import re import tempfile +from typing import List from google.protobuf import descriptor_pb2 from grpc_tools import protoc +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, +) from datacontract.model.exceptions import DataContractException -def map_type_from_protobuf(field_type: int): +def map_type_from_protobuf(field_type: int) -> str: + """Map protobuf field type to ODCS logical type.""" protobuf_type_mapping = { - 1: "double", - 2: "float", - 3: "long", - 4: "long", # uint64 mapped to long - 5: "integer", # int32 mapped to integer - 6: "string", # fixed64 mapped to string - 7: "string", # fixed32 mapped to string - 8: "boolean", - 9: "string", - 12: "bytes", - 13: "integer", # uint32 mapped to integer - 15: "integer", # sfixed32 mapped to integer - 16: "long", # sfixed64 mapped to long - 17: "integer", # sint32 mapped to integer - 18: "long", # sint64 mapped to long + 1: "number", # double + 2: "number", # float + 3: "integer", # int64 + 4: "integer", # uint64 + 5: "integer", # int32 + 6: "string", # fixed64 + 7: "string", # 
fixed32 + 8: "boolean", # bool + 9: "string", # string + 12: "array", # bytes + 13: "integer", # uint32 + 15: "integer", # sfixed32 + 16: "integer", # sfixed64 + 17: "integer", # sint32 + 18: "integer", # sint64 } return protobuf_type_mapping.get(field_type, "string") -def parse_imports(proto_file: str) -> list: - """ - Parse import statements from a .proto file and return a list of imported file paths. - """ +def parse_imports_raw(proto_file: str) -> list: + """Parse import statements from a .proto file and return the raw import paths.""" try: with open(proto_file, "r") as f: content = f.read() @@ -46,19 +51,20 @@ def parse_imports(proto_file: str) -> list: engine="datacontract", original_exception=e, ) - imported_files = re.findall(r'import\s+"(.+?)";', content) - proto_dir = os.path.dirname(proto_file) - return [os.path.join(proto_dir, imp) for imp in imported_files] + return re.findall(r'import\s+"(.+?)";', content) -def compile_proto_to_binary(proto_files: list, output_file: str): - """ - Compile the provided proto files into a single descriptor set using grpc_tools.protoc. - """ - proto_dirs = set(os.path.dirname(proto) for proto in proto_files) +def compile_proto_to_binary(proto_files: list, output_file: str, proto_root: str = None): + """Compile the provided proto files into a single descriptor set.""" + if proto_root: + proto_dirs = {proto_root} + else: + proto_dirs = set(os.path.dirname(os.path.abspath(proto)) for proto in proto_files) proto_paths = [f"--proto_path={d}" for d in proto_dirs] - args = [""] + proto_paths + [f"--descriptor_set_out={output_file}"] + proto_files + abs_proto_files = [os.path.abspath(proto) for proto in proto_files] + + args = [""] + proto_paths + [f"--descriptor_set_out={output_file}"] + abs_proto_files ret = protoc.main(args) if ret != 0: raise DataContractException( @@ -71,16 +77,11 @@ def compile_proto_to_binary(proto_files: list, output_file: str): def extract_enum_values_from_fds(fds: descriptor_pb2.FileDescriptorSet, enum_name: str) -> dict: - """ - Search the FileDescriptorSet for an enum definition with the given name - and return a dictionary of its values (name to number). - """ + """Search the FileDescriptorSet for an enum definition.""" for file_descriptor in fds.file: - # Check top-level enums. for enum in file_descriptor.enum_type: if enum.name == enum_name: return {value.name: value.number for value in enum.value} - # Check enums defined inside messages. for message in file_descriptor.message_type: for enum in message.enum_type: if enum.name == enum_name: @@ -88,79 +89,94 @@ def extract_enum_values_from_fds(fds: descriptor_pb2.FileDescriptorSet, enum_nam return {} -def extract_message_fields_from_fds(fds: descriptor_pb2.FileDescriptorSet, message_name: str) -> dict: - """ - Given a FileDescriptorSet and a message name, return a dict with its field definitions. - This function recurses for nested messages and handles enums. 
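# A quick sketch exercising map_type_from_protobuf as defined above; the
# numeric codes are protobuf FieldDescriptorProto.Type values (1 = double,
# 5 = int32, 12 = bytes), and unknown codes fall back to "string".
from datacontract.imports.protobuf_importer import map_type_from_protobuf

print(map_type_from_protobuf(1))   # "number"  (double)
print(map_type_from_protobuf(5))   # "integer" (int32)
print(map_type_from_protobuf(12))  # "array"   (bytes)
print(map_type_from_protobuf(99))  # "string"  (fallback for unknown codes)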
- """ +def extract_message_fields_to_properties( + fds: descriptor_pb2.FileDescriptorSet, message_name: str +) -> List[SchemaProperty]: + """Extract message fields from a FileDescriptorSet as ODCS properties.""" for file_descriptor in fds.file: for msg in file_descriptor.message_type: if msg.name == message_name: - fields = {} + properties = [] for field in msg.field: - if field.type == 11: # TYPE_MESSAGE - nested_msg_name = field.type_name.split(".")[-1] - nested_fields = extract_message_fields_from_fds(fds, nested_msg_name) - if field.label == 3: # repeated field - field_info = { - "description": f"List of {nested_msg_name}", - "type": "array", - "items": {"type": "object", "fields": nested_fields}, - } - else: - field_info = { - "description": f"Nested object of {nested_msg_name}", - "type": "object", - "fields": nested_fields, - } - elif field.type == 14: # TYPE_ENUM - enum_name = field.type_name.split(".")[-1] - enum_values = extract_enum_values_from_fds(fds, enum_name) - field_info = { - "description": f"Enum field {field.name}", - "type": "string", - "values": enum_values, - "required": (field.label == 2), - } - else: - field_info = { - "description": f"Field {field.name}", - "type": map_type_from_protobuf(field.type), - "required": (field.label == 2), - } - fields[field.name] = field_info - return fields - return {} - + prop = convert_proto_field_to_property(fds, field) + properties.append(prop) + return properties + return [] + + +def convert_proto_field_to_property(fds: descriptor_pb2.FileDescriptorSet, field) -> SchemaProperty: + """Convert a protobuf field to an ODCS SchemaProperty.""" + if field.type == 11: # TYPE_MESSAGE + nested_msg_name = field.type_name.split(".")[-1] + nested_properties = extract_message_fields_to_properties(fds, nested_msg_name) + if field.label == 3: # repeated field + items_prop = create_property( + name="items", + logical_type="object", + physical_type="message", + properties=nested_properties, + ) + return create_property( + name=field.name, + logical_type="array", + physical_type="repeated message", + description=f"List of {nested_msg_name}", + items=items_prop, + ) + else: + return create_property( + name=field.name, + logical_type="object", + physical_type="message", + description=f"Nested object of {nested_msg_name}", + properties=nested_properties, + ) + elif field.type == 14: # TYPE_ENUM + enum_name = field.type_name.split(".")[-1] + enum_values = extract_enum_values_from_fds(fds, enum_name) + return create_property( + name=field.name, + logical_type="string", + physical_type="enum", + description=f"Enum field {field.name}", + required=field.label == 2, + custom_properties={"enumValues": enum_values} if enum_values else None, + ) + else: + return create_property( + name=field.name, + logical_type=map_type_from_protobuf(field.type), + physical_type=str(field.type), + description=f"Field {field.name}", + required=field.label == 2, + ) -def import_protobuf( - data_contract_specification: DataContractSpecification, sources: list, import_args: dict = None -) -> DataContractSpecification: - """ - Gather all proto files (including those imported), compile them into one descriptor, - then generate models with nested fields and enums resolved. - The generated data contract uses generic defaults instead of specific hardcoded ones. 
- """ +def import_protobuf(sources: list, import_args: dict = None) -> OpenDataContractStandard: + """Import protobuf files and generate an ODCS data contract.""" + proto_root = os.path.dirname(os.path.abspath(sources[0])) if sources else "" - # --- Step 1: Gather all proto files (main and imported) proto_files_set = set() queue = list(sources) while queue: proto = queue.pop(0) if proto not in proto_files_set: proto_files_set.add(proto) - for imp in parse_imports(proto): - if os.path.exists(imp) and imp not in proto_files_set: - queue.append(imp) + proto_dir = os.path.dirname(proto) + for imp in parse_imports_raw(proto): + resolved = os.path.join(proto_dir, imp) + if not os.path.exists(resolved): + resolved = os.path.join(proto_root, imp) + if os.path.exists(resolved) and resolved not in proto_files_set: + queue.append(resolved) all_proto_files = list(proto_files_set) - # --- Step 2: Compile all proto files into a single descriptor set. temp_descriptor = tempfile.NamedTemporaryFile(suffix=".pb", delete=False) descriptor_file = temp_descriptor.name - temp_descriptor.close() # Allow protoc to write to the file + temp_descriptor.close() + try: - compile_proto_to_binary(all_proto_files, descriptor_file) + compile_proto_to_binary(all_proto_files, descriptor_file, proto_root) with open(descriptor_file, "rb") as f: proto_data = f.read() @@ -176,89 +192,43 @@ def import_protobuf( original_exception=e, ) - # --- Step 3: Build models from the descriptor set. - all_models = {} - # Create a set of the main proto file basenames. + odcs = create_odcs() + odcs.schema_ = [] + source_proto_basenames = {os.path.basename(proto) for proto in sources} for file_descriptor in fds.file: - # Only process file descriptors that correspond to your main proto files. if os.path.basename(file_descriptor.name) not in source_proto_basenames: continue for message in file_descriptor.message_type: - fields = {} + properties = [] for field in message.field: - if field.type == 11: # TYPE_MESSAGE - nested_msg_name = field.type_name.split(".")[-1] - nested_fields = extract_message_fields_from_fds(fds, nested_msg_name) - if field.label == 3: - field_info = { - "description": f"List of {nested_msg_name}", - "type": "array", - "items": {"type": "object", "fields": nested_fields}, - } - else: - field_info = { - "description": f"Nested object of {nested_msg_name}", - "type": "object", - "fields": nested_fields, - } - fields[field.name] = field_info - elif field.type == 14: # TYPE_ENUM - enum_name = field.type_name.split(".")[-1] - enum_values = extract_enum_values_from_fds(fds, enum_name) - field_info = { - "description": f"Enum field {field.name}", - "type": "string", - "values": enum_values, - "required": (field.label == 2), - } - fields[field.name] = field_info - else: - field_info = { - "description": f"Field {field.name}", - "type": map_type_from_protobuf(field.type), - "required": (field.label == 2), - } - fields[field.name] = field_info - - all_models[message.name] = { - "description": f"Details of {message.name}.", - "type": "table", - "fields": fields, - } - - data_contract_specification.models = all_models - - return data_contract_specification + prop = convert_proto_field_to_property(fds, field) + properties.append(prop) + + schema_obj = create_schema_object( + name=message.name, + physical_type="message", + description=f"Details of {message.name}.", + properties=properties, + ) + odcs.schema_.append(schema_obj) + + return odcs finally: - # Clean up the temporary descriptor file. 
if os.path.exists(descriptor_file): os.remove(descriptor_file) class ProtoBufImporter(Importer): def __init__(self, name): - # 'name' is passed by the importer factory. self.name = name def import_source( self, - data_contract_specification: DataContractSpecification, source: str, import_args: dict = None, - ) -> DataContractSpecification: - """ - Import a protobuf file (and its imports) into the given DataContractSpecification. - - Parameters: - - data_contract_specification: the initial specification to update. - - source: the protobuf file path. - - import_args: optional dictionary with additional arguments (e.g. 'output_dir'). - - Returns: - The updated DataContractSpecification. - """ - # Wrap the source in a list because import_protobuf expects a list of sources. - return import_protobuf(data_contract_specification, [source], import_args) + ) -> OpenDataContractStandard: + """Import a protobuf file into an ODCS data contract.""" + return import_protobuf([source], import_args) diff --git a/datacontract/imports/spark_importer.py b/datacontract/imports/spark_importer.py index 79ce845b0..26ff8f189 100644 --- a/datacontract/imports/spark_importer.py +++ b/datacontract/imports/spark_importer.py @@ -3,14 +3,15 @@ import tempfile from databricks.sdk import WorkspaceClient +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from pyspark.sql import DataFrame, SparkSession, types from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Field, - Model, - Server, +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, + create_server, ) logger = logging.getLogger(__name__) @@ -19,43 +20,21 @@ class SparkImporter(Importer): def import_source( self, - data_contract_specification: DataContractSpecification, source: str, import_args: dict, - ) -> DataContractSpecification: - """ - Imports data from a Spark source into the data contract specification. - - Args: - data_contract_specification: The data contract specification object. - source: The source string indicating the Spark tables to read. - import_args: Additional arguments for the import process. - Returns: - dict: The updated data contract specification. - """ + ) -> OpenDataContractStandard: + """Imports data from a Spark source into an ODCS data contract.""" dataframe = import_args.get("dataframe", None) description = import_args.get("description", None) - return import_spark(data_contract_specification, source, dataframe, description) + return import_spark(source, dataframe, description) def import_spark( - data_contract_specification: DataContractSpecification, source: str, dataframe: DataFrame | None = None, description: str | None = None, -) -> DataContractSpecification: - """ - Imports schema(s) from Spark into a Data Contract Specification. - - Args: - data_contract_specification (DataContractSpecification): The contract spec to update. - source (str): Comma-separated Spark table/view names. - dataframe (DataFrame | None): Optional Spark DataFrame to import. - description (str | None): Optional table-level description. - - Returns: - DataContractSpecification: The updated contract spec with imported models. 
- """ +) -> OpenDataContractStandard: + """Imports schema(s) from Spark into an ODCS data contract.""" tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark") atexit.register(tmp_dir.cleanup) @@ -66,169 +45,152 @@ def import_spark( .config("spark.ui.enabled", "false") .getOrCreate() ) - data_contract_specification.servers["local"] = Server(type="dataframe") + + odcs = create_odcs() + odcs.servers = [create_server(name="local", server_type="dataframe")] + odcs.schema_ = [] if dataframe is not None: if not isinstance(dataframe, DataFrame): raise TypeError("Expected 'dataframe' to be a pyspark.sql.DataFrame") - data_contract_specification.models[source] = import_from_spark_df(spark, source, dataframe, description) - return data_contract_specification + schema_obj = import_from_spark_df(spark, source, dataframe, description) + odcs.schema_.append(schema_obj) + return odcs if not source: raise ValueError("Either 'dataframe' or a valid 'source' must be provided") for table_name in map(str.strip, source.split(",")): df = spark.read.table(table_name) - data_contract_specification.models[table_name] = import_from_spark_df(spark, table_name, df, description) - - return data_contract_specification + schema_obj = import_from_spark_df(spark, table_name, df, description) + odcs.schema_.append(schema_obj) + return odcs -def import_from_spark_df(spark: SparkSession, source: str, df: DataFrame, description: str) -> Model: - """ - Converts a Spark DataFrame into a Model. - Args: - spark: SparkSession - source: A comma-separated string of Spark temporary views to read. - df: The Spark DataFrame to convert. - description: Table level comment - - Returns: - Model: The generated data contract model. - """ - model = Model() +def import_from_spark_df(spark: SparkSession, source: str, df: DataFrame, description: str): + """Converts a Spark DataFrame into an ODCS SchemaObject.""" schema = df.schema - if description is None: - model.description = _table_comment_from_spark(spark, source) - else: - model.description = description + table_description = description + if table_description is None: + table_description = _table_comment_from_spark(spark, source) + properties = [] for field in schema: - model.fields[field.name] = _field_from_struct_type(field) - - return model - - -def _field_from_struct_type(spark_field: types.StructField) -> Field: - """ - Converts a Spark StructField into a Field object for the data contract. - - Args: - spark_field: The Spark StructField to convert. - - Returns: - Field: The generated Field object. - """ - field = Field() - field.required = not spark_field.nullable - field.description = spark_field.metadata.get("comment") - - return _type_from_data_type(field, spark_field.dataType) - + prop = _property_from_struct_type(field) + properties.append(prop) + + return create_schema_object( + name=source, + physical_type="table", + description=table_description, + properties=properties, + ) -def _type_from_data_type(field: Field, spark_type: types.DataType) -> Field: - """ - Maps Spark data types to the Data Contract type system and updates the field. - Args: - field: The Field object to update. - spark_type: The Spark data type to map. 
+def _property_from_struct_type(spark_field: types.StructField) -> SchemaProperty: + """Converts a Spark StructField into an ODCS SchemaProperty.""" + logical_type = _data_type_from_spark(spark_field.dataType) + description = spark_field.metadata.get("comment") if spark_field.metadata else None + required = not spark_field.nullable + + nested_properties = None + items_prop = None + + if logical_type == "array": + items_prop = _type_to_property("items", spark_field.dataType.elementType, not spark_field.dataType.containsNull) + elif logical_type == "object" and isinstance(spark_field.dataType, types.StructType): + nested_properties = [_property_from_struct_type(sf) for sf in spark_field.dataType.fields] + + return create_property( + name=spark_field.name, + logical_type=logical_type, + physical_type=str(spark_field.dataType), + description=description, + required=required if required else None, + properties=nested_properties, + items=items_prop, + ) - Returns: - Field: The updated Field object. - """ - field.type = _data_type_from_spark(spark_type) - if field.type == "array": - field.items = _type_from_data_type(Field(required=not spark_type.containsNull), spark_type.elementType) +def _type_to_property(name: str, spark_type: types.DataType, required: bool = True) -> SchemaProperty: + """Convert a Spark data type to an ODCS SchemaProperty.""" + logical_type = _data_type_from_spark(spark_type) - elif field.type == "map": - field.keys = _type_from_data_type(Field(required=True), spark_type.keyType) - field.values = _type_from_data_type(Field(required=not spark_type.valueContainsNull), spark_type.valueType) + nested_properties = None + items_prop = None - elif field.type == "struct": - field.fields = {sf.name: _field_from_struct_type(sf) for sf in spark_type.fields} + if logical_type == "array": + items_prop = _type_to_property("items", spark_type.elementType, not spark_type.containsNull) + elif logical_type == "object" and isinstance(spark_type, types.StructType): + nested_properties = [_property_from_struct_type(sf) for sf in spark_type.fields] - return field + return create_property( + name=name, + logical_type=logical_type, + physical_type=str(spark_type), + required=required if required else None, + properties=nested_properties, + items=items_prop, + ) def _data_type_from_spark(spark_type: types.DataType) -> str: - """ - Maps Spark data types to the Data Contract type system. - - Args: - spark_type: The Spark data type to map. - - Returns: - str: The corresponding Data Contract type. 
- """ + """Maps Spark data types to ODCS logical types.""" if isinstance(spark_type, types.StringType): return "string" elif isinstance(spark_type, (types.IntegerType, types.ShortType)): return "integer" elif isinstance(spark_type, types.LongType): - return "long" + return "integer" elif isinstance(spark_type, types.FloatType): - return "float" + return "number" elif isinstance(spark_type, types.DoubleType): - return "double" + return "number" elif isinstance(spark_type, types.StructType): - return "struct" + return "object" elif isinstance(spark_type, types.ArrayType): return "array" elif isinstance(spark_type, types.MapType): - return "map" + return "object" elif isinstance(spark_type, types.TimestampType): - return "timestamp" + return "date" elif isinstance(spark_type, types.TimestampNTZType): - return "timestamp_ntz" + return "date" elif isinstance(spark_type, types.DateType): return "date" elif isinstance(spark_type, types.BooleanType): return "boolean" elif isinstance(spark_type, types.BinaryType): - return "bytes" + return "array" elif isinstance(spark_type, types.DecimalType): - return "decimal" + return "number" elif isinstance(spark_type, types.NullType): - return "null" + return "string" elif isinstance(spark_type, types.VarcharType): - return "varchar" + return "string" elif isinstance(spark_type, types.VariantType): - return "variant" + return "object" else: raise ValueError(f"Unsupported Spark type: {spark_type}") def _table_comment_from_spark(spark: SparkSession, source: str): - """ - Attempts to retrieve the table-level comment from a Spark table using multiple fallback methods. - - Args: - spark (SparkSession): The active Spark session. - source (str): The name of the table (without catalog or schema). - - Returns: - str or None: The table-level comment, if found. 
- """ - - # Get Current Catalog and Schema from Spark Session + """Attempts to retrieve the table-level comment from a Spark table.""" try: current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0] except Exception: - current_catalog = "hive_metastore" # Fallback for non-Unity Catalog clusters + current_catalog = "hive_metastore" try: current_schema = spark.catalog.currentDatabase() except Exception: current_schema = spark.sql("SELECT current_database()").collect()[0][0] - # Get table comment if it exists table_comment = "" source = f"{current_catalog}.{current_schema}.{source}" + try: - # Initialize WorkspaceClient for Unity Catalog API calls workspace_client = WorkspaceClient() created_table = workspace_client.tables.get(full_name=f"{source}") table_comment = created_table.comment @@ -237,7 +199,6 @@ def _table_comment_from_spark(spark: SparkSession, source: str): except Exception: pass - # Fallback to Spark Catalog API for Hive Metastore or Non-UC Tables try: table_comment = spark.catalog.getTable(f"{source}").description logger.info(f"'{source}' table comment retrieved using 'spark.catalog.getTable({source}).description'") @@ -245,7 +206,6 @@ def _table_comment_from_spark(spark: SparkSession, source: str): except Exception: pass - # Final Fallback Using DESCRIBE TABLE EXTENDED try: rows = spark.sql(f"DESCRIBE TABLE EXTENDED {source}").collect() for row in rows: @@ -258,5 +218,4 @@ def _table_comment_from_spark(spark: SparkSession, source: str): pass logger.info(f"{source} table comment could not be retrieved") - return None diff --git a/datacontract/imports/sql_importer.py b/datacontract/imports/sql_importer.py index e8efe3b90..5d3f4ac0c 100644 --- a/datacontract/imports/sql_importer.py +++ b/datacontract/imports/sql_importer.py @@ -2,26 +2,31 @@ import os import sqlglot +from open_data_contract_standard.model import OpenDataContractStandard from sqlglot.dialects.dialect import Dialects from datacontract.imports.importer import Importer -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, + create_server, +) from datacontract.model.exceptions import DataContractException from datacontract.model.run import ResultEnum class SqlImporter(Importer): def import_source( - self, data_contract_specification: DataContractSpecification, source: str, import_args: dict - ) -> DataContractSpecification: - return import_sql(data_contract_specification, self.import_format, source, import_args) + self, source: str, import_args: dict + ) -> OpenDataContractStandard: + return import_sql(self.import_format, source, import_args) def import_sql( - data_contract_specification: DataContractSpecification, format: str, source: str, import_args: dict = None -) -> DataContractSpecification: + format: str, source: str, import_args: dict = None +) -> OpenDataContractStandard: sql = read_file(source) - dialect = to_dialect(import_args) try: @@ -36,48 +41,59 @@ def import_sql( result=ResultEnum.error, ) - server_type: str | None = to_server_type(source, dialect) + odcs = create_odcs() + odcs.schema_ = [] + + server_type = to_server_type(source, dialect) if server_type is not None: - data_contract_specification.servers[server_type] = Server(type=server_type) + odcs.servers = [create_server(name=server_type, server_type=server_type)] tables = parsed.find_all(sqlglot.expressions.Table) for table in tables: - if data_contract_specification.models is 
None: - data_contract_specification.models = {} - table_name = table.this.name + properties = [] - fields = {} + primary_key_position = 1 for column in parsed.find_all(sqlglot.exp.ColumnDef): if column.parent.this.name != table_name: continue - field = Field() col_name = column.this.name col_type = to_col_type(column, dialect) - field.type = map_type_from_sql(col_type) + logical_type = map_type_from_sql(col_type) col_description = get_description(column) - field.description = col_description - field.maxLength = get_max_length(column) + max_length = get_max_length(column) precision, scale = get_precision_scale(column) - field.precision = precision - field.scale = scale - field.primaryKey = get_primary_key(column) - field.required = column.find(sqlglot.exp.NotNullColumnConstraint) is not None or None - physical_type_key = to_physical_type_key(dialect) - field.config = { - physical_type_key: col_type, - } - - fields[col_name] = field - - data_contract_specification.models[table_name] = Model( - type="table", - fields=fields, + is_primary_key = get_primary_key(column) + is_required = column.find(sqlglot.exp.NotNullColumnConstraint) is not None or None + + prop = create_property( + name=col_name, + logical_type=logical_type, + physical_type=col_type, + description=col_description, + max_length=max_length, + precision=precision, + scale=scale, + primary_key=is_primary_key, + primary_key_position=primary_key_position if is_primary_key else None, + required=is_required if is_required else None, + ) + + if is_primary_key: + primary_key_position += 1 + + properties.append(prop) + + schema_obj = create_schema_object( + name=table_name, + physical_type="table", + properties=properties, ) + odcs.schema_.append(schema_obj) - return data_contract_specification + return odcs def get_primary_key(column) -> bool | None: @@ -201,6 +217,7 @@ def get_precision_scale(column): def map_type_from_sql(sql_type: str) -> str | None: + """Map SQL type to ODCS logical type.""" if sql_type is None: return None @@ -220,60 +237,58 @@ def map_type_from_sql(sql_type: str) -> str | None: return "string" elif sql_type_normed.startswith("ntext"): return "string" - elif sql_type_normed.startswith("int"): - return "int" + elif sql_type_normed.startswith("int") and not sql_type_normed.startswith("interval"): + return "integer" elif sql_type_normed.startswith("bigint"): - return "long" + return "integer" elif sql_type_normed.startswith("tinyint"): - return "int" + return "integer" elif sql_type_normed.startswith("smallint"): - return "int" + return "integer" elif sql_type_normed.startswith("float"): - return "float" + return "number" elif sql_type_normed.startswith("double"): - return "double" + return "number" elif sql_type_normed.startswith("decimal"): - return "decimal" + return "number" elif sql_type_normed.startswith("numeric"): - return "decimal" + return "number" elif sql_type_normed.startswith("bool"): return "boolean" elif sql_type_normed.startswith("bit"): return "boolean" elif sql_type_normed.startswith("binary"): - return "bytes" + return "array" elif sql_type_normed.startswith("varbinary"): - return "bytes" + return "array" + elif sql_type_normed.startswith("raw"): + return "array" + elif sql_type_normed == "blob" or sql_type_normed == "bfile": + return "array" elif sql_type_normed == "date": return "date" elif sql_type_normed == "time": return "string" - elif sql_type_normed == "timestamp": - return "timestamp_ntz" - elif ( - sql_type_normed == "timestamptz" - or sql_type_normed == "timestamp_tz" - or sql_type_normed 
== "timestamp with time zone" - ): - return "timestamp_tz" - elif sql_type_normed == "timestampntz" or sql_type_normed == "timestamp_ntz": - return "timestamp_ntz" + elif sql_type_normed.startswith("timestamp"): + return "date" + elif sql_type_normed == "datetime" or sql_type_normed == "datetime2": + return "date" elif sql_type_normed == "smalldatetime": - return "timestamp_ntz" - elif sql_type_normed == "datetime": - return "timestamp_ntz" - elif sql_type_normed == "datetime2": - return "timestamp_ntz" + return "date" elif sql_type_normed == "datetimeoffset": - return "timestamp_tz" + return "date" elif sql_type_normed == "uniqueidentifier": # tsql return "string" elif sql_type_normed == "json": - return "string" + return "object" elif sql_type_normed == "xml": # tsql return "string" + elif sql_type_normed.startswith("number"): + return "number" + elif sql_type_normed == "clob" or sql_type_normed == "nclob": + return "string" else: - return "variant" + return "object" def read_file(path): diff --git a/datacontract/imports/unity_importer.py b/datacontract/imports/unity_importer.py index bb02516fd..bd0fe27b3 100644 --- a/datacontract/imports/unity_importer.py +++ b/datacontract/imports/unity_importer.py @@ -4,59 +4,37 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service.catalog import ColumnInfo, TableInfo -from open_data_contract_standard.model import OpenDataContractStandard +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty from datacontract.imports.importer import Importer -from datacontract.imports.sql_importer import map_type_from_sql, to_physical_type_key -from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server +from datacontract.imports.odcs_helper import ( + create_odcs, + create_property, + create_schema_object, + create_server, +) +from datacontract.imports.sql_importer import map_type_from_sql from datacontract.model.exceptions import DataContractException class UnityImporter(Importer): - """ - UnityImporter class for importing data contract specifications from Unity Catalog. - """ + """UnityImporter class for importing data contract specifications from Unity Catalog.""" def import_source( self, - data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str, import_args: dict, - ) -> DataContractSpecification | OpenDataContractStandard: - """ - Import data contract specification from a source. - - :param data_contract_specification: The data contract specification to be imported. - :type data_contract_specification: DataContractSpecification - :param source: The source from which to import the data contract specification. - :type source: str - :param import_args: Additional arguments for the import process. - :type import_args: dict - :return: The imported data contract specification. 
- :rtype: DataContractSpecification - """ + ) -> OpenDataContractStandard: + """Import data contract specification from a source.""" if source is not None: - data_contract_specification = import_unity_from_json(data_contract_specification, source) + return import_unity_from_json(source) else: unity_table_full_name_list = import_args.get("unity_table_full_name") - data_contract_specification = import_unity_from_api(data_contract_specification, unity_table_full_name_list) - return data_contract_specification - - -def import_unity_from_json( - data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str -) -> DataContractSpecification | OpenDataContractStandard: - """ - Import data contract specification from a JSON file. - - :param data_contract_specification: The data contract specification to be imported. - :type data_contract_specification: DataContractSpecification - :param source: The path to the JSON file. - :type source: str - :return: The imported data contract specification. - :rtype: DataContractSpecification - :raises DataContractException: If there is an error parsing the JSON file. - """ + return import_unity_from_api(unity_table_full_name_list) + + +def import_unity_from_json(source: str) -> OpenDataContractStandard: + """Import data contract specification from a JSON file.""" try: with open(source, "r") as file: json_contents = json.loads(file.read()) @@ -69,42 +47,35 @@ def import_unity_from_json( engine="datacontract", original_exception=e, ) - return convert_unity_schema(data_contract_specification, unity_schema) - - -def import_unity_from_api( - data_contract_specification: DataContractSpecification, unity_table_full_name_list: List[str] = None -) -> DataContractSpecification: - """ - Import data contract specification from Unity Catalog API. - - :param data_contract_specification: The data contract specification to be imported. - :type data_contract_specification: DataContractSpecification - :param unity_table_full_name_list: The full name of the Unity table. - :type unity_table_full_name_list: list[str] - :return: The imported data contract specification. - :rtype: DataContractSpecification - :raises DataContractException: If there is an error retrieving the schema from the API. 
- """ + + odcs = create_odcs() + return convert_unity_schema(odcs, unity_schema) + + +def import_unity_from_api(unity_table_full_name_list: List[str] = None) -> OpenDataContractStandard: + """Import data contract specification from Unity Catalog API.""" try: - # print(f"Retrieving Unity Catalog schema for table: {unity_table_full_name}") + profile = os.getenv("DATACONTRACT_DATABRICKS_PROFILE") host, token = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"), os.getenv("DATACONTRACT_DATABRICKS_TOKEN") - # print(f"Databricks host: {host}, token: {'***' if token else 'not set'}") - if not host: - raise DataContractException( - type="configuration", - name="Databricks configuration", - reason="DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set", - engine="datacontract", - ) - if not token: - raise DataContractException( - type="configuration", - name="Databricks configuration", - reason="DATACONTRACT_DATABRICKS_TOKEN environment variable is not set", - engine="datacontract", - ) - workspace_client = WorkspaceClient(host=host, token=token) + exception = DataContractException( + type="configuration", + name="Databricks configuration", + reason="", + engine="datacontract", + ) + if not profile and not host and not token: + reason = "Either DATACONTRACT_DATABRICKS_PROFILE or both DATACONTRACT_DATABRICKS_SERVER_HOSTNAME and DATACONTRACT_DATABRICKS_TOKEN environment variables must be set" + exception.reason = reason + raise exception + if token and not host: + reason = "DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set" + exception.reason = reason + raise exception + if host and not token: + reason = "DATACONTRACT_DATABRICKS_TOKEN environment variable is not set" + exception.reason = reason + raise exception + workspace_client = WorkspaceClient(profile=profile) if profile else WorkspaceClient(host=host, token=token) except Exception as e: raise DataContractException( type="schema", @@ -114,6 +85,9 @@ def import_unity_from_api( original_exception=e, ) + odcs = create_odcs() + odcs.schema_ = [] + for unity_table_full_name in unity_table_full_name_list: try: unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name) @@ -125,90 +99,67 @@ def import_unity_from_api( engine="datacontract", original_exception=e, ) - data_contract_specification = convert_unity_schema(data_contract_specification, unity_schema) - - return data_contract_specification - + odcs = convert_unity_schema(odcs, unity_schema) -def convert_unity_schema( - data_contract_specification: DataContractSpecification | OpenDataContractStandard, unity_schema: TableInfo -) -> DataContractSpecification | OpenDataContractStandard: - """ - Convert Unity schema to data contract specification. + return odcs - :param data_contract_specification: The data contract specification to be converted. - :type data_contract_specification: DataContractSpecification - :param unity_schema: The Unity schema to be converted. - :type unity_schema: TableInfo - :return: The converted data contract specification. 
- :rtype: DataContractSpecification - """ - if data_contract_specification.models is None: - data_contract_specification.models = {} - if data_contract_specification.servers is None: - data_contract_specification.servers = {} +def convert_unity_schema(odcs: OpenDataContractStandard, unity_schema: TableInfo) -> OpenDataContractStandard: + """Convert Unity schema to ODCS data contract.""" + if odcs.schema_ is None: + odcs.schema_ = [] # Configure databricks server with catalog and schema from Unity table info schema_name = unity_schema.schema_name catalog_name = unity_schema.catalog_name + if catalog_name and schema_name: - server_name = "myserver" # Default server name + if odcs.servers is None: + odcs.servers = [] - data_contract_specification.servers[server_name] = Server( - type="databricks", + server = create_server( + name="databricks", + server_type="databricks", catalog=catalog_name, schema=schema_name, ) + odcs.servers = [server] - fields = import_table_fields(unity_schema.columns) + properties = import_table_fields(unity_schema.columns) table_id = unity_schema.name or unity_schema.table_id - data_contract_specification.models[table_id] = Model(fields=fields, type="table") + schema_obj = create_schema_object( + name=table_id, + physical_type="table", + description=unity_schema.comment, + properties=properties, + ) if unity_schema.name: - data_contract_specification.models[table_id].title = unity_schema.name - - if unity_schema.comment: - data_contract_specification.models[table_id].description = unity_schema.comment - - return data_contract_specification - - -def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]: - """ - Import table fields from Unity schema columns. + schema_obj.businessName = unity_schema.name - Here we are first converting the `ColumnInfo.type_json` to a Spark StructField object - so we can leave the complexity of the Spark field types to the Spark JSON schema parser, - then re-use the logic in `datacontract.imports.spark_importer` to convert the StructField - into a Field object. + odcs.schema_.append(schema_obj) - :param columns: The list of Unity schema columns. - :type columns: List[ColumnInfo] - :return: A dictionary of imported fields. - :rtype: dict[str, Field] - """ - imported_fields = {} + return odcs - for column in columns: - imported_fields[column.name] = _to_field(column) - return imported_fields +def import_table_fields(columns: List[ColumnInfo]) -> List[SchemaProperty]: + """Import table fields from Unity schema columns.""" + return [_to_property(column) for column in columns] -def _to_field(column: ColumnInfo) -> Field: - field = Field() - # The second condition evaluates for complex types (e.g. 
variant) - if column.type_name is not None or (column.type_name is None and column.type_text is not None): - sql_type = str(column.type_text) - field.type = map_type_from_sql(sql_type) - physical_type_key = to_physical_type_key("databricks") - field.config = { - physical_type_key: sql_type, - } - field.required = column.nullable is None or not column.nullable - field.description = column.comment if column.comment else None +def _to_property(column: ColumnInfo) -> SchemaProperty: + """Convert a Unity ColumnInfo to an ODCS SchemaProperty.""" + sql_type = str(column.type_text) if column.type_text else "string" + logical_type = map_type_from_sql(sql_type) + required = column.nullable is None or not column.nullable - return field + return create_property( + name=column.name, + logical_type=logical_type if logical_type else "string", + physical_type=sql_type, + description=column.comment, + required=required if required else None, + custom_properties={"databricksType": sql_type} if sql_type else None, + ) diff --git a/datacontract/init/init_template.py b/datacontract/init/init_template.py index 03ed4871e..7578e51b5 100644 --- a/datacontract/init/init_template.py +++ b/datacontract/init/init_template.py @@ -3,7 +3,7 @@ import requests -DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.2.0.init.yaml" +DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "odcs-3.1.0.init.yaml" def get_init_template(location: str = None) -> str: diff --git a/datacontract/integration/datamesh_manager.py b/datacontract/integration/datamesh_manager.py deleted file mode 100644 index 52020bf02..000000000 --- a/datacontract/integration/datamesh_manager.py +++ /dev/null @@ -1,86 +0,0 @@ -import os - -import requests - -from datacontract.model.run import Run - -# used to retrieve the HTML location of the published data contract or test results -RESPONSE_HEADER_LOCATION_HTML = "location-html" - - -def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_verification: bool): - try: - if publish_url is None: - # this url supports Data Mesh Manager and Data Contract Manager - url = "https://api.datamesh-manager.com/api/test-results" - else: - url = publish_url - - api_key = os.getenv("DATAMESH_MANAGER_API_KEY") - if api_key is None: - api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY") - if api_key is None: - raise Exception( - "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set" - ) - - if run.dataContractId is None: - raise Exception("Cannot publish run results for unknown data contract ID") - - headers = {"Content-Type": "application/json", "x-api-key": api_key} - request_body = run.model_dump_json() - # print("Request Body:", request_body) - response = requests.post( - url, - data=request_body, - headers=headers, - verify=ssl_verification, - ) - # print("Status Code:", response.status_code) - # print("Response Body:", response.text) - if response.status_code != 200: - run.log_error(f"Error publishing test results to Data Mesh Manager: {response.text}") - return - run.log_info("Published test results successfully") - - location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML) - if location_html is not None and len(location_html) > 0: - print(f"🚀 Open {location_html}") - - except Exception as e: - run.log_error(f"Failed publishing test results. 
Error: {str(e)}") - - -def publish_data_contract_to_datamesh_manager(data_contract_dict: dict, ssl_verification: bool): - try: - api_key = os.getenv("DATAMESH_MANAGER_API_KEY") - host = "https://api.datamesh-manager.com" - if os.getenv("DATAMESH_MANAGER_HOST") is not None: - host = os.getenv("DATAMESH_MANAGER_HOST") - if api_key is None: - api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY") - if api_key is None: - raise Exception( - "Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set" - ) - headers = {"Content-Type": "application/json", "x-api-key": api_key} - id = data_contract_dict["id"] - url = f"{host}/api/datacontracts/{id}" - response = requests.put( - url=url, - json=data_contract_dict, - headers=headers, - verify=ssl_verification, - ) - if response.status_code != 200: - print(f"Error publishing data contract to Data Mesh Manager: {response.text}") - exit(1) - - print("✅ Published data contract successfully") - - location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML) - if location_html is not None and len(location_html) > 0: - print(f"🚀 Open {location_html}") - - except Exception as e: - print(f"Failed publishing data contract. Error: {str(e)}") diff --git a/datacontract/integration/entropy_data.py b/datacontract/integration/entropy_data.py new file mode 100644 index 000000000..2f7e90a5b --- /dev/null +++ b/datacontract/integration/entropy_data.py @@ -0,0 +1,126 @@ +import os +from urllib.parse import urlparse + +import requests + +from datacontract.model.run import Run + +# used to retrieve the HTML location of the published data contract or test results +RESPONSE_HEADER_LOCATION_HTML = "location-html" + + +def publish_test_results_to_entropy_data(run: Run, publish_url: str, ssl_verification: bool): + try: + host = publish_url + if publish_url is None: + # this url supports Data Mesh Manager and Data Contract Manager + host = _get_host() + url = "%s/api/test-results" % host + else: + url = publish_url + + api_key = _get_api_key() + + if run.dataContractId is None: + raise Exception("Cannot publish run results for unknown data contract ID") + + headers = {"Content-Type": "application/json", "x-api-key": api_key} + request_body = run.model_dump_json() + # print("Request Body:", request_body) + response = requests.post( + url, + data=request_body, + headers=headers, + verify=ssl_verification, + ) + # print("Status Code:", response.status_code) + # print("Response Body:", response.text) + if response.status_code != 200: + display_host = _extract_hostname(host) + run.log_error(f"Error publishing test results to {display_host}: {response.text}") + return + run.log_info("Published test results successfully") + + location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML) + if location_html is not None and len(location_html) > 0: + print(f"🚀 Open {location_html}") + + except Exception as e: + run.log_error(f"Failed publishing test results. 
Error: {str(e)}") + + +def publish_data_contract_to_entropy_data(data_contract_dict: dict, ssl_verification: bool): + try: + api_key = _get_api_key() + host = _get_host() + headers = {"Content-Type": "application/json", "x-api-key": api_key} + id = data_contract_dict["id"] + url = f"{host}/api/datacontracts/{id}" + response = requests.put( + url=url, + json=data_contract_dict, + headers=headers, + verify=ssl_verification, + ) + if response.status_code != 200: + display_host = _extract_hostname(host) + print(f"Error publishing data contract to {display_host}: {response.text}") + exit(1) + + print("✅ Published data contract successfully") + + location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML) + if location_html is not None and len(location_html) > 0: + print(f"🚀 Open {location_html}") + + except Exception as e: + print(f"Failed publishing data contract. Error: {str(e)}") + + +def _get_api_key() -> str: + """ + Get API key from environment variables with fallback priority: + 1. ENTROPY_DATA_API_KEY + 2. DATAMESH_MANAGER_API_KEY + 3. DATACONTRACT_MANAGER_API_KEY + """ + api_key = os.getenv("ENTROPY_DATA_API_KEY") + if api_key is None: + api_key = os.getenv("DATAMESH_MANAGER_API_KEY") + if api_key is None: + api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY") + if api_key is None: + raise Exception( + "Cannot publish, as neither ENTROPY_DATA_API_KEY, DATAMESH_MANAGER_API_KEY, nor DATACONTRACT_MANAGER_API_KEY is set" + ) + return api_key + + +def _get_host() -> str: + """ + Get host from environment variables with fallback priority: + 1. ENTROPY_DATA_HOST + 2. DATAMESH_MANAGER_HOST + 3. DATACONTRACT_MANAGER_HOST + 4. Default: https://api.entropy-data.com + """ + host = os.getenv("ENTROPY_DATA_HOST") + if host is None: + host = os.getenv("DATAMESH_MANAGER_HOST") + if host is None: + host = os.getenv("DATACONTRACT_MANAGER_HOST") + if host is None: + host = "https://api.entropy-data.com" + return host + + +def _extract_hostname(url: str) -> str: + """ + Extract the hostname (including subdomains and top-level domain) from a URL. + + Examples: + - https://app.entropy-data.com/path -> app.entropy-data.com + - http://api.example.com:8080/api -> api.example.com + """ + parsed = urlparse(url) + return parsed.netloc.split(":")[0] if parsed.netloc else url diff --git a/datacontract/lint/lint.py b/datacontract/lint/lint.py deleted file mode 100644 index 9c10bf602..000000000 --- a/datacontract/lint/lint.py +++ /dev/null @@ -1,142 +0,0 @@ -import abc -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Sequence, cast - -from datacontract.model.run import Check - -from ..model.data_contract_specification import DataContractSpecification - -"""This module contains linter definitions for linting a data contract. - -Lints are quality checks that can succeed, fail, or warn. They are -distinct from checks such as "valid yaml" or "file not found", which -will cause the processing of the data contract to stop. Lints can be -ignored, and are high-level requirements on the format of a data -contract.""" - - -class LintSeverity(Enum): - """The severity of a lint message. Generally, lint messages should be - emitted with a severity of ERROR. WARNING should be used when the linter - cannot determine a lint result, for example, when an unsupported model - type is used. - """ - - ERROR = 2 - WARNING = 1 - - -@dataclass -class LinterMessage: - """A single linter message with attached severity and optional "model" that - caused the message. 
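# A small sketch of the host fallback implemented in entropy_data.py above:
# with none of the three host variables set, _get_host() returns the default
# Entropy Data endpoint.
import os
from datacontract.integration.entropy_data import _get_host

for var in ("ENTROPY_DATA_HOST", "DATAMESH_MANAGER_HOST", "DATACONTRACT_MANAGER_HOST"):
    os.environ.pop(var, None)
print(_get_host())  # https://api.entropy-data.com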
- - Attributes: - outcome: The outcome of the linting, either ERROR or WARNING. Linting outcomes with level WARNING are discarded for now. - message: A message describing the error or warning in more detail. - model: The model that caused the lint to fail. Is optional. - - """ - - outcome: LintSeverity - message: str - model: Any = None - - @classmethod - def error(cls, message: str, model=None): - return LinterMessage(LintSeverity.ERROR, message, model) - - @classmethod - def warning(cls, message: str, model=None): - return LinterMessage(LintSeverity.WARNING, message, model) - - -@dataclass -class LinterResult: - """Result of linting a contract. Contains multiple LinterResults from - the same linter or lint phase. - - Attributes: - linter: The linter that produced these results - results: A list of linting results. Multiple identical linting - results can be present in the list. An empty list means that - the linter ran without producing warnings or errors. - """ - - results: Sequence[LinterMessage] = field(default_factory=list) - - @classmethod - def erroneous(cls, message, model=None): - return cls([LinterMessage.error(message, model)]) - - @classmethod - def cautious(cls, message, model=None): - return cls([LinterMessage.warning(message, model)]) - - def with_warning(self, message, model=None): - result = LinterMessage.warning(message, model) - return LinterResult(cast(list[LinterMessage], self.results) + [result]) - - def with_error(self, message, model=None): - result = LinterMessage.error(message, model) - return LinterResult(cast(list[LinterMessage], self.results) + [result]) - - def has_errors(self) -> bool: - return any(map(lambda result: result.outcome == LintSeverity.ERROR, self.results)) - - def has_warnings(self) -> bool: - return any(map(lambda result: result.outcome == LintSeverity.WARNING, self.results)) - - def error_results(self) -> Sequence[LinterMessage]: - return [result for result in self.results if result.outcome == LintSeverity.ERROR] - - def warning_results(self) -> Sequence[LinterMessage]: - return [result for result in self.results if result.outcome == LintSeverity.WARNING] - - def no_errors_or_warnings(self) -> bool: - return len(self.results) == 0 - - def combine(self, other: "LinterResult") -> "LinterResult": - return LinterResult(cast(list[Any], self.results) + cast(list[Any], other.results)) - - -class Linter(abc.ABC): - @property - @abc.abstractmethod - def name(self) -> str: - """Human-readable name of the linter.""" - pass - - @property - @abc.abstractmethod - def id(self) -> str: - """A linter ID for configuration (i.e. enabling and disabling).""" - pass - - @abc.abstractmethod - def lint_implementation(self, contract: DataContractSpecification) -> LinterResult: - pass - - def lint(self, contract: DataContractSpecification) -> list[Check]: - """Call with a data contract to get a list of check results from the linter.""" - result = self.lint_implementation(contract) - checks = [] - if not result.error_results(): - checks.append(Check(type="lint", name=f"Linter '{self.name}'", result="passed", engine="datacontract")) - else: - # All linter messages are treated as warnings. Severity is - # currently ignored, but could be used in filtering in the future - # Linter messages with level WARNING are currently ignored, but might - # be logged or printed in the future. 
- for lint_error in result.error_results(): - checks.append( - Check( - type="lint", - name=f"Linter '{self.name}'", - result="warning", - engine="datacontract", - reason=lint_error.message, - ) - ) - return checks diff --git a/datacontract/lint/linters/__init__.py b/datacontract/lint/linters/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/datacontract/lint/linters/description_linter.py b/datacontract/lint/linters/description_linter.py deleted file mode 100644 index 2f88615b8..000000000 --- a/datacontract/lint/linters/description_linter.py +++ /dev/null @@ -1,33 +0,0 @@ -from datacontract.model.data_contract_specification import DataContractSpecification - -from ..lint import Linter, LinterResult - - -class DescriptionLinter(Linter): - """Check for a description on contracts, models, model fields, definitions and examples.""" - - @property - def name(self) -> str: - return "Objects have descriptions" - - @property - def id(self) -> str: - return "description" - - def lint_implementation(self, contract: DataContractSpecification) -> LinterResult: - result = LinterResult() - if not contract.info or not contract.info.description: - result = result.with_error("Contract has empty description.") - for model_name, model in contract.models.items(): - if not model.description: - result = result.with_error(f"Model '{model_name}' has empty description.") - for field_name, field in model.fields.items(): - if not field.description: - result = result.with_error(f"Field '{field_name}' in model '{model_name}' has empty description.") - for definition_name, definition in contract.definitions.items(): - if not definition.description: - result = result.with_error(f"Definition '{definition_name}' has empty description.") - for index, example in enumerate(contract.examples): - if not example.description: - result = result.with_error(f"Example {index + 1} has empty description.") - return result diff --git a/datacontract/lint/linters/field_pattern_linter.py b/datacontract/lint/linters/field_pattern_linter.py deleted file mode 100644 index b9add00ef..000000000 --- a/datacontract/lint/linters/field_pattern_linter.py +++ /dev/null @@ -1,34 +0,0 @@ -import re - -from datacontract.model.data_contract_specification import DataContractSpecification - -from ..lint import Linter, LinterResult - - -class FieldPatternLinter(Linter): - """Checks that all patterns defined for fields are correct Python regex - syntax. 
- - """ - - @property - def name(self): - return "Field pattern is correct regex" - - @property - def id(self) -> str: - return "field-pattern" - - def lint_implementation(self, contract: DataContractSpecification) -> LinterResult: - result = LinterResult() - for model_name, model in contract.models.items(): - for field_name, field in model.fields.items(): - if field.pattern: - try: - re.compile(field.pattern) - except re.error as e: - result = result.with_error( - f"Failed to compile pattern regex '{field.pattern}' for " - f"field '{field_name}' in model '{model_name}': {e.msg}" - ) - return result diff --git a/datacontract/lint/linters/field_reference_linter.py b/datacontract/lint/linters/field_reference_linter.py deleted file mode 100644 index 0a9a52435..000000000 --- a/datacontract/lint/linters/field_reference_linter.py +++ /dev/null @@ -1,47 +0,0 @@ -from datacontract.model.data_contract_specification import DataContractSpecification - -from ..lint import Linter, LinterResult - - -class FieldReferenceLinter(Linter): - """Checks that all references definitions in fields refer to existing - fields. - - """ - - @property - def name(self): - return "Field references existing field" - - @property - def id(self) -> str: - return "field-reference" - - def lint_implementation(self, contract: DataContractSpecification) -> LinterResult: - result = LinterResult() - for model_name, model in contract.models.items(): - for field_name, field in model.fields.items(): - if field.references: - reference_hierarchy = field.references.split(".") - if len(reference_hierarchy) != 2: - result = result.with_error( - f"Field '{field_name}' in model '{model_name}'" - f" references must follow the model.field syntax and refer to a field in a model in this data contract." - ) - continue - ref_model = reference_hierarchy[0] - ref_field = reference_hierarchy[1] - - if ref_model not in contract.models: - result = result.with_error( - f"Field '{field_name}' in model '{model_name}' references non-existing model '{ref_model}'." - ) - else: - ref_model_obj = contract.models[ref_model] - if ref_field not in ref_model_obj.fields: - result = result.with_error( - f"Field '{field_name}' in model '{model_name}'" - f" references non-existing field '{ref_field}'" - f" in model '{ref_model}'." - ) - return result diff --git a/datacontract/lint/linters/notice_period_linter.py b/datacontract/lint/linters/notice_period_linter.py deleted file mode 100644 index 7051bc4f6..000000000 --- a/datacontract/lint/linters/notice_period_linter.py +++ /dev/null @@ -1,55 +0,0 @@ -import re - -from datacontract.model.data_contract_specification import DataContractSpecification - -from ..lint import Linter, LinterResult - - -class NoticePeriodLinter(Linter): - @property - def name(self) -> str: - return "noticePeriod in ISO8601 format" - - @property - def id(self) -> str: - return "notice-period" - - # Regex matching the "simple" ISO8601 duration format - simple = re.compile( - r"""P # Introduces period - (:?[0-9\.,]+Y)? # Number of years - (:?[0-9\.,]+M)? # Number of months - (:?[0-9\.,]+W)? # Number of weeks - (:?[0-9\.,]+D)? # Number of days - (:? # Time part (optional) - T # Always starts with T - (:?[0-9\.,]+H)? # Number of hours - (:?[0-9\.,]+M)? # Number of minutes - (:?[0-9\.,]+S)? # Number of seconds - )? 
- """, - re.VERBOSE, - ) - datetime_basic = re.compile(r"P\d{8}T\d{6}") - datetime_extended = re.compile(r"P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}") - - def lint_implementation(self, contract: DataContractSpecification) -> LinterResult: - """Check whether the notice period is specified using ISO8601 duration syntax.""" - if not contract.terms: - return LinterResult.cautious("No terms defined.") - period = contract.terms.noticePeriod - if not period: - return LinterResult.cautious("No notice period defined.") - if not period.startswith("P"): - return LinterResult.erroneous(f"Notice period '{period}' is not a valid ISO8601 duration.") - if period == "P": - return LinterResult.erroneous( - "Notice period 'P' is not a valid ISO8601 duration, requires at least one duration to be specified." - ) - if ( - not self.simple.fullmatch(period) - and not self.datetime_basic.fullmatch(period) - and not self.datetime_extended.fullmatch(period) - ): - return LinterResult.erroneous(f"Notice period '{period}' is not a valid ISO8601 duration.") - return LinterResult() diff --git a/datacontract/lint/linters/valid_constraints_linter.py b/datacontract/lint/linters/valid_constraints_linter.py deleted file mode 100644 index 54afea84d..000000000 --- a/datacontract/lint/linters/valid_constraints_linter.py +++ /dev/null @@ -1,100 +0,0 @@ -from datacontract.model.data_contract_specification import DataContractSpecification, Field - -from ..lint import Linter, LinterResult - - -class ValidFieldConstraintsLinter(Linter): - """Check validity of field constraints. - - More precisely, check that only numeric constraints are specified on - fields of numeric type and string constraints on fields of string type. - Additionally, the linter checks that defined constraints make sense. - Minimum values should not be greater than maximum values, exclusive and - non-exclusive minimum and maximum should not be combined and string - pattern and format should not be combined. - - """ - - valid_types_for_constraint = { - "pattern": set(["string", "text", "varchar"]), - "format": set(["string", "text", "varchar"]), - "minLength": set(["string", "text", "varchar"]), - "maxLength": set(["string", "text", "varchar"]), - "minimum": set(["int", "integer", "number", "decimal", "numeric", "long", "bigint", "float", "double"]), - "exclusiveMinimum": set( - ["int", "integer", "number", "decimal", "numeric", "long", "bigint", "float", "double"] - ), - "maximum": set(["int", "integer", "number", "decimal", "numeric", "long", "bigint", "float", "double"]), - "exclusiveMaximum": set( - ["int", "integer", "number", "decimal", "numeric", "long", "bigint", "float", "double"] - ), - } - - def check_minimum_maximum(self, field: Field, field_name: str, model_name: str) -> LinterResult: - (min, max, xmin, xmax) = (field.minimum, field.maximum, field.exclusiveMinimum, field.exclusiveMaximum) - match ( - "minimum" in field.model_fields_set, - "maximum" in field.model_fields_set, - "exclusiveMinimum" in field.model_fields_set, - "exclusiveMaximum" in field.model_fields_set, - ): - case (True, True, _, _) if min > max: - return LinterResult.erroneous( - f"Minimum {min} is greater than maximum {max} on field '{field_name}' in model '{model_name}'." - ) - case (_, _, True, True) if xmin >= xmax: - return LinterResult.erroneous( - f"Exclusive minimum {xmin} is greater than exclusive" - f" maximum {xmax} on field '{field_name}' in model '{model_name}'." 
- ) - case (True, True, True, True): - return LinterResult.erroneous( - f"Both exclusive and non-exclusive minimum and maximum are " - f"defined on field '{field_name}' in model '{model_name}'." - ) - case (True, _, True, _): - return LinterResult.erroneous( - f"Both exclusive and non-exclusive minimum are " - f"defined on field '{field_name}' in model '{model_name}'." - ) - case (_, True, _, True): - return LinterResult.erroneous( - f"Both exclusive and non-exclusive maximum are " - f"defined on field '{field_name}' in model '{model_name}'." - ) - return LinterResult() - - def check_string_constraints(self, field: Field, field_name: str, model_name: str) -> LinterResult: - result = LinterResult() - if field.minLength and field.maxLength and field.minLength > field.maxLength: - result = result.with_error( - f"Minimum length is greater that maximum length on field '{field_name}' in model '{model_name}'." - ) - if field.pattern and field.format: - result = result.with_error( - f"Both a pattern and a format are defined for field '{field_name}' in model '{model_name}'." - ) - return result - - @property - def name(self): - return "Fields use valid constraints" - - @property - def id(self): - return "field-constraints" - - def lint_implementation(self, contract: DataContractSpecification) -> LinterResult: - result = LinterResult() - for model_name, model in contract.models.items(): - for field_name, field in model.fields.items(): - for _property, allowed_types in self.valid_types_for_constraint.items(): - if _property in field.model_fields_set and field.type not in allowed_types: - result = result.with_error( - f"Forbidden constraint '{_property}' defined on field " - f"'{field_name}' in model '{model_name}'. Field type " - f"is '{field.type}'." - ) - result = result.combine(self.check_minimum_maximum(field, field_name, model_name)) - result = result.combine(self.check_string_constraints(field, field_name, model_name)) - return result diff --git a/datacontract/lint/resolve.py b/datacontract/lint/resolve.py index 575fdc6a5..428d861e7 100644 --- a/datacontract/lint/resolve.py +++ b/datacontract/lint/resolve.py @@ -1,42 +1,44 @@ +import importlib.resources as resources import logging -import os -import warnings +from pathlib import Path import fastjsonschema import yaml from fastjsonschema import JsonSchemaValueException -from open_data_contract_standard.model import OpenDataContractStandard +from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty -from datacontract.imports.odcs_v3_importer import import_from_odcs, parse_odcs_v3_from_str from datacontract.lint.resources import read_resource from datacontract.lint.schema import fetch_schema -from datacontract.lint.urls import fetch_resource -from datacontract.model.data_contract_specification import ( - DataContractSpecification, - Definition, - DeprecatedQuality, -) from datacontract.model.exceptions import DataContractException -from datacontract.model.odcs import is_open_data_contract_standard +from datacontract.model.odcs import is_open_data_contract_standard, is_open_data_product_standard from datacontract.model.run import ResultEnum -def resolve_data_contract( +class _SafeLoaderNoTimestamp(yaml.SafeLoader): + """SafeLoader that keeps dates/timestamps as strings instead of converting to datetime objects.""" + + pass + + +# Remove the timestamp implicit resolver so dates like 2022-01-15 stay as strings +_SafeLoaderNoTimestamp.yaml_implicit_resolvers = { + k: [(tag, regexp) for tag, regexp in v if tag != 
"tag:yaml.org,2002:timestamp"] + for k, v in _SafeLoaderNoTimestamp.yaml_implicit_resolvers.copy().items() +} + + +def resolve_data_contract_dict( data_contract_location: str = None, data_contract_str: str = None, - data_contract: DataContractSpecification = None, - schema_location: str = None, - inline_definitions: bool = False, - inline_quality: bool = False, -) -> DataContractSpecification: + data_contract: OpenDataContractStandard = None, +) -> dict: + """Resolve a data contract and return it as a dictionary.""" if data_contract_location is not None: - return resolve_data_contract_from_location( - data_contract_location, schema_location, inline_definitions, inline_quality - ) + return _to_yaml(read_resource(data_contract_location)) elif data_contract_str is not None: - return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions, inline_quality) + return _to_yaml(data_contract_str) elif data_contract is not None: - return data_contract + return data_contract.model_dump() else: raise DataContractException( type="lint", @@ -47,22 +49,20 @@ def resolve_data_contract( ) -def resolve_data_contract_v2( +def resolve_data_contract( data_contract_location: str = None, data_contract_str: str = None, - data_contract: DataContractSpecification | OpenDataContractStandard = None, + data_contract: OpenDataContractStandard = None, schema_location: str = None, inline_definitions: bool = False, - inline_quality: bool = False, -) -> DataContractSpecification | OpenDataContractStandard: +) -> OpenDataContractStandard: + """Resolve and parse a data contract from various sources.""" if data_contract_location is not None: - return resolve_data_contract_from_location_v2( - data_contract_location, schema_location, inline_definitions, inline_quality + return resolve_data_contract_from_location( + data_contract_location, schema_location, inline_definitions ) elif data_contract_str is not None: - return _resolve_data_contract_from_str_v2( - data_contract_str, schema_location, inline_definitions, inline_quality - ) + return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions) elif data_contract is not None: return data_contract else: @@ -75,269 +75,95 @@ def resolve_data_contract_v2( ) -def resolve_data_contract_dict( - data_contract_location: str = None, - data_contract_str: str = None, - data_contract: DataContractSpecification = None, -) -> dict: - if data_contract_location is not None: - return _to_yaml(read_resource(data_contract_location)) - elif data_contract_str is not None: - return _to_yaml(data_contract_str) - elif data_contract is not None: - return data_contract.model_dump() - else: - raise DataContractException( - type="lint", - result=ResultEnum.failed, - name="Check that data contract YAML is valid", - reason="Data contract needs to be provided", - engine="datacontract", - ) - - -def resolve_data_contract_from_location_v2( - location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False -) -> DataContractSpecification | OpenDataContractStandard: - data_contract_str = read_resource(location) - return _resolve_data_contract_from_str_v2(data_contract_str, schema_location, inline_definitions, inline_quality) - - def resolve_data_contract_from_location( - location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False -) -> DataContractSpecification: + location, schema_location: str = None, inline_definitions: bool = False +) -> OpenDataContractStandard: 
data_contract_str = read_resource(location) - return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions, inline_quality) - - -def inline_definitions_into_data_contract(spec: DataContractSpecification): - for model in spec.models.values(): - for field in model.fields.values(): - inline_definition_into_field(field, spec) + return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions) -def inline_definition_into_field(field, spec): - # iterate recursively over arrays - if field.items is not None: - inline_definition_into_field(field.items, spec) - - # iterate recursively over nested fields - if field.fields is not None: - for nested_field_name, nested_field in field.fields.items(): - inline_definition_into_field(nested_field, spec) - - if not field.ref: +def inline_definitions_into_data_contract(data_contract: OpenDataContractStandard): + """Inline any definition references into the schema properties.""" + if data_contract.schema_ is None: return - definition = _resolve_definition_ref(field.ref, spec) - for field_name in field.model_fields.keys(): - if field_name in definition.model_fields_set and field_name not in field.model_fields_set: - setattr(field, field_name, getattr(definition, field_name)) - # extras - for extra_field_name, extra_field_value in definition.model_extra.items(): - if extra_field_name not in field.model_extra.keys(): - setattr(field, extra_field_name, extra_field_value) - - -def _resolve_definition_ref(ref, spec) -> Definition: - logging.info(f"Resolving definition ref {ref}") - - if "#" in ref: - path, definition_path = ref.split("#") - else: - path, definition_path = ref, None - - if path.startswith("http://") or path.startswith("https://"): - logging.info(f"Resolving definition url {path}") - - definition_str = fetch_resource(path) - definition_dict = _to_yaml(definition_str) - definition = Definition(**definition_dict) - if definition_path is not None: - return _find_by_path_in_definition(definition_path, definition) - else: - return definition - elif path.startswith("file://"): - logging.info(f"Resolving definition file path {path}") - - path = path.replace("file://", "") - definition_str = _fetch_file(path) - definition_dict = _to_yaml(definition_str) - if definition_path: - path_parts = [part for part in definition_path.split("/") if part != ""] - for path_part in path_parts: - definition_dict = definition_dict.get(path_part, None) - if not definition_dict: - raise DataContractException( - type="lint", - result="failed", - name="Check that data contract YAML is valid", - reason=f"Cannot resolve definition {definition_path}, {path_part} not found", - engine="datacontract", - ) - # this assumes that definitions_dict is a definitions dict, however, - # all we know is that it is a file! 
- definition = Definition(**definition_dict) - # if definition_path is not None: - # definition = _find_by_path_in_definition(definition_path, definition) - return definition - elif ref.startswith("#"): - logging.info(f"Resolving definition local path {path}") - - definition_path = ref[1:] - - return _find_by_path_in_spec(definition_path, spec) - else: - raise DataContractException( - type="lint", - result=ResultEnum.failed, - name="Check that data contract YAML is valid", - reason=f"Cannot resolve reference {ref}", - engine="datacontract", - ) - - -def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification): - path_elements = definition_path.split("/") - definition_key = path_elements[2] - if definition_key not in spec.definitions: - raise DataContractException( - type="lint", - result=ResultEnum.failed, - name="Check that data contract YAML is valid", - reason=f"Cannot resolve definition {definition_key}", - engine="datacontract", - ) - definition = spec.definitions[definition_key] - definition = _find_subfield_in_definition(definition, path_elements[3:]) - return definition - + for schema_obj in data_contract.schema_: + if schema_obj.properties: + for prop in schema_obj.properties: + inline_definition_into_property(prop, data_contract) -def _find_by_path_in_definition(definition_path: str, definition: Definition): - if definition_path == "" or definition_path == "/": - return definition - path_elements = definition_path.split("/") - return _find_subfield_in_definition(definition, path_elements[1:]) +def inline_definition_into_property(prop: SchemaProperty, data_contract: OpenDataContractStandard): + """Recursively inline definitions into a property and its nested properties.""" + # Iterate over items for arrays + if prop.items is not None: + inline_definition_into_property(prop.items, data_contract) + # Iterate over nested properties + if prop.properties is not None: + for nested_prop in prop.properties: + inline_definition_into_property(nested_prop, data_contract) -def _find_subfield_in_definition(definition: Definition, path_elements): - while len(path_elements) > 0 and path_elements[0] == "fields": - definition = definition.fields[path_elements[1]] - path_elements = path_elements[2:] + # No definition $ref support in ODCS at the moment + # ODCS uses a different approach - definitions would be handled differently - return definition +def _resolve_data_contract_from_str( + data_contract_str, schema_location: str = None, inline_definitions: bool = False +) -> OpenDataContractStandard: + yaml_dict = _to_yaml(data_contract_str) -def _fetch_file(path) -> str: - if not os.path.exists(path): + if is_open_data_product_standard(yaml_dict): + logging.info("Cannot import ODPS, as not supported") raise DataContractException( - type="export", + type="schema", result=ResultEnum.failed, - name="Check that data contract definition is valid", - reason=f"Cannot resolve reference {path}", + name="Parse ODCS contract", + reason="Cannot parse ODPS product", engine="datacontract", ) - with open(path, "r") as file: - return file.read() - - -def _resolve_quality_ref(quality: DeprecatedQuality): - """ - Return the content of a ref file path - @param quality data contract quality specification - """ - if isinstance(quality.specification, dict): - specification = quality.specification - if quality.type == "great-expectations": - for model, model_quality in specification.items(): - specification[model] = _get_quality_ref_file(model_quality) - else: - if "$ref" in specification: - 
quality.specification = _get_quality_ref_file(specification) - - -def _get_quality_ref_file(quality_spec: str | object) -> str | object: - """ - Get the file associated with a quality reference - @param quality_spec quality specification - @returns: the content of the quality file - """ - if isinstance(quality_spec, dict) and "$ref" in quality_spec: - ref = quality_spec["$ref"] - if not os.path.exists(ref): - raise DataContractException( - type="export", - result=ResultEnum.failed, - name="Check that data contract quality is valid", - reason=f"Cannot resolve reference {ref}", - engine="datacontract", - ) - with open(ref, "r") as file: - quality_spec = file.read() - return quality_spec - - -def _resolve_data_contract_from_str_v2( - data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False -) -> DataContractSpecification | OpenDataContractStandard: - yaml_dict = _to_yaml(data_contract_str) if is_open_data_contract_standard(yaml_dict): logging.info("Importing ODCS v3") - # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly - odcs = parse_odcs_v3_from_str(data_contract_str) + # Validate the ODCS schema + if schema_location is None: + schema_location = resources.files("datacontract").joinpath("schemas", "odcs-3.1.0.schema.json") + _validate_json_schema(yaml_dict, schema_location) + + # Parse ODCS directly + odcs = _parse_odcs_from_dict(yaml_dict) + if inline_definitions: + inline_definitions_into_data_contract(odcs) return odcs - logging.info("Importing DCS") - return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict) - - -def _resolve_data_contract_from_str( - data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False -) -> DataContractSpecification: - yaml_dict = _to_yaml(data_contract_str) - - if is_open_data_contract_standard(yaml_dict): - logging.info("Importing ODCS v3") - # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly - odcs = parse_odcs_v3_from_str(data_contract_str) - - data_contract_specification = DataContractSpecification(dataContractSpecification="1.2.0") - return import_from_odcs(data_contract_specification, odcs) + # For DCS format, we need to convert it to ODCS + logging.info("Importing DCS format - converting to ODCS") + from datacontract.imports.dcs_importer import convert_dcs_to_odcs, parse_dcs_from_dict - logging.info("Importing DCS") - return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict) + dcs = parse_dcs_from_dict(yaml_dict) + odcs = convert_dcs_to_odcs(dcs) + if inline_definitions: + inline_definitions_into_data_contract(odcs) + return odcs -def _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict): - _validate_data_contract_specification_schema(yaml_dict, schema_location) - data_contract_specification = yaml_dict - spec = DataContractSpecification(**data_contract_specification) - if inline_definitions: - inline_definitions_into_data_contract(spec) - ## Suppress DeprecationWarning when accessing spec.quality, - ## iif it is in fact *not* used. 
- with warnings.catch_warnings(record=True) as recorded_warnings: - spec_quality = spec.quality - for w in recorded_warnings: - if not issubclass(w.category, DeprecationWarning) or spec_quality is not None: - warnings.warn_explicit( - message=w.message, - category=w.category, - filename=w.filename, - lineno=w.lineno, - source=w.source, - ) - if spec_quality and inline_quality: - _resolve_quality_ref(spec_quality) - return spec +def _parse_odcs_from_dict(yaml_dict: dict) -> OpenDataContractStandard: + """Parse ODCS from a dictionary.""" + try: + return OpenDataContractStandard(**yaml_dict) + except Exception as e: + raise DataContractException( + type="schema", + name="Parse ODCS contract", + reason=f"Failed to parse ODCS contract: {str(e)}", + engine="datacontract", + original_exception=e, + ) def _to_yaml(data_contract_str) -> dict: try: - yaml_dict = yaml.safe_load(data_contract_str) - return yaml_dict + return yaml.load(data_contract_str, Loader=_SafeLoaderNoTimestamp) except Exception as e: logging.warning(f"Cannot parse YAML. Error: {str(e)}") raise DataContractException( @@ -349,16 +175,17 @@ def _to_yaml(data_contract_str) -> dict: ) -def _validate_data_contract_specification_schema(data_contract_yaml, schema_location: str = None): +def _validate_json_schema(yaml_str, schema_location: str | Path = None): + logging.debug(f"Linting data contract with schema at {schema_location}") schema = fetch_schema(schema_location) try: - fastjsonschema.validate(schema, data_contract_yaml, use_default=False) + fastjsonschema.validate(schema, yaml_str, use_default=False) logging.debug("YAML data is valid.") except JsonSchemaValueException as e: logging.warning(f"Data Contract YAML is invalid. Validation error: {e.message}") raise DataContractException( type="lint", - result="failed", + result=ResultEnum.failed, name="Check that data contract YAML is valid", reason=e.message, engine="datacontract", @@ -367,7 +194,7 @@ def _validate_data_contract_specification_schema(data_contract_yaml, schema_loca logging.warning(f"Data Contract YAML is invalid. Validation error: {str(e)}") raise DataContractException( type="lint", - result="failed", + result=ResultEnum.failed, name="Check that data contract YAML is valid", reason=str(e), engine="datacontract", diff --git a/datacontract/lint/schema.py b/datacontract/lint/schema.py index 52a2eec38..b0e7867aa 100644 --- a/datacontract/lint/schema.py +++ b/datacontract/lint/schema.py @@ -2,16 +2,18 @@ import json import logging import os +from pathlib import Path from typing import Any, Dict import requests from datacontract.model.exceptions import DataContractException +from datacontract.model.run import ResultEnum -DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.0.schema.json" +DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.1.schema.json" -def fetch_schema(location: str = None) -> Dict[str, Any]: +def fetch_schema(location: str | Path = None) -> Dict[str, Any]: """ Fetch and return a JSON schema from a given location. 
@@ -36,19 +38,26 @@ def fetch_schema(location: str = None) -> Dict[str, Any]: schema_file = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_SCHEMA) with schema_file.open("r") as file: schema = json.load(file) - elif location.startswith("http://") or location.startswith("https://"): - response = requests.get(location) - schema = response.json() else: - if not os.path.exists(location): - raise DataContractException( - type="lint", - name=f"Reading schema from {location}", - reason=f"The file '{location}' does not exist.", - engine="datacontract", - result="error", - ) - with open(location, "r") as file: - schema = json.load(file) + # Convert Path objects to strings for string operations + location_str = str(location) + + if location_str.startswith("http://") or location_str.startswith("https://"): + logging.debug(f"Downloading schema from {location_str}") + response = requests.get(location_str) + schema = response.json() + else: + if not os.path.exists(location): + raise DataContractException( + type="lint", + name=f"Reading schema from {location}", + reason=f"The file '{location}' does not exist.", + engine="datacontract", + result=ResultEnum.error, + ) + + logging.debug(f"Loading JSON schema locally at {location}") + with open(location, "r") as file: + schema = json.load(file) return schema diff --git a/datacontract/lint/urls.py b/datacontract/lint/urls.py index 6279e6187..ce943118e 100644 --- a/datacontract/lint/urls.py +++ b/datacontract/lint/urls.py @@ -28,10 +28,22 @@ def fetch_resource(url: str): def _set_api_key(headers, url): hostname = urlparse(url).hostname + entropy_data_api_key = os.getenv("ENTROPY_DATA_API_KEY") datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY") datacontract_manager_api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY") - if hostname == "datamesh-manager.com" or hostname.endswith(".datamesh-manager.com"): + if hostname == "entropy-data.com" or hostname.endswith(".entropy-data.com"): + if entropy_data_api_key is None or entropy_data_api_key == "": + print("Error: Entropy Data API key is not set. Set env variable ENTROPY_DATA_API_KEY.") + raise DataContractException( + type="lint", + name=f"Reading data contract from {url}", + reason="Error: Entropy Data API key is not set. Set env variable ENTROPY_DATA_API_KEY.", + engine="datacontract", + result="error", + ) + headers["x-api-key"] = entropy_data_api_key + elif hostname == "datamesh-manager.com" or hostname.endswith(".datamesh-manager.com"): if datamesh_manager_api_key is None or datamesh_manager_api_key == "": print("Error: Data Mesh Manager API key is not set. 
Set env variable DATAMESH_MANAGER_API_KEY.") raise DataContractException( @@ -54,7 +66,9 @@ def _set_api_key(headers, url): ) headers["x-api-key"] = datacontract_manager_api_key - if datamesh_manager_api_key is not None and datamesh_manager_api_key != "": - headers["x-api-key"] = datamesh_manager_api_key if datacontract_manager_api_key is not None and datacontract_manager_api_key != "": headers["x-api-key"] = datacontract_manager_api_key + if datamesh_manager_api_key is not None and datamesh_manager_api_key != "": + headers["x-api-key"] = datamesh_manager_api_key + if entropy_data_api_key is not None and entropy_data_api_key != "": + headers["x-api-key"] = entropy_data_api_key diff --git a/datacontract/model/data_contract_specification/__init__.py b/datacontract/model/data_contract_specification/__init__.py deleted file mode 100644 index 2141d2da0..000000000 --- a/datacontract/model/data_contract_specification/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from datacontract_specification.model import * diff --git a/datacontract/model/odcs.py b/datacontract/model/odcs.py index 3b040918e..9f7cd6675 100644 --- a/datacontract/model/odcs.py +++ b/datacontract/model/odcs.py @@ -9,3 +9,16 @@ def is_open_data_contract_standard(odcs: dict) -> bool: bool: True if the dictionary is an OpenDataContractStandard, False otherwise. """ return odcs.get("kind") == "DataContract" and odcs.get("apiVersion", "").startswith("v3") + + +def is_open_data_product_standard(odcs: dict) -> bool: + """ + Check if the given dictionary is an open data product standard. + + Args: + odcs (dict): The dictionary to check. + + Returns: + bool: True if the dictionary is an open data product standard, False otherwise. + """ + return odcs.get("kind") == "DataProduct" and odcs.get("apiVersion", "").startswith("v1") diff --git a/datacontract/output/test_results_writer.py b/datacontract/output/test_results_writer.py index 56d9f3379..7cae4888a 100644 --- a/datacontract/output/test_results_writer.py +++ b/datacontract/output/test_results_writer.py @@ -1,6 +1,8 @@ from pathlib import Path +from typing import Optional import typer +from open_data_contract_standard.model import OpenDataContractStandard from rich import box from rich.console import Console from rich.table import Table @@ -10,10 +12,48 @@ from datacontract.output.output_format import OutputFormat -def write_test_result(run: Run, console: Console, output_format: OutputFormat, output_path: Path): +def write_test_result( + run: Run, + console: Console, + output_format: OutputFormat, + output_path: Path, + data_contract: Optional[OpenDataContractStandard] = None, +): if output_format == OutputFormat.junit: write_junit_test_results(run, console, output_path) + if run.server and data_contract and data_contract.servers: + server = next((s for s in data_contract.servers if s.server == run.server), None) + if server: + details = [] + if server.type: + details.append(f"type={server.type}") + if server.format: + details.append(f"format={server.format}") + if server.host: + details.append(f"host={server.host}") + if server.port: + details.append(f"port={server.port}") + if server.database: + details.append(f"database={server.database}") + if server.schema_: + details.append(f"schema={server.schema_}") + if server.catalog: + details.append(f"catalog={server.catalog}") + if server.dataset: + details.append(f"dataset={server.dataset}") + if server.project: + details.append(f"project={server.project}") + if server.account: + details.append(f"account={server.account}") + if server.location: + 
details.append(f"location={server.location}") + if server.path: + details.append(f"path={server.path}") + details_str = ", ".join(details) if details else "" + if details_str: + console.print(f"Server: {run.server} ({details_str})") + _print_table(run, console) if run.result == "passed": console.print( diff --git a/datacontract/schemas/datacontract-1.1.0.init.yaml b/datacontract/schemas/datacontract-1.1.0.init.yaml index 2528401a2..29baf9bf8 100644 --- a/datacontract/schemas/datacontract-1.1.0.init.yaml +++ b/datacontract/schemas/datacontract-1.1.0.init.yaml @@ -1,4 +1,4 @@ -dataContractSpecification: 1.2.0 +dataContractSpecification: 1.2.1 id: my-data-contract-id info: title: My Data Contract diff --git a/datacontract/schemas/datacontract-1.2.0.init.yaml b/datacontract/schemas/datacontract-1.2.0.init.yaml index 2528401a2..29baf9bf8 100644 --- a/datacontract/schemas/datacontract-1.2.0.init.yaml +++ b/datacontract/schemas/datacontract-1.2.0.init.yaml @@ -1,4 +1,4 @@ -dataContractSpecification: 1.2.0 +dataContractSpecification: 1.2.1 id: my-data-contract-id info: title: My Data Contract diff --git a/datacontract/schemas/datacontract-1.2.1.init.yaml b/datacontract/schemas/datacontract-1.2.1.init.yaml new file mode 100644 index 000000000..29baf9bf8 --- /dev/null +++ b/datacontract/schemas/datacontract-1.2.1.init.yaml @@ -0,0 +1,91 @@ +dataContractSpecification: 1.2.1 +id: my-data-contract-id +info: + title: My Data Contract + version: 0.0.1 +# description: +# owner: +# contact: +# name: +# url: +# email: + + +### servers + +#servers: +# production: +# type: s3 +# location: s3:// +# format: parquet +# delimiter: new_line + +### terms + +#terms: +# usage: +# limitations: +# billing: +# noticePeriod: + + +### models + +# models: +# my_model: +# description: +# type: +# fields: +# my_field: +# type: +# description: + + +### definitions + +# definitions: +# my_field: +# domain: +# name: +# title: +# type: +# description: +# example: +# pii: +# classification: + + +### servicelevels + +#servicelevels: +# availability: +# description: The server is available during support hours +# percentage: 99.9% +# retention: +# description: Data is retained for one year because! +# period: P1Y +# unlimited: false +# latency: +# description: Data is available within 25 hours after the order was placed +# threshold: 25h +# sourceTimestampField: orders.order_timestamp +# processedTimestampField: orders.processed_timestamp +# freshness: +# description: The age of the youngest row in a table. +# threshold: 25h +# timestampField: orders.order_timestamp +# frequency: +# description: Data is delivered once a day +# type: batch # or streaming +# interval: daily # for batch, either or cron +# cron: 0 0 * * * # for batch, either or interval +# support: +# description: The data is available during typical business hours at headquarters +# time: 9am to 5pm in EST on business days +# responseTime: 1h +# backup: +# description: Data is backed up once a week, every Sunday at 0:00 UTC. 
+# interval: weekly +# cron: 0 0 * * 0 +# recoveryTime: 24 hours +# recoveryPoint: 1 week diff --git a/datacontract/schemas/datacontract-1.2.1.schema.json b/datacontract/schemas/datacontract-1.2.1.schema.json new file mode 100644 index 000000000..b888b1e9c --- /dev/null +++ b/datacontract/schemas/datacontract-1.2.1.schema.json @@ -0,0 +1,2058 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "title": "DataContractSpecification", + "properties": { + "dataContractSpecification": { + "type": "string", + "title": "DataContractSpecificationVersion", + "enum": [ + "1.2.1", + "1.2.0", + "1.1.0", + "0.9.3", + "0.9.2", + "0.9.1", + "0.9.0" + ], + "description": "Specifies the Data Contract Specification being used." + }, + "id": { + "type": "string", + "description": "Specifies the identifier of the data contract." + }, + "info": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The title of the data contract." + }, + "version": { + "type": "string", + "description": "The version of the data contract document (which is distinct from the Data Contract Specification version or the Data Product implementation version)." + }, + "status": { + "type": "string", + "description": "The status of the data contract. Can be proposed, in development, active, retired.", + "examples": [ + "proposed", + "in development", + "active", + "deprecated", + "retired" + ] + }, + "description": { + "type": "string", + "description": "A description of the data contract." + }, + "owner": { + "type": "string", + "description": "The owner or team responsible for managing the data contract and providing the data." + }, + "contact": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The identifying name of the contact person/organization." + }, + "url": { + "type": "string", + "format": "uri", + "description": "The URL pointing to the contact information. This MUST be in the form of a URL." + }, + "email": { + "type": "string", + "format": "email", + "description": "The email address of the contact person/organization. This MUST be in the form of an email address." + } + }, + "description": "Contact information for the data contract.", + "additionalProperties": true + } + }, + "additionalProperties": true, + "required": [ + "title", + "version" + ], + "description": "Metadata and life cycle information about the data contract." 
+ }, + "servers": { + "type": "object", + "description": "Information about the servers.", + "additionalProperties": { + "$ref": "#/$defs/BaseServer", + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "bigquery" + } + } + }, + "then": { + "$ref": "#/$defs/BigQueryServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "postgres" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/PostgresServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "s3" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/S3Server" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sftp" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/SftpServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "redshift" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/RedshiftServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "azure" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/AzureServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sqlserver" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/SqlserverServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "snowflake" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/SnowflakeServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "databricks" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/DatabricksServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "dataframe" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/DataframeServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "glue" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/GlueServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "postgres" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/PostgresServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "oracle" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/OracleServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "kafka" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/KafkaServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "pubsub" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/PubSubServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "kinesis" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/KinesisDataStreamsServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "trino" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/TrinoServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "clickhouse" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/ClickhouseServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "local" + } + }, + "required": [ + "type" + ] + }, + "then": { + "$ref": "#/$defs/LocalServer" + } + } + ] + } + }, + "terms": { + "type": "object", + "description": "The terms and conditions of the data contract.", + "properties": { + "usage": { + "type": "string", + "description": "The usage describes the way the data is expected to be used. Can contain business and technical information." 
+ }, + "limitations": { + "type": "string", + "description": "The limitations describe the restrictions on how the data can be used, can be technical or restrictions on what the data may not be used for." + }, + "policies": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "The type of the policy.", + "examples": [ + "privacy", + "security", + "retention", + "compliance" + ] + }, + "description": { + "type": "string", + "description": "A description of the policy." + }, + "url": { + "type": "string", + "format": "uri", + "description": "A URL to the policy document." + } + }, + "additionalProperties": true + }, + "description": "The limitations describe the restrictions on how the data can be used, can be technical or restrictions on what the data may not be used for." + }, + "billing": { + "type": "string", + "description": "The billing describes the pricing model for using the data, such as whether it's free, having a monthly fee, or metered pay-per-use." + }, + "noticePeriod": { + "type": "string", + "description": "The period of time that must be given by either party to terminate or modify a data usage agreement. Uses ISO-8601 period format, e.g., 'P3M' for a period of three months." + } + }, + "additionalProperties": true + }, + "models": { + "description": "Specifies the logical data model. Use the models name (e.g., the table name) as the key.", + "type": "object", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "object", + "title": "Model", + "properties": { + "description": { + "type": "string" + }, + "type": { + "description": "The type of the model. Examples: table, view, object. Default: table.", + "type": "string", + "title": "ModelType", + "default": "table", + "enum": [ + "table", + "view", + "object" + ] + }, + "title": { + "type": "string", + "description": "An optional string providing a human readable name for the model. Especially useful if the model name is cryptic or contains abbreviations.", + "examples": [ + "Purchase Orders", + "Air Shipments" + ] + }, + "fields": { + "description": "Specifies a field in the data model. Use the field name (e.g., the column name) as the key.", + "type": "object", + "additionalProperties": { + "type": "object", + "title": "Field", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the semantic of the data in this field." + }, + "title": { + "type": "string", + "description": "An optional string providing a human readable name for the field. Especially useful if the field name is cryptic or contains abbreviations." + }, + "type": { + "$ref": "#/$defs/FieldType" + }, + "required": { + "type": "boolean", + "default": false, + "description": "An indication, if this field must contain a value and may not be null." + }, + "fields": { + "description": "The nested fields (e.g. 
columns) of the object, record, or struct.", + "type": "object", + "additionalProperties": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + } + }, + "items": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "keys": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "values": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "primary": { + "type": "boolean", + "deprecationMessage": "Use the primaryKey field instead." + }, + "primaryKey": { + "type": "boolean", + "default": false, + "description": "If this field is a primary key." + }, + "references": { + "type": "string", + "description": "The reference to a field in another model. E.g. use 'orders.order_id' to reference the order_id field of the model orders. Think of defining a foreign key relationship.", + "examples": [ + "orders.order_id", + "model.nested_field.field" + ] + }, + "unique": { + "type": "boolean", + "default": false, + "description": "An indication, if the value must be unique within the model." + }, + "enum": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true, + "description": "A value must be equal to one of the elements in this array value. Only evaluated if the value is not null." + }, + "minLength": { + "type": "integer", + "description": "A value must greater than, or equal to, the value of this. Only applies to string types." + }, + "maxLength": { + "type": "integer", + "description": "A value must less than, or equal to, the value of this. Only applies to string types." + }, + "format": { + "type": "string", + "description": "A specific format the value must comply with (e.g., 'email', 'uri', 'uuid').", + "examples": [ + "email", + "uri", + "uuid" + ] + }, + "precision": { + "type": "number", + "examples": [ + 38 + ], + "description": "The maximum number of digits in a number. Only applies to numeric values. Defaults to 38." + }, + "scale": { + "type": "number", + "examples": [ + 0 + ], + "description": "The maximum number of decimal places in a number. Only applies to numeric values. Defaults to 0." + }, + "pattern": { + "type": "string", + "description": "A regular expression the value must match. Only applies to string types.", + "examples": [ + "^[a-zA-Z0-9_-]+$" + ] + }, + "minimum": { + "type": "number", + "description": "A value of a number must greater than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "exclusiveMinimum": { + "type": "number", + "description": "A value of a number must greater than the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "maximum": { + "type": "number", + "description": "A value of a number must less than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "exclusiveMaximum": { + "type": "number", + "description": "A value of a number must less than the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "example": { + "type": "string", + "description": "An example value for this field.", + "deprecationMessage": "Use the examples field instead." + }, + "examples": { + "type": "array", + "description": "A examples value for this field." 
+ }, + "pii": { + "type": "boolean", + "description": "An indication, if this field contains Personal Identifiable Information (PII)." + }, + "classification": { + "type": "string", + "description": "The data class defining the sensitivity level for this field, according to the organization's classification scheme.", + "examples": [ + "sensitive", + "restricted", + "internal", + "public" + ] + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Custom metadata to provide additional context." + }, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } + }, + "$ref": { + "type": "string", + "description": "A reference URI to a definition in the specification, internally or externally. Properties will be inherited from the definition." + }, + "quality": { + "type": "array", + "items": { + "$ref": "#/$defs/Quality" + } + }, + "lineage": { + "$ref": "#/$defs/Lineage" + }, + "config": { + "type": "object", + "description": "Additional metadata for field configuration.", + "additionalProperties": { + "type": [ + "string", + "number", + "boolean", + "object", + "array", + "null" + ] + }, + "properties": { + "avroType": { + "type": "string", + "description": "Specify the field type to use when exporting the data model to Apache Avro." + }, + "avroLogicalType": { + "type": "string", + "description": "Specify the logical field type to use when exporting the data model to Apache Avro." + }, + "bigqueryType": { + "type": "string", + "description": "Specify the physical column type that is used in a BigQuery table, e.g., `NUMERIC(5, 2)`." + }, + "snowflakeType": { + "type": "string", + "description": "Specify the physical column type that is used in a Snowflake table, e.g., `TIMESTAMP_LTZ`." + }, + "redshiftType": { + "type": "string", + "description": "Specify the physical column type that is used in a Redshift table, e.g., `SMALLINT`." + }, + "sqlserverType": { + "type": "string", + "description": "Specify the physical column type that is used in a SQL Server table, e.g., `DATETIME2`." + }, + "databricksType": { + "type": "string", + "description": "Specify the physical column type that is used in a Databricks Unity Catalog table." + }, + "glueType": { + "type": "string", + "description": "Specify the physical column type that is used in an AWS Glue Data Catalog table." + } + } + } + } + } + }, + "primaryKey": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The compound primary key of the model." + }, + "quality": { + "type": "array", + "items": { + "$ref": "#/$defs/Quality" + } + }, + "examples": { + "type": "array" + }, + "additionalFields": { + "type": "boolean", + "description": " Specify, if the model can have additional fields that are not defined in the contract. ", + "default": false + }, + "config": { + "type": "object", + "description": "Additional metadata for model configuration.", + "additionalProperties": { + "type": [ + "string", + "number", + "boolean", + "object", + "array", + "null" + ] + }, + "properties": { + "avroNamespace": { + "type": "string", + "description": "The namespace to use when importing and exporting the data model from / to Apache Avro." 
+ } + } + } + } + } + }, + "definitions": { + "description": "Clear and concise explanations of syntax, semantic, and classification of business objects in a given domain.", + "type": "object", + "propertyNames": { + "pattern": "^[a-zA-Z0-9/_-]+$" + }, + "additionalProperties": { + "type": "object", + "title": "Definition", + "properties": { + "domain": { + "type": "string", + "description": "The domain in which this definition is valid.", + "default": "global", + "deprecationMessage": "This field is deprecated. Encode the domain into the ID using slashes." + }, + "name": { + "type": "string", + "description": "The technical name of this definition.", + "deprecationMessage": "This field is deprecated. Encode the name into the ID using slashes." + }, + "title": { + "type": "string", + "description": "The business name of this definition." + }, + "description": { + "type": "string", + "description": "Clear and concise explanations related to the domain." + }, + "type": { + "$ref": "#/$defs/FieldType" + }, + "fields": { + "description": "The nested fields (e.g. columns) of the object, record, or struct.", + "type": "object", + "additionalProperties": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + } + }, + "items": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "keys": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "values": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "minLength": { + "type": "integer", + "description": "A value must be greater than or equal to this value. Applies only to string types." + }, + "maxLength": { + "type": "integer", + "description": "A value must be less than or equal to this value. Applies only to string types." + }, + "format": { + "type": "string", + "description": "Specific format requirements for the value (e.g., 'email', 'uri', 'uuid')." + }, + "precision": { + "type": "integer", + "examples": [ + 38 + ], + "description": "The maximum number of digits in a number. Only applies to numeric values. Defaults to 38." + }, + "scale": { + "type": "integer", + "examples": [ + 0 + ], + "description": "The maximum number of decimal places in a number. Only applies to numeric values. Defaults to 0." + }, + "pattern": { + "type": "string", + "description": "A regular expression pattern the value must match. Applies only to string types." + }, + "minimum": { + "type": "number", + "description": "A value of a number must greater than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "exclusiveMinimum": { + "type": "number", + "description": "A value of a number must greater than the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "maximum": { + "type": "number", + "description": "A value of a number must less than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "exclusiveMaximum": { + "type": "number", + "description": "A value of a number must less than the value of this. Only evaluated if the value is not null. Only applies to numeric values." + }, + "example": { + "type": "string", + "description": "An example value.", + "deprecationMessage": "Use the examples field instead." + }, + "examples": { + "type": "array", + "description": "Example value." 
+ }, + "pii": { + "type": "boolean", + "description": "Indicates if the field contains Personal Identifiable Information (PII)." + }, + "classification": { + "type": "string", + "description": "The data class defining the sensitivity level for this field." + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Custom metadata to provide additional context." + }, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } + } + }, + "required": [ + "type" + ] + } + }, + "servicelevels": { + "type": "object", + "description": "Specifies the service level agreements for the provided data, including availability, data retention policies, latency requirements, data freshness, update frequency, support availability, and backup policies.", + "properties": { + "availability": { + "type": "object", + "description": "Availability refers to the promise or guarantee by the service provider about the uptime of the system that provides the data.", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the availability service level.", + "example": "The server is available during support hours" + }, + "percentage": { + "type": "string", + "description": "An optional string describing the guaranteed uptime in percent (e.g., `99.9%`)", + "pattern": "^\\d+(\\.\\d+)?%$", + "example": "99.9%" + } + } + }, + "retention": { + "type": "object", + "description": "Retention covers the period how long data will be available.", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the retention service level.", + "example": "Data is retained for one year." + }, + "period": { + "type": "string", + "description": "An optional period of time, how long data is available. Supported formats: Simple duration (e.g., `1 year`, `30d`) and ISO 8601 duration (e.g, `P1Y`).", + "example": "P1Y" + }, + "unlimited": { + "type": "boolean", + "description": "An optional indicator that data is kept forever.", + "example": false + }, + "timestampField": { + "type": "string", + "description": "An optional reference to the field that contains the timestamp that the period refers to.", + "example": "orders.order_timestamp" + } + } + }, + "latency": { + "type": "object", + "description": "Latency refers to the maximum amount of time from the source to its destination.", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the latency service level.", + "example": "Data is available within 25 hours after the order was placed." + }, + "threshold": { + "type": "string", + "description": "An optional maximum duration between the source timestamp and the processed timestamp. 
Supported formats: Simple duration (e.g., `24 hours`, `5s`) and ISO 8601 duration (e.g, `PT24H`).", + "example": "25h" + }, + "sourceTimestampField": { + "type": "string", + "description": "An optional reference to the field that contains the timestamp when the data was provided at the source.", + "example": "orders.order_timestamp" + }, + "processedTimestampField": { + "type": "string", + "description": "An optional reference to the field that contains the processing timestamp, which denotes when the data is made available to consumers of this data contract.", + "example": "orders.processed_timestamp" + } + } + }, + "freshness": { + "type": "object", + "description": "The maximum age of the youngest row in a table.", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the freshness service level.", + "example": "The age of the youngest row in a table is within 25 hours." + }, + "threshold": { + "type": "string", + "description": "An optional maximum age of the youngest entry. Supported formats: Simple duration (e.g., `24 hours`, `5s`) and ISO 8601 duration (e.g., `PT24H`).", + "example": "25h" + }, + "timestampField": { + "type": "string", + "description": "An optional reference to the field that contains the timestamp that the threshold refers to.", + "example": "orders.order_timestamp" + } + } + }, + "frequency": { + "type": "object", + "description": "Frequency describes how often data is updated.", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the frequency service level.", + "example": "Data is delivered once a day." + }, + "type": { + "type": "string", + "enum": [ + "batch", + "micro-batching", + "streaming", + "manual" + ], + "description": "The method of data processing.", + "example": "batch" + }, + "interval": { + "type": "string", + "description": "Optional. Only for batch: How often the pipeline is triggered, e.g., `daily`.", + "example": "daily" + }, + "cron": { + "type": "string", + "description": "Optional. Only for batch: A cron expression when the pipelines is triggered. E.g., `0 0 * * *`.", + "example": "0 0 * * *" + } + } + }, + "support": { + "type": "object", + "description": "Support describes the times when support will be available for contact.", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the support service level.", + "example": "The data is available during typical business hours at headquarters." + }, + "time": { + "type": "string", + "description": "An optional string describing the times when support will be available for contact such as `24/7` or `business hours only`.", + "example": "9am to 5pm in EST on business days" + }, + "responseTime": { + "type": "string", + "description": "An optional string describing the time it takes for the support team to acknowledge a request. This does not mean the issue will be resolved immediately, but it assures users that their request has been received and will be dealt with.", + "example": "24 hours" + } + } + }, + "backup": { + "type": "object", + "description": "Backup specifies details about data backup procedures.", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the backup service level.", + "example": "Data is backed up once a week, every Sunday at 0:00 UTC." 
+ }, + "interval": { + "type": "string", + "description": "An optional interval that defines how often data will be backed up, e.g., `daily`.", + "example": "weekly" + }, + "cron": { + "type": "string", + "description": "An optional cron expression when data will be backed up, e.g., `0 0 * * *`.", + "example": "0 0 * * 0" + }, + "recoveryTime": { + "type": "string", + "description": "An optional Recovery Time Objective (RTO) specifies the maximum amount of time allowed to restore data from a backup after a failure or loss event (e.g., 4 hours, 24 hours).", + "example": "24 hours" + }, + "recoveryPoint": { + "type": "string", + "description": "An optional Recovery Point Objective (RPO) defines the maximum acceptable age of files that must be recovered from backup storage for normal operations to resume after a disaster or data loss event. This essentially measures how much data you can afford to lose, measured in time (e.g., 4 hours, 24 hours).", + "example": "1 week" + } + } + } + } + }, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } + }, + "tags": { + "type": "array", + "items": { + "type": "string", + "description": "Tags to facilitate searching and filtering.", + "examples": [ + "databricks", + "pii", + "sensitive" + ] + }, + "description": "Tags to facilitate searching and filtering." + } + }, + "required": [ + "dataContractSpecification", + "id", + "info" + ], + "$defs": { + "FieldType": { + "type": "string", + "title": "FieldType", + "description": "The logical data type of the field.", + "enum": [ + "number", + "decimal", + "numeric", + "int", + "integer", + "long", + "bigint", + "float", + "double", + "string", + "text", + "varchar", + "boolean", + "timestamp", + "timestamp_tz", + "timestamp_ntz", + "date", + "time", + "array", + "map", + "object", + "record", + "struct", + "bytes", + "variant", + "json", + "null" + ] + }, + "BaseServer": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the servers." + }, + "environment": { + "type": "string", + "description": "The environment in which the servers are running. Examples: prod, sit, stg." + }, + "type": { + "type": "string", + "description": "The type of the data product technology that implements the data contract.", + "examples": [ + "azure", + "bigquery", + "BigQuery", + "clickhouse", + "databricks", + "dataframe", + "glue", + "kafka", + "kinesis", + "local", + "oracle", + "postgres", + "pubsub", + "redshift", + "sftp", + "sqlserver", + "snowflake", + "s3", + "trino" + ] + }, + "roles": { + "description": " An optional array of roles that are available and can be requested to access the server for role-based access control. E.g. separate roles for different regions or sensitive data.", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The name of the role." + }, + "description": { + "type": "string", + "description": "A description of the role and what access the role provides." 
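To make the `servicelevels` definitions above more concrete, here is a minimal, hypothetical `servicelevels` block as it could appear in a data contract YAML. The keys and formats follow the schema; the concrete values (largely taken from the schema's own `example` entries) are purely illustrative:

```yaml
servicelevels:
  availability:
    description: The server is available during support hours
    percentage: 99.9%
  retention:
    period: P1Y            # ISO 8601 duration; simple durations like "1 year" also work
    unlimited: false
    timestampField: orders.order_timestamp
  latency:
    threshold: 25h
    sourceTimestampField: orders.order_timestamp
    processedTimestampField: orders.processed_timestamp
  freshness:
    threshold: 25h
    timestampField: orders.order_timestamp
  frequency:
    type: batch            # one of batch, micro-batching, streaming, manual
    interval: daily
    cron: 0 0 * * *
  support:
    time: 9am to 5pm in EST on business days
    responseTime: 24 hours
  backup:
    interval: weekly
    cron: 0 0 * * 0
    recoveryTime: 24 hours
    recoveryPoint: 1 week
```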
+ } + }, + "required": [ + "name" + ] + } + } + }, + "additionalProperties": true, + "required": [ + "type" + ] + }, + "BigQueryServer": { + "type": "object", + "title": "BigQueryServer", + "properties": { + "project": { + "type": "string", + "description": "The GCP project name." + }, + "dataset": { + "type": "string", + "description": "The GCP dataset name." + } + }, + "required": [ + "project", + "dataset" + ] + }, + "S3Server": { + "type": "object", + "title": "S3Server", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "S3 URL, starting with `s3://`", + "examples": [ + "s3://datacontract-example-orders-latest/data/{model}/*.json" + ] + }, + "endpointUrl": { + "type": "string", + "format": "uri", + "description": "The server endpoint for S3-compatible servers.", + "examples": [ + "https://minio.example.com" + ] + }, + "format": { + "type": "string", + "enum": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "enum": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location" + ] + }, + "SftpServer": { + "type": "object", + "title": "SftpServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "pattern": "^sftp://.*", + "description": "SFTP URL, starting with `sftp://`", + "examples": [ + "sftp://123.123.12.123/{model}/*.json" + ] + }, + "format": { + "type": "string", + "enum": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "enum": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location" + ] + }, + "RedshiftServer": { + "type": "object", + "title": "RedshiftServer", + "properties": { + "account": { + "type": "string", + "description": "The Redshift account." + }, + "host": { + "type": "string", + "description": "An optional string describing the host name." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + }, + "clusterIdentifier": { + "type": "string", + "description": "An optional string describing the cluster's identifier.", + "examples": [ + "redshift-prod-eu", + "analytics-cluster" + ] + }, + "port": { + "type": "integer", + "description": "An optional integer describing the cluster's port.", + "examples": [ + 5439 + ] + }, + "endpoint": { + "type": "string", + "description": "An optional string describing the cluster's endpoint.", + "examples": [ + "analytics-cluster.example.eu-west-1.redshift.amazonaws.com:5439/analytics" + ] + } + }, + "additionalProperties": true, + "required": [ + "account", + "database", + "schema" + ] + }, + "AzureServer": { + "type": "object", + "title": "AzureServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "Path to Azure Blob Storage or Azure Data Lake Storage (ADLS), supports globs.
Recommended pattern is 'abfss://<container_name>/<path>'", + "examples": [ + "abfss://my_container_name/path", + "abfss://my_container_name/path/*.json", + "az://my_storage_account_name.blob.core.windows.net/my_container/path/*.parquet", + "abfss://my_storage_account_name.dfs.core.windows.net/my_container_name/path/*.parquet" + ] + }, + "format": { + "type": "string", + "enum": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "enum": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location", + "format" + ] + }, + "SqlserverServer": { + "type": "object", + "title": "SqlserverServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the database server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the database server.", + "default": 1433, + "examples": [ + 1433 + ] + }, + "database": { + "type": "string", + "description": "The name of the database.", + "examples": [ + "database" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "dbo" + ] + } + }, + "required": [ + "host", + "database", + "schema" + ] + }, + "SnowflakeServer": { + "type": "object", + "title": "SnowflakeServer", + "properties": { + "account": { + "type": "string", + "description": "The Snowflake account used by the server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "account", + "database", + "schema" + ] + }, + "DatabricksServer": { + "type": "object", + "title": "DatabricksServer", + "properties": { + "host": { + "type": "string", + "description": "The Databricks host", + "examples": [ + "dbc-abcdefgh-1234.cloud.databricks.com" + ] + }, + "catalog": { + "type": "string", + "description": "The name of the Hive or Unity catalog" + }, + "schema": { + "type": "string", + "description": "The schema name in the catalog" + } + }, + "required": [ + "catalog", + "schema" + ] + }, + "DataframeServer": { + "type": "object", + "title": "DataframeServer", + "required": [ + "type" + ] + }, + "GlueServer": { + "type": "object", + "title": "GlueServer", + "properties": { + "account": { + "type": "string", + "description": "The AWS Glue account", + "examples": [ + "1234-5678-9012" + ] + }, + "database": { + "type": "string", + "description": "The AWS Glue database name", + "examples": [ + "my_database" + ] + }, + "location": { + "type": "string", + "format": "uri", + "description": "The AWS S3 path. Must be in the form of a URL.", + "examples": [ + "s3://datacontract-example-orders-latest/data/{model}" + ] + }, + "format": { + "type": "string", + "description": "The format of the files", + "examples": [ + "parquet", + "csv", + "json", + "delta" + ] + } + }, + "required": [ + "account", + "database" + ] + }, + "PostgresServer": { + "type": "object", + "title": "PostgresServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the database server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the database server."
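The server definitions above are easier to read next to an example. A hypothetical `servers` section of a data contract (the Data Contract Specification keys servers by a freely chosen name; the account, catalog, and schema values here are invented) might look like this:

```yaml
servers:
  production:
    type: s3
    location: s3://datacontract-example-orders-latest/data/orders/*.json
    format: json
    delimiter: new_line
  warehouse:
    type: snowflake
    account: my-account          # hypothetical Snowflake account
    database: ORDERS_DB
    schema: PUBLIC
  lakehouse:
    type: databricks
    host: dbc-abcdefgh-1234.cloud.databricks.com
    catalog: main                # hypothetical catalog name
    schema: orders
```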
+ }, + "database": { + "type": "string", + "description": "The name of the database.", + "examples": [ + "postgres" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "public" + ] + } + }, + "required": [ + "host", + "port", + "database", + "schema" + ] + }, + "OracleServer": { + "type": "object", + "title": "OracleServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the oracle server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the oracle server.", + "examples": [ + 1523 + ] + }, + "serviceName": { + "type": "string", + "description": "The name of the service.", + "examples": [ + "service" + ] + } + }, + "required": [ + "host", + "port", + "serviceName" + ] + }, + "KafkaServer": { + "type": "object", + "title": "KafkaServer", + "description": "Kafka Server", + "properties": { + "host": { + "type": "string", + "description": "The bootstrap server of the kafka cluster." + }, + "topic": { + "type": "string", + "description": "The topic name." + }, + "format": { + "type": "string", + "description": "The format of the message. Examples: json, avro, protobuf.", + "default": "json" + } + }, + "required": [ + "host", + "topic" + ] + }, + "PubSubServer": { + "type": "object", + "title": "PubSubServer", + "properties": { + "project": { + "type": "string", + "description": "The GCP project name." + }, + "topic": { + "type": "string", + "description": "The topic name." + } + }, + "required": [ + "project", + "topic" + ] + }, + "KinesisDataStreamsServer": { + "type": "object", + "title": "KinesisDataStreamsServer", + "description": "Kinesis Data Streams Server", + "properties": { + "stream": { + "type": "string", + "description": "The name of the Kinesis data stream." + }, + "region": { + "type": "string", + "description": "AWS region.", + "examples": [ + "eu-west-1" + ] + }, + "format": { + "type": "string", + "description": "The format of the record", + "examples": [ + "json", + "avro", + "protobuf" + ] + } + }, + "required": [ + "stream" + ] + }, + "TrinoServer": { + "type": "object", + "title": "TrinoServer", + "properties": { + "host": { + "type": "string", + "description": "The Trino host URL.", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The Trino port." + }, + "catalog": { + "type": "string", + "description": "The name of the catalog.", + "examples": [ + "hive" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "my_schema" + ] + } + }, + "required": [ + "host", + "port", + "catalog", + "schema" + ] + }, + "ClickhouseServer": { + "type": "object", + "title": "ClickhouseServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the database server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the database server." 
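Along the same lines, a hypothetical Kafka and Postgres pair of servers could look like the sketch below; only `host` and `topic` are required for Kafka, and `host`, `port`, `database`, and `schema` for Postgres. The bootstrap server, topic name, and port are invented values:

```yaml
servers:
  events:
    type: kafka
    host: broker.example.com:9092    # hypothetical bootstrap server
    topic: orders-events             # hypothetical topic name
    format: avro
  operational:
    type: postgres
    host: localhost
    port: 5432                       # typical Postgres port; not a schema default
    database: postgres
    schema: public
```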
+ }, + "database": { + "type": "string", + "description": "The name of the database.", + "examples": [ + "postgres" + ] + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "LocalServer": { + "type": "object", + "title": "LocalServer", + "properties": { + "path": { + "type": "string", + "description": "The relative or absolute path to the data file(s).", + "examples": [ + "./folder/data.parquet", + "./folder/*.parquet" + ] + }, + "format": { + "type": "string", + "description": "The format of the file(s)", + "examples": [ + "json", + "parquet", + "delta", + "csv" + ] + } + }, + "required": [ + "path", + "format" + ] + }, + "Quality": { + "allOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "The type of quality check", + "enum": [ + "text", + "library", + "sql", + "custom" + ] + }, + "description": { + "type": "string", + "description": "A plain text describing the quality attribute in natural language." + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "text" + } + } + }, + "then": { + "required": [ + "description" + ] + } + }, + { + "if": { + "properties": { + "type": { + "const": "sql" + } + } + }, + "then": { + "properties": { + "query": { + "type": "string", + "description": "A SQL query that returns a single number to compare with the threshold." + }, + "dialect": { + "type": "string", + "description": "The SQL dialect that is used for the query. Should be compatible to the server.type.", + "examples": [ + "athena", + "bigquery", + "redshift", + "snowflake", + "trino", + "postgres", + "oracle" + ] + }, + "mustBe": { + "type": "number" + }, + "mustNotBe": { + "type": "number" + }, + "mustBeGreaterThan": { + "type": "number" + }, + "mustBeGreaterOrEqualTo": { + "type": "number" + }, + "mustBeGreaterThanOrEqualTo": { + "type": "number", + "deprecated": true + }, + "mustBeLessThan": { + "type": "number" + }, + "mustBeLessThanOrEqualTo": { + "type": "number", + "deprecated": true + }, + "mustBeLessOrEqualTo": { + "type": "number" + }, + "mustBeBetween": { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 2, + "maxItems": 2 + }, + "mustNotBeBetween": { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 2, + "maxItems": 2 + } + }, + "required": [ + "query" + ] + } + }, + { + "if": { + "anyOf": [ + { + "properties": { + "type": { + "const": "library" + } + } + }, + { + "properties": { + "metric": { + "type": "string" + } + }, + "required": ["metric"] + } + ] + }, + "then": { + "properties": { + "metric": { + "type": "string", + "description": "The DataQualityLibrary metric to use for the quality check.", + "enum": ["nullValues", "missingValues", "invalidValues", "duplicateValues", "rowCount"] + }, + "rule": { + "type": "string", + "deprecated": true, + "description": "Deprecated. Use metric instead" + }, + "arguments": { + "type": "object", + "description": "Additional metric-specific parameters for the quality check.", + "additionalProperties": { + "type": ["string", "number", "boolean", "array", "object"] + } + }, + "mustBe": { + "description": "Must be equal to the value to be valid. When using numbers, it is equivalent to '='." + }, + "mustNotBe": { + "description": "Must not be equal to the value to be valid. When using numbers, it is equivalent to '!='." + }, + "mustBeGreaterThan": { + "type": "number", + "description": "Must be greater than the value to be valid. It is equivalent to '>'." 
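The `Quality` definition above supports, among others, SQL-based and library-based checks. A hypothetical `quality` list (typically attached to a model or field in the Data Contract Specification; the table and column names are invented) could combine both:

```yaml
quality:
  - type: sql
    description: The overall number of orders stays within a plausible range
    query: |
      SELECT COUNT(*) FROM orders
    dialect: snowflake
    mustBeGreaterThan: 0
    mustBeLessThan: 1000000
  - type: library
    metric: duplicateValues
    description: order_id must not contain duplicates
    mustBe: 0
```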
+ }, + "mustBeGreaterOrEqualTo": { + "type": "number", + "description": "Must be greater than or equal to the value to be valid. It is equivalent to '>='." + }, + "mustBeLessThan": { + "type": "number", + "description": "Must be less than the value to be valid. It is equivalent to '<'." + }, + "mustBeLessOrEqualTo": { + "type": "number", + "description": "Must be less than or equal to the value to be valid. It is equivalent to '<='." + }, + "mustBeBetween": { + "type": "array", + "description": "Must be between the two numbers to be valid. Smallest number first in the array.", + "minItems": 2, + "maxItems": 2, + "uniqueItems": true, + "items": { + "type": "number" + } + }, + "mustNotBeBetween": { + "type": "array", + "description": "Must not be between the two numbers to be valid. Smallest number first in the array.", + "minItems": 2, + "maxItems": 2, + "uniqueItems": true, + "items": { + "type": "number" + } + } + }, + "required": [ + "metric" + ] + } + }, + { + "if": { + "properties": { + "type": { + "const": "custom" + } + } + }, + "then": { + "properties": { + "description": { + "type": "string", + "description": "A plain text describing the quality attribute in natural language." + }, + "engine": { + "type": "string", + "examples": [ + "soda", + "great-expectations" + ], + "description": "The engine used for custom quality checks." + }, + "implementation": { + "type": [ + "object", + "array", + "string" + ], + "description": "Engine-specific quality checks and expectations." + } + }, + "required": [ + "engine" + ] + } + } + ] + }, + "Lineage": { + "type": "object", + "properties": { + "inputFields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "namespace": { + "type": "string", + "description": "The input dataset namespace" + }, + "name": { + "type": "string", + "description": "The input dataset name" + }, + "field": { + "type": "string", + "description": "The input field" + }, + "transformations": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "description": "The type of the transformation. Allowed values are: DIRECT, INDIRECT", + "type": "string" + }, + "subtype": { + "type": "string", + "description": "The subtype of the transformation" + }, + "description": { + "type": "string", + "description": "a string representation of the transformation applied" + }, + "masking": { + "type": "boolean", + "description": "is transformation masking the data or not" + } + }, + "required": [ + "type" + ], + "additionalProperties": true + } + } + }, + "additionalProperties": true, + "required": [ + "namespace", + "name", + "field" + ] + } + }, + "transformationDescription": { + "type": "string", + "description": "a string representation of the transformation applied", + "deprecated": true + }, + "transformationType": { + "type": "string", + "description": "IDENTITY|MASKED reflects a clearly defined behavior. 
IDENTITY: exact same as input; MASKED: no original data available (like a hash of PII for example)", + "deprecated": true + } + }, + "additionalProperties": true, + "required": [ + "inputFields" + ] + } + } +} diff --git a/datacontract/schemas/download b/datacontract/schemas/download index b152c7693..7b740e0fb 100644 --- a/datacontract/schemas/download +++ b/datacontract/schemas/download @@ -1,7 +1,7 @@ #!/bin/bash set -e -curl -o datacontract-1.2.0.init.yaml https://datacontract.com/datacontract.init.yaml -curl -o datacontract-1.2.0.schema.json https://datacontract.com/datacontract.schema.json -curl -o odcs-3.0.1.schema.json https://raw.githubusercontent.com/bitol-io/open-data-contract-standard/refs/heads/main/schema/odcs-json-schema-v3.0.1.json +curl -o datacontract-1.2.1.init.yaml https://datacontract.com/datacontract.init.yaml +curl -o datacontract-1.2.1.schema.json https://datacontract.com/datacontract.schema.json +curl -o odcs-3.0.2.schema.json https://raw.githubusercontent.com/bitol-io/open-data-contract-standard/refs/heads/main/schema/odcs-json-schema-v3.0.2.json diff --git a/datacontract/schemas/odcs-3.0.2.schema.json b/datacontract/schemas/odcs-3.0.2.schema.json new file mode 100644 index 000000000..cb9bfb9d8 --- /dev/null +++ b/datacontract/schemas/odcs-3.0.2.schema.json @@ -0,0 +1,2382 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "title": "Open Data Contract Standard (ODCS)", + "description": "An open data contract specification to establish agreement between data producers and consumers.", + "type": "object", + "properties": { + "version": { + "type": "string", + "description": "Current version of the data contract." + }, + "kind": { + "type": "string", + "default": "DataContract", + "description": "The kind of file this is. Valid value is `DataContract`.", + "enum": ["DataContract"] + }, + "apiVersion": { + "type": "string", + "default": "v3.0.2", + "description": "Version of the standard used to build data contract. Default value is v3.0.2.", + "enum": ["v3.0.2","v3.0.1", "v3.0.0", "v2.2.2", "v2.2.1", "v2.2.0"] + }, + "id": { + "type": "string", + "description": "A unique identifier used to reduce the risk of dataset name collisions, such as a UUID." + }, + "name": { + "type": "string", + "description": "Name of the data contract." + }, + "tenant": { + "type": "string", + "description": "Indicates the property the data is primarily associated with. Value is case insensitive." + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "status": { + "type": "string", + "description": "Current status of the dataset.", + "examples": [ + "proposed", "draft", "active", "deprecated", "retired" + ] + }, + "servers": { + "type": "array", + "description": "List of servers where the datasets reside.", + "items": { + "$ref": "#/$defs/Server" + } + }, + "dataProduct": { + "type": "string", + "description": "The name of the data product." + }, + "description": { + "type": "object", + "description": "High level description of the dataset.", + "properties": { + "usage": { + "type": "string", + "description": "Intended usage of the dataset." + }, + "purpose": { + "type": "string", + "description": "Purpose of the dataset." + }, + "limitations": { + "type": "string", + "description": "Limitations of the dataset." 
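The `Lineage` object defined just above (the last definition of the Data Contract Specification schema in this diff) can be filled in roughly as follows; the namespace, dataset, and field names are hypothetical, and the transformation `type` must be `DIRECT` or `INDIRECT`:

```yaml
lineage:
  inputFields:
    - namespace: checkout          # hypothetical upstream namespace
      name: raw_orders             # hypothetical upstream dataset
      field: order_id
      transformations:
        - type: DIRECT
          subtype: IDENTITY
          description: copied unchanged from the source field
          masking: false
```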
+ }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + } + }, + "domain": { + "type": "string", + "description": "Name of the logical data domain.", + "examples": ["imdb_ds_aggregate", "receiver_profile_out", "transaction_profile_out"] + }, + "schema": { + "type": "array", + "description": "A list of elements within the schema to be cataloged.", + "items": { + "$ref": "#/$defs/SchemaObject" + } + }, + "support": { + "$ref": "#/$defs/Support" + }, + "price": { + "$ref": "#/$defs/Pricing" + }, + "team": { + "type": "array", + "items": { + "$ref": "#/$defs/Team" + } + }, + "roles": { + "type": "array", + "description": "A list of roles that will provide user access to the dataset.", + "items": { + "$ref": "#/$defs/Role" + } + }, + "slaDefaultElement": { + "type": "string", + "description": "Element (using the element path notation) to do the checks on." + }, + "slaProperties": { + "type": "array", + "description": "A list of key/value pairs for SLA specific properties. There is no limit on the type of properties (more details to come).", + "items": { + "$ref": "#/$defs/ServiceLevelAgreementProperty" + } + }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + }, + "contractCreatedTs": { + "type": "string", + "format": "date-time", + "description": "Timestamp in UTC of when the data contract was created." + } + }, + "required": ["version", "apiVersion", "kind", "id", "status"], + "additionalProperties": false, + "$defs": { + "Server": { + "type": "object", + "description": "Data source details of where data is physically stored.", + "properties": { + "server": { + "type": "string", + "description": "Identifier of the server." + }, + "type": { + "type": "string", + "description": "Type of the server.", + "enum": [ + "api", "athena", "azure", "bigquery", "clickhouse", "databricks", "denodo", "dremio", + "duckdb", "glue", "cloudsql", "db2", "informix", "kafka", "kinesis", "local", + "mysql", "oracle", "postgresql", "postgres", "presto", "pubsub", + "redshift", "s3", "sftp", "snowflake", "sqlserver", "synapse", "trino", "vertica", "custom" + ] + }, + "description": { + "type": "string", + "description": "Description of the server." 
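For the Open Data Contract Standard schema that begins above, a minimal top-level document satisfying the required fields (`version`, `apiVersion`, `kind`, `id`, `status`) could look like the following sketch; the id, domain, and tenant values are invented:

```yaml
apiVersion: v3.0.2
kind: DataContract
id: 53581432-6c55-4ba2-a65f-72344a91553a   # hypothetical identifier
name: orders
version: 1.0.0
status: active
domain: sales                              # hypothetical domain
dataProduct: orders
tenant: RetailCorp                         # hypothetical tenant
description:
  usage: Analytics and reporting on orders
  purpose: Provide curated order data to downstream consumers
  limitations: Not suitable for real-time use cases
```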
+ }, + "environment": { + "type": "string", + "description": "Environment of the server.", + "examples": ["prod", "preprod", "dev", "uat"] + }, + "roles": { + "type": "array", + "description": "List of roles that have access to the server.", + "items": { + "$ref": "#/$defs/Role" + } + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + }, + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "api" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/ApiServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "athena" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/AthenaServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "azure" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/AzureServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "bigquery" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/BigQueryServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "clickhouse" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/ClickHouseServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "databricks" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DatabricksServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "denodo" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DenodoServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "dremio" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DremioServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "duckdb" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DuckdbServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "glue" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/GlueServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "cloudsql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/GoogleCloudSqlServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "db2" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/IBMDB2Server" + } + }, + { + "if": { + "properties": { + "type": { + "const": "informix" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/InformixServer" + } + }, + + { + "if": { + "properties": { + "type": { + "const": "custom" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/CustomServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "kafka" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/KafkaServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "kinesis" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/KinesisServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "local" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/LocalServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "mysql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/MySqlServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "oracle" + } + }, + "required": ["type"] + }, + "then": { + "$ref": 
"#/$defs/ServerSource/OracleServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "postgresql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PostgresServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "postgres" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PostgresServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "presto" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PrestoServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "pubsub" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PubSubServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "redshift" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/RedshiftServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "s3" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/S3Server" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sftp" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SftpServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "snowflake" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SnowflakeServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sqlserver" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SqlserverServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "synapse" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SynapseServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "trino" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/TrinoServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "vertica" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/VerticaServer" + } + } + ], + "required": ["server", "type"] + }, + "ServerSource": { + "ApiServer": { + "type": "object", + "title": "AthenaServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "The url to the API.", + "examples": [ + "https://api.example.com/v1" + ] + } + }, + "required": [ + "location" + ] + }, + "AthenaServer": { + "type": "object", + "title": "AthenaServer", + "properties": { + "stagingDir": { + "type": "string", + "format": "uri", + "description": "Amazon Athena automatically stores query results and metadata information for each query that runs in a query result location that you can specify in Amazon S3.", + "examples": [ + "s3://my_storage_account_name/my_container/path" + ] + }, + "schema": { + "type": "string", + "description": "Identify the schema in the data source in which your tables exist." 
+ }, + "catalog": { + "type": "string", + "description": "Identify the name of the Data Source, also referred to as a Catalog.", + "default": "awsdatacatalog" + }, + "regionName": { + "type": "string", + "description": "The region your AWS account uses.", + "examples": ["eu-west-1"] + } + }, + "required": [ + "stagingDir", + "schema" + ] + }, + "AzureServer": { + "type": "object", + "title": "AzureServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "Fully qualified path to Azure Blob Storage or Azure Data Lake Storage (ADLS), supports globs.", + "examples": [ + "az://my_storage_account_name.blob.core.windows.net/my_container/path/*.parquet", + "abfss://my_storage_account_name.dfs.core.windows.net/my_container_name/path/*.parquet" + ] + }, + "format": { + "type": "string", + "enum": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "enum": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location", + "format" + ] + }, + "BigQueryServer": { + "type": "object", + "title": "BigQueryServer", + "properties": { + "project": { + "type": "string", + "description": "The GCP project name." + }, + "dataset": { + "type": "string", + "description": "The GCP dataset name." + } + }, + "required": [ + "project", + "dataset" + ] + }, + "ClickHouseServer": { + "type": "object", + "title": "ClickHouseServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the ClickHouse server." + }, + "port": { + "type": "integer", + "description": "The port to the ClickHouse server." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "DatabricksServer": { + "type": "object", + "title": "DatabricksServer", + "properties": { + "host": { + "type": "string", + "description": "The Databricks host", + "examples": [ + "dbc-abcdefgh-1234.cloud.databricks.com" + ] + }, + "catalog": { + "type": "string", + "description": "The name of the Hive or Unity catalog" + }, + "schema": { + "type": "string", + "description": "The schema name in the catalog" + } + }, + "required": [ + "catalog", + "schema" + ] + }, + "DenodoServer": { + "type": "object", + "title": "DenodoServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Denodo server." + }, + "port": { + "type": "integer", + "description": "The port of the Denodo server." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port" + ] + }, + "DremioServer": { + "type": "object", + "title": "DremioServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Dremio server." + }, + "port": { + "type": "integer", + "description": "The port of the Dremio server." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "host", + "port" + ] + }, + "DuckdbServer": { + "type": "object", + "title": "DuckdbServer", + "properties": { + "database": { + "type": "string", + "description": "Path to duckdb database file." + }, + "schema": { + "type": "integer", + "description": "The name of the schema." 
+ } + }, + "required": [ + "database" + ] + }, + "GlueServer": { + "type": "object", + "title": "GlueServer", + "properties": { + "account": { + "type": "string", + "description": "The AWS Glue account", + "examples": [ + "1234-5678-9012" + ] + }, + "database": { + "type": "string", + "description": "The AWS Glue database name", + "examples": [ + "my_database" + ] + }, + "location": { + "type": "string", + "format": "uri", + "description": "The AWS S3 path. Must be in the form of a URL.", + "examples": [ + "s3://datacontract-example-orders-latest/data/{model}" + ] + }, + "format": { + "type": "string", + "description": "The format of the files", + "examples": [ + "parquet", + "csv", + "json", + "delta" + ] + } + }, + "required": [ + "account", + "database" + ] + }, + "GoogleCloudSqlServer": { + "type": "object", + "title": "GoogleCloudSqlServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Google Cloud Sql server." + }, + "port": { + "type": "integer", + "description": "The port of the Google Cloud Sql server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "host", + "port", + "database", + "schema" + ] + }, + "IBMDB2Server": { + "type": "object", + "title": "IBMDB2Server", + "properties": { + "host": { + "type": "string", + "description": "The host of the IBM DB2 server." + }, + "port": { + "type": "integer", + "description": "The port of the IBM DB2 server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "InformixServer": { + "type": "object", + "title": "InformixServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Informix server. " + }, + "port": { + "type": "integer", + "description": "The port to the Informix server. Defaults to 9088." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "database" + ] + }, + "CustomServer": { + "type": "object", + "title": "CustomServer", + "properties": { + "account": { + "type": "string", + "description": "Account used by the server." + }, + "catalog": { + "type": "string", + "description": "Name of the catalog." + }, + "database": { + "type": "string", + "description": "Name of the database." + }, + "dataset": { + "type": "string", + "description": "Name of the dataset." + }, + "delimiter": { + "type": "string", + "description": "Delimiter." + }, + "endpointUrl": { + "type": "string", + "description": "Server endpoint.", + "format": "uri" + }, + "format": { + "type": "string", + "description": "File format." + }, + "host": { + "type": "string", + "description": "Host name or IP address." + }, + "location": { + "type": "string", + "description": "A URL to a location.", + "format": "uri" + }, + "path": { + "type": "string", + "description": "Relative or absolute path to the data file(s)." + }, + "port": { + "type": "integer", + "description": "Port to the server. No default value is assumed for custom servers." + }, + "project": { + "type": "string", + "description": "Project name." + }, + "region": { + "type": "string", + "description": "Cloud region." + }, + "regionName": { + "type": "string", + "description": "Region name." 
+ }, + "schema": { + "type": "string", + "description": "Name of the schema." + }, + "serviceName": { + "type": "string", + "description": "Name of the service." + }, + "stagingDir": { + "type": "string", + "description": "Staging directory." + }, + "warehouse": { + "type": "string", + "description": "Name of the cluster or warehouse." + } + } + }, + "KafkaServer": { + "type": "object", + "title": "KafkaServer", + "description": "Kafka Server", + "properties": { + "host": { + "type": "string", + "description": "The bootstrap server of the kafka cluster." + }, + "format": { + "type": "string", + "description": "The format of the messages.", + "examples": ["json", "avro", "protobuf", "xml"], + "default": "json" + } + }, + "required": [ + "host" + ] + }, + "KinesisServer": { + "type": "object", + "title": "KinesisDataStreamsServer", + "description": "Kinesis Data Streams Server", + "properties": { + "region": { + "type": "string", + "description": "AWS region.", + "examples": [ + "eu-west-1" + ] + }, + "format": { + "type": "string", + "description": "The format of the record", + "examples": [ + "json", + "avro", + "protobuf" + ] + } + } + }, + "LocalServer": { + "type": "object", + "title": "LocalServer", + "properties": { + "path": { + "type": "string", + "description": "The relative or absolute path to the data file(s).", + "examples": [ + "./folder/data.parquet", + "./folder/*.parquet" + ] + }, + "format": { + "type": "string", + "description": "The format of the file(s)", + "examples": [ + "json", + "parquet", + "delta", + "csv" + ] + } + }, + "required": [ + "path", + "format" + ] + }, + "MySqlServer": { + "type": "object", + "title": "MySqlServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the MySql server." + }, + "port": { + "type": "integer", + "description": "The port of the MySql server." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "OracleServer": { + "type": "object", + "title": "OracleServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the oracle server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the oracle server.", + "examples": [ + 1523 + ] + }, + "serviceName": { + "type": "string", + "description": "The name of the service.", + "examples": [ + "service" + ] + } + }, + "required": [ + "host", + "port", + "serviceName" + ] + }, + "PostgresServer": { + "type": "object", + "title": "PostgresServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Postgres server" + }, + "port": { + "type": "integer", + "description": "The port to the Postgres server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database." 
+ } + }, + "required": [ + "host", + "port", + "database", + "schema" + ] + }, + "PrestoServer": { + "type": "object", + "title": "PrestoServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Presto server", + "examples": [ + "localhost:8080" + ] + }, + "catalog": { + "type": "string", + "description": "The name of the catalog.", + "examples": [ + "postgres" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema.", + "examples": [ + "public" + ] + } + }, + "required": [ + "host" + ] + }, + "PubSubServer": { + "type": "object", + "title": "PubSubServer", + "properties": { + "project": { + "type": "string", + "description": "The GCP project name." + } + }, + "required": [ + "project" + ] + }, + "RedshiftServer": { + "type": "object", + "title": "RedshiftServer", + "properties": { + "host": { + "type": "string", + "description": "An optional string describing the server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + }, + "region": { + "type": "string", + "description": "AWS region of Redshift server.", + "examples": ["us-east-1"] + }, + "account": { + "type": "string", + "description": "The account used by the server." + } + }, + "required": [ + "database", + "schema" + ] + }, + "S3Server": { + "type": "object", + "title": "S3Server", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "S3 URL, starting with `s3://`", + "examples": [ + "s3://datacontract-example-orders-latest/data/{model}/*.json" + ] + }, + "endpointUrl": { + "type": "string", + "format": "uri", + "description": "The server endpoint for S3-compatible servers.", + "examples": ["https://minio.example.com"] + }, + "format": { + "type": "string", + "enum": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "enum": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location" + ] + }, + "SftpServer": { + "type": "object", + "title": "SftpServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "pattern": "^sftp://.*", + "description": "SFTP URL, starting with `sftp://`", + "examples": [ + "sftp://123.123.12.123/{model}/*.json" + ] + }, + "format": { + "type": "string", + "enum": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "enum": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location" + ] + }, + "SnowflakeServer": { + "type": "object", + "title": "SnowflakeServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Snowflake server" + }, + "port": { + "type": "integer", + "description": "The port to the Snowflake server." + }, + "account": { + "type": "string", + "description": "The Snowflake account used by the server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + }, + "warehouse": { + "type": "string", + "description": "The name of the cluster of resources that is a Snowflake virtual warehouse." 
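In ODCS, `servers` is a list of objects that always carry a `server` identifier and a `type`, plus the type-specific fields defined above. A hypothetical pair of entries (identifiers and connection details are invented):

```yaml
servers:
  - server: my-postgres              # hypothetical identifier
    type: postgres
    environment: prod
    host: localhost
    port: 5432
    database: postgres
    schema: public
  - server: my-datalake              # hypothetical identifier
    type: s3
    location: s3://datacontract-example-orders-latest/data/orders/*.json
    format: json
    delimiter: new_line
```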
+ } + }, + "required": [ + "account", + "database", + "schema" + ] + }, + "SqlserverServer": { + "type": "object", + "title": "SqlserverServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the database server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the database server.", + "default": 1433, + "examples": [ + 1433 + ] + }, + "database": { + "type": "string", + "description": "The name of the database.", + "examples": [ + "database" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "dbo" + ] + } + }, + "required": [ + "host", + "database", + "schema" + ] + }, + "SynapseServer": { + "type": "object", + "title": "SynapseServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Synapse server." + }, + "port": { + "type": "integer", + "description": "The port of the Synapse server." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "TrinoServer": { + "type": "object", + "title": "TrinoServer", + "properties": { + "host": { + "type": "string", + "description": "The Trino host URL.", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The Trino port." + }, + "catalog": { + "type": "string", + "description": "The name of the catalog.", + "examples": [ + "hive" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "my_schema" + ] + } + }, + "required": [ + "host", + "port", + "catalog", + "schema" + ] + }, + "VerticaServer": { + "type": "object", + "title": "VerticaServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Vertica server." + }, + "port": { + "type": "integer", + "description": "The port of the Vertica server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "host", + "port", + "database", + "schema" + ] + } + }, + "SchemaElement": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Name of the element." + }, + "physicalType": { + "type": "string", + "description": "The physical element data type in the data source.", + "examples": ["table", "view", "topic", "file"] + }, + "description": { + "type": "string", + "description": "Description of the element." + }, + "businessName": { + "type": "string", + "description": "The business name of the element." 
+ }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + } + }, + "SchemaObject": { + "type": "object", + "properties": { + "logicalType": { + "type": "string", + "description": "The logical element data type.", + "enum": ["object"] + }, + "physicalName": { + "type": "string", + "description": "Physical name.", + "examples": ["table_1_2_0"] + }, + "dataGranularityDescription": { + "type": "string", + "description": "Granular level of the data in the object.", + "examples": ["Aggregation by country"] + }, + "properties": { + "type": "array", + "description": "A list of properties for the object.", + "items": { + "$ref": "#/$defs/SchemaProperty" + } + }, + "quality": { + "$ref": "#/$defs/DataQualityChecks" + } + }, + "allOf": [ + { + "$ref": "#/$defs/SchemaElement" + } + ], + "required": ["name"], + "unevaluatedProperties": false + }, + "SchemaBaseProperty": { + "type": "object", + "properties": { + "primaryKey": { + "type": "boolean", + "description": "Boolean value specifying whether the element is primary or not. Default is false." + }, + "primaryKeyPosition": { + "type": "integer", + "default": -1, + "description": "If element is a primary key, the position of the primary key element. Starts from 1. Example of `account_id, name` being primary key columns, `account_id` has primaryKeyPosition 1 and `name` primaryKeyPosition 2. Default to -1." + }, + "logicalType": { + "type": "string", + "description": "The logical element data type.", + "enum": ["string", "date", "number", "integer", "object", "array", "boolean"] + }, + "logicalTypeOptions": { + "type": "object", + "description": "Additional optional metadata to describe the logical type." + }, + "physicalType": { + "type": "string", + "description": "The physical element data type in the data source. For example, VARCHAR(2), DOUBLE, INT." + }, + "physicalName": { + "type": "string", + "description": "Physical name.", + "examples": ["col_str_a"] + }, + "required": { + "type": "boolean", + "default": false, + "description": "Indicates if the element may contain Null values; possible values are true and false. Default is false." + }, + "unique": { + "type": "boolean", + "default": false, + "description": "Indicates if the element contains unique values; possible values are true and false. Default is false." + }, + "partitioned": { + "type": "boolean", + "default": false, + "description": "Indicates if the element is partitioned; possible values are true and false." + }, + "partitionKeyPosition": { + "type": "integer", + "default": -1, + "description": "If element is used for partitioning, the position of the partition element. Starts from 1. Example of `country, year` being partition columns, `country` has partitionKeyPosition 1 and `year` partitionKeyPosition 2. Default to -1." + }, + "classification": { + "type": "string", + "description": "Can be anything, like confidential, restricted, and public to more advanced categorization. Some companies like PayPal, use data classification indicating the class of data in the element; expected values are 1, 2, 3, 4, or 5.", + "examples": ["confidential", "restricted", "public"] + }, + "encryptedName": { + "type": "string", + "description": "The element name within the dataset that contains the encrypted element value. For example, unencrypted element `email_address` might have an encryptedName of `email_address_encrypt`." 
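Putting the `SchemaObject`, `SchemaElement`, and `SchemaBaseProperty` definitions above together, a hypothetical ODCS `schema` section (table and column names are invented) might read:

```yaml
schema:
  - name: orders
    logicalType: object
    physicalType: table
    physicalName: orders_v1          # hypothetical physical name
    dataGranularityDescription: One row per order
    properties:
      - name: order_id
        businessName: Order ID
        logicalType: string
        physicalType: VARCHAR(36)
        primaryKey: true
        primaryKeyPosition: 1
        required: true
        unique: true
        classification: restricted
      - name: order_timestamp
        logicalType: date
        physicalType: TIMESTAMP
        required: true
        partitioned: true
        partitionKeyPosition: 1
```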
+ }, + "transformSourceObjects": { + "type": "array", + "description": "List of objects in the data source used in the transformation.", + "items": { + "type": "string" + } + }, + "transformLogic": { + "type": "string", + "description": "Logic used in the element transformation." + }, + "transformDescription": { + "type": "string", + "description": "Describes the transform logic in very simple terms." + }, + "examples": { + "type": "array", + "description": "List of sample element values.", + "items": { + "$ref": "#/$defs/AnyType" + } + }, + "criticalDataElement": { + "type": "boolean", + "default": false, + "description": "True or false indicator; If element is considered a critical data element (CDE) then true else false." + }, + "quality": { + "$ref": "#/$defs/DataQualityChecks" + } + }, + "allOf": [ + { + "$ref": "#/$defs/SchemaElement" + }, + { + "if": { + "properties": { + "logicalType": { + "const": "string" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "properties": { + "minLength": { + "type": "integer", + "minimum": 0, + "description": "Minimum length of the string." + }, + "maxLength": { + "type": "integer", + "minimum": 0, + "description": "Maximum length of the string." + }, + "pattern": { + "type": "string", + "description": "Regular expression pattern to define valid value. Follows regular expression syntax from ECMA-262 (https://262.ecma-international.org/5.1/#sec-15.10.1)." + }, + "format": { + "type": "string", + "examples": ["password", "byte", "binary", "email", "uuid", "uri", "hostname", "ipv4", "ipv6"], + "description": "Provides extra context about what format the string follows." + } + }, + "additionalProperties": false + } + } + } + }, + { + "if": { + "properties": { + "logicalType": { + "const": "date" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "properties": { + "format": { + "type": "string", + "examples": ["yyyy-MM-dd", "yyyy-MM-dd HH:mm:ss", "HH:mm:ss"], + "description": "Format of the date. Follows the format as prescribed by [JDK DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). For example, format 'yyyy-MM-dd'." + }, + "exclusiveMaximum": { + "type": "boolean", + "default": false, + "description": "If set to true, all values are strictly less than the maximum value (values < maximum). Otherwise, less than or equal to the maximum value (values <= maximum)." + }, + "maximum": { + "type": "string", + "description": "All date values are less than or equal to this value (values <= maximum)." + }, + "exclusiveMinimum": { + "type": "boolean", + "default": false, + "description": "If set to true, all values are strictly greater than the minimum value (values > minimum). Otherwise, greater than or equal to the minimum value (values >= minimum)." + }, + "minimum": { + "type": "string", + "description": "All date values are greater than or equal to this value (values >= minimum)." + } + }, + "additionalProperties": false + } + } + } + }, + { + "if": { + "anyOf": [ + { + "properties": { + "logicalType": { + "const": "integer" + } + } + } + ] + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "properties": { + "multipleOf": { + "type": "number", + "exclusiveMinimum": 0, + "description": "Values must be multiples of this number. For example, multiple of 5 has valid values 0, 5, 10, -5." 
+ }, + "maximum": { + "type": "number", + "description": "All values are less than or equal to this value (values <= maximum)." + }, + "exclusiveMaximum": { + "type": "boolean", + "default": false, + "description": "If set to true, all values are strictly less than the maximum value (values < maximum). Otherwise, less than or equal to the maximum value (values <= maximum)." + }, + "minimum": { + "type": "number", + "description": "All values are greater than or equal to this value (values >= minimum)." + }, + "exclusiveMinimum": { + "type": "boolean", + "default": false, + "description": "If set to true, all values are strictly greater than the minimum value (values > minimum). Otherwise, greater than or equal to the minimum value (values >= minimum)." + }, + "format": { + "type": "string", + "default": "i32", + "description": "Format of the value in terms of how many bits of space it can use and whether it is signed or unsigned (follows the Rust integer types).", + "enum": ["i8", "i16", "i32", "i64", "i128", "u8", "u16", "u32", "u64", "u128"] + } + }, + "additionalProperties": false + } + } + } + }, + { + "if": { + "anyOf": [ + { + "properties": { + "logicalType": { + "const": "number" + } + } + } + ] + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "properties": { + "multipleOf": { + "type": "number", + "exclusiveMinimum": 0, + "description": "Values must be multiples of this number. For example, multiple of 5 has valid values 0, 5, 10, -5." + }, + "maximum": { + "type": "number", + "description": "All values are less than or equal to this value (values <= maximum)." + }, + "exclusiveMaximum": { + "type": "boolean", + "default": false, + "description": "If set to true, all values are strictly less than the maximum value (values < maximum). Otherwise, less than or equal to the maximum value (values <= maximum)." + }, + "minimum": { + "type": "number", + "description": "All values are greater than or equal to this value (values >= minimum)." + }, + "exclusiveMinimum": { + "type": "boolean", + "default": false, + "description": "If set to true, all values are strictly greater than the minimum value (values > minimum). Otherwise, greater than or equal to the minimum value (values >= minimum)." + }, + "format": { + "type": "string", + "default": "i32", + "description": "Format of the value in terms of how many bits of space it can use (follows the Rust float types).", + "enum": ["f32", "f64"] + } + }, + "additionalProperties": false + } + } + } + }, + { + "if": { + "properties": { + "logicalType": { + "const": "object" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "properties": { + "maxProperties": { + "type": "integer", + "minimum": 0, + "description": "Maximum number of properties." + }, + "minProperties": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Minimum number of properties." + }, + "required": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": true, + "description": "Property names that are required to exist in the object." 
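The conditional `logicalTypeOptions` blocks above differ per logical type. A hypothetical set of properties exercising the string, integer, and date variants (column names and bounds are invented) could look like this:

```yaml
properties:
  - name: email_address
    logicalType: string
    logicalTypeOptions:
      maxLength: 320
      format: email
  - name: quantity
    logicalType: integer
    logicalTypeOptions:
      format: i32
      minimum: 0
      exclusiveMinimum: false
  - name: order_date
    logicalType: date
    logicalTypeOptions:
      format: yyyy-MM-dd
      minimum: "2020-01-01"          # hypothetical lower bound
```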
+ } + }, + "additionalProperties": false + }, + "properties": { + "type": "array", + "description": "A list of properties for the object.", + "items": { + "$ref": "#/$defs/SchemaProperty" + } + } + } + } + }, + { + "if": { + "properties": { + "logicalType": { + "const": "array" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "properties": { + "maxItems": { + "type": "integer", + "minimum": 0, + "description": "Maximum number of items." + }, + "minItems": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Minimum number of items" + }, + "uniqueItems": { + "type": "boolean", + "default": false, + "description": "If set to true, all items in the array are unique." + } + }, + "additionalProperties": false + }, + "items": { + "$ref": "#/$defs/SchemaItemProperty", + "description": "List of items in an array (only applicable when `logicalType: array`)." + } + } + } + } + ] + }, + "SchemaProperty": { + "type": "object", + "$ref": "#/$defs/SchemaBaseProperty", + "required": ["name"], + "unevaluatedProperties": false + }, + "SchemaItemProperty": { + "type": "object", + "$ref": "#/$defs/SchemaBaseProperty", + "properties": { + "properties": { + "type": "array", + "description": "A list of properties for the object.", + "items": { + "$ref": "#/$defs/SchemaProperty" + } + } + }, + "unevaluatedProperties": false + }, + "Tags": { + "type": "array", + "description": "A list of tags that may be assigned to the elements (object or property); the tags keyword may appear at any level. Tags may be used to better categorize an element. For example, `finance`, `sensitive`, `employee_record`.", + "examples": ["finance", "sensitive", "employee_record"], + "items": { + "type": "string" + } + }, + "DataQuality": { + "type": "object", + "properties": { + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "businessImpact": { + "type": "string", + "description": "Consequences of the rule failure.", + "examples": ["operational", "regulatory"] + }, + "customProperties": { + "type": "array", + "description": "Additional properties required for rule execution.", + "items": { + "$ref": "#/$defs/CustomProperty" + } + }, + "description": { + "type": "string", + "description": "Describe the quality check to be completed." + }, + "dimension": { + "type": "string", + "description": "The key performance indicator (KPI) or dimension for data quality.", + "enum": ["accuracy", "completeness", "conformity", "consistency", "coverage", "timeliness", "uniqueness"] + }, + "method": { + "type": "string", + "examples": ["reconciliation"] + }, + "name": { + "type": "string", + "description": "Name of the data quality check." + }, + "schedule": { + "type": "string", + "description": "Rule execution schedule details.", + "examples": ["0 20 * * *"] + }, + "scheduler": { + "type": "string", + "description": "The name or type of scheduler used to start the data quality check.", + "examples": ["cron"] + }, + "severity": { + "type": "string", + "description": "The severance of the quality rule.", + "examples": ["info", "warning", "error"] + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "type": { + "type": "string", + "description": "The type of quality check. 'text' is human-readable text that describes the quality of the data. 'library' is a set of maintained predefined quality attributes such as row count or unique. 'sql' is an individual SQL query that returns a value that can be compared. 
'custom' is quality attributes that are vendor-specific, such as Soda or Great Expectations.", + "enum": ["text", "library", "sql", "custom"], + "default": "library" + }, + "unit": { + "type": "string", + "description": "Unit the rule is using, popular values are `rows` or `percent`, but any value is allowed.", + "examples": ["rows", "percent"] + } + }, + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "library" + } + } + }, + "then": { + "$ref": "#/$defs/DataQualityLibrary" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/DataQualitySql" + } + }, + { + "if": { + "properties": { + "type": { + "const": "custom" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/DataQualityCustom" + } + } + ] + }, + "DataQualityChecks": { + "type": "array", + "description": "Data quality rules with all the relevant information for rule setup and execution.", + "items": { + "$ref": "#/$defs/DataQuality" + } + }, + "DataQualityLibrary": { + "type": "object", + "properties": { + "rule": { + "type": "string", + "description": "Define a data quality check based on the predefined rules as per ODCS.", + "examples": ["duplicateCount", "validValues", "rowCount"] + }, + "mustBe": { + "description": "Must be equal to the value to be valid. When using numbers, it is equivalent to '='." + }, + "mustNotBe": { + "description": "Must not be equal to the value to be valid. When using numbers, it is equivalent to '!='." + }, + "mustBeGreaterThan": { + "type": "number", + "description": "Must be greater than the value to be valid. It is equivalent to '>'." + }, + "mustBeGreaterOrEqualTo": { + "type": "number", + "description": "Must be greater than or equal to the value to be valid. It is equivalent to '>='." + }, + "mustBeLessThan": { + "type": "number", + "description": "Must be less than the value to be valid. It is equivalent to '<'." + }, + "mustBeLessOrEqualTo": { + "type": "number", + "description": "Must be less than or equal to the value to be valid. It is equivalent to '<='." + }, + "mustBeBetween": { + "type": "array", + "description": "Must be between the two numbers to be valid. Smallest number first in the array.", + "minItems": 2, + "maxItems": 2, + "uniqueItems": true, + "items": { + "type": "number" + } + }, + "mustNotBeBetween": { + "type": "array", + "description": "Must not be between the two numbers to be valid. 
Smallest number first in the array.", + "minItems": 2, + "maxItems": 2, + "uniqueItems": true, + "items": { + "type": "number" + } + } + }, + "required": ["rule"] + }, + "DataQualitySql": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Query string that adheres to the dialect of the provided server.", + "examples": ["SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL"] + } + }, + "required": ["query"] + }, + "DataQualityCustom": { + "type": "object", + "properties": { + "engine": { + "type": "string", + "description": "Name of the engine which executes the data quality checks.", + "examples": ["soda", "great-expectations", "monte-carlo", "dbt"] + }, + "implementation": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object" + } + ] + } + }, + "required": ["engine", "implementation"] + }, + "AuthoritativeDefinitions": { + "type": "array", + "description": "List of links to sources that provide more details on the dataset; examples would be a link to an external definition, a training video, a git repo, data catalog, or another tool. Authoritative definitions follow the same structure in the standard.", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "URL to the authority." + }, + "type": { + "type": "string", + "description": "Type of definition for authority: v2.3 adds standard values: `businessDefinition`, `transformationImplementation`, `videoTutorial`, `tutorial`, and `implementation`.", + "examples": ["businessDefinition", "transformationImplementation", "videoTutorial", "tutorial", "implementation"] + } + }, + "required": ["url", "type"] + } + }, + "Support": { + "type": "array", + "description": "Top level for support channels.", + "items": { + "$ref": "#/$defs/SupportItem" + } + }, + "SupportItem": { + "type": "object", + "properties": { + "channel": { + "type": "string", + "description": "Channel name or identifier." + }, + "url": { + "type": "string", + "description": "Access URL using normal [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax) (https, mailto, etc.)." + }, + "description": { + "type": "string", + "description": "Description of the channel, free text." + }, + "tool": { + "type": "string", + "description": "Name of the tool, value can be `email`, `slack`, `teams`, `discord`, `ticket`, or `other`.", + "examples": ["email", "slack", "teams", "discord", "ticket", "other"] + }, + "scope": { + "type": "string", + "description": "Scope can be: `interactive`, `announcements`, `issues`.", + "examples": ["interactive", "announcements", "issues"] + }, + "invitationUrl": { + "type": "string", + "description": "Some tools uses invitation URL for requesting or subscribing. Follows the [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax)." + } + }, + "required": ["channel", "url"] + }, + "Pricing": { + "type": "object", + "properties": { + "priceAmount": { + "type": "number", + "description": "Subscription price per unit of measure in `priceUnit`." + }, + "priceCurrency": { + "type": "string", + "description": "Currency of the subscription price in `price.priceAmount`." + }, + "priceUnit": { + "type": "string", + "description": "The unit of measure for calculating cost. Examples megabyte, gigabyte." 
+ } + } + }, + "Team": { + "type": "object", + "properties": { + "username": { + "type": "string", + "description": "The user's username or email.", + "examples": [ + "mail@example.com", + "uid12345678" + ] + }, + "name": { + "type": "string", + "description": "The user's name.", + "examples": [ + "Jane Doe" + ] + }, + "description": { + "type": "string", + "description": "The user's description." + }, + "role": { + "type": "string", + "description": "The user's job role; Examples might be owner, data steward. There is no limit on the role." + }, + "dateIn": { + "type": "string", + "format": "date", + "description": "The date when the user joined the team." + }, + "dateOut": { + "type": "string", + "format": "date", + "description": "The date when the user ceased to be part of the team." + }, + "replacedByUsername": { + "type": "string", + "description": "The username of the user who replaced the previous user." + } + } + }, + "Role": { + "type": "object", + "properties": { + "role": { + "type": "string", + "description": "Name of the IAM role that provides access to the dataset." + }, + "description": { + "type": "string", + "description": "Description of the IAM role and its permissions." + }, + "access": { + "type": "string", + "description": "The type of access provided by the IAM role." + }, + "firstLevelApprovers": { + "type": "string", + "description": "The name(s) of the first-level approver(s) of the role." + }, + "secondLevelApprovers": { + "type": "string", + "description": "The name(s) of the second-level approver(s) of the role." + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + }, + "required": ["role"] + }, + "ServiceLevelAgreementProperty": { + "type": "object", + "properties": { + "property": { + "type": "string", + "description": "Specific property in SLA, check the periodic table. May requires units (more details to come)." + }, + "value": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "description": "Agreement value. The label will change based on the property itself." + }, + "valueExt": { + "$ref": "#/$defs/AnyNonCollectionType", + "description": "Extended agreement value. The label will change based on the property itself." + }, + "unit": { + "type": "string", + "description": "**d**, day, days for days; **y**, yr, years for years, etc. Units use the ISO standard." + }, + "element": { + "type": "string", + "description": "Element(s) to check on. Multiple elements should be extremely rare and, if so, separated by commas." + }, + "driver": { + "type": "string", + "description": "Describes the importance of the SLA from the list of: `regulatory`, `analytics`, or `operational`.", + "examples": ["regulatory", "analytics", "operational"] + } + }, + "required": ["property", "value"] + }, + "CustomProperties": { + "type": "array", + "description": "A list of key/value pairs for custom properties.", + "items": { + "$ref": "#/$defs/CustomProperty" + } + }, + "CustomProperty": { + "type": "object", + "properties": { + "property": { + "type": "string", + "description": "The name of the key. Names should be in camel case–the same as if they were permanent properties in the contract." + }, + "value": { + "$ref": "#/$defs/AnyType", + "description": "The value of the key." 
+ } + } + }, + "AnyType": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "boolean" + }, + { + "type": "null" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "AnyNonCollectionType": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + } +} diff --git a/datacontract/schemas/odcs-3.1.0.init.yaml b/datacontract/schemas/odcs-3.1.0.init.yaml new file mode 100644 index 000000000..05745ee75 --- /dev/null +++ b/datacontract/schemas/odcs-3.1.0.init.yaml @@ -0,0 +1,36 @@ +apiVersion: v3.1.0 +kind: DataContract +id: my-data-contract-id +name: My Data Contract +version: 1.0.0 +status: draft + +#team: +# name: Owner +# +#description: +# purpose: Purpose of the dataset. +# usage: Intended usage of the dataset. +# limitations: Limitations of the dataset. +# +#schema: +# - name: my_table +# description: Description of the table +# properties: +# - name: my_field +# logicalType: string +# description: Description of the field +# +#servers: +# - server: production +# type: postgres +# host: localhost +# port: 5432 +# database: my_database +# schema: public +# +#slaProperties: +# - property: latency +# value: 1h +# - property: availability +# value: 99.9% \ No newline at end of file diff --git a/datacontract/schemas/odcs-3.1.0.schema.json b/datacontract/schemas/odcs-3.1.0.schema.json new file mode 100644 index 000000000..3596e68b0 --- /dev/null +++ b/datacontract/schemas/odcs-3.1.0.schema.json @@ -0,0 +1,2928 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "title": "Open Data Contract Standard (ODCS)", + "description": "An open data contract specification to establish agreement between data producers and consumers.", + "type": "object", + "properties": { + "version": { + "type": "string", + "description": "Current version of the data contract." + }, + "kind": { + "type": "string", + "default": "DataContract", + "description": "The kind of file this is. Valid value is `DataContract`.", + "enum": ["DataContract"] + }, + "apiVersion": { + "type": "string", + "default": "v3.1.0", + "description": "Version of the standard used to build data contract. Default value is v3.1.0.", + "enum": ["v3.1.0", "v3.0.2", "v3.0.1", "v3.0.0", "v2.2.2", "v2.2.1", "v2.2.0"] + }, + "id": { + "type": "string", + "description": "A unique identifier used to reduce the risk of dataset name collisions, such as a UUID." + }, + "name": { + "type": "string", + "description": "Name of the data contract." + }, + "tenant": { + "type": "string", + "description": "Indicates the property the data is primarily associated with. Value is case insensitive." + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "status": { + "type": "string", + "description": "Current status of the dataset.", + "examples": [ + "proposed", "draft", "active", "deprecated", "retired" + ] + }, + "servers": { + "type": "array", + "description": "List of servers where the datasets reside.", + "items": { + "$ref": "#/$defs/Server" + } + }, + "dataProduct": { + "type": "string", + "description": "The name of the data product." + }, + "description": { + "type": "object", + "description": "High level description of the dataset.", + "properties": { + "usage": { + "type": "string", + "description": "Intended usage of the dataset." + }, + "purpose": { + "type": "string", + "description": "Purpose of the dataset." 
+ }, + "limitations": { + "type": "string", + "description": "Limitations of the dataset." + }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + } + }, + "domain": { + "type": "string", + "description": "Name of the logical data domain.", + "examples": ["imdb_ds_aggregate", "receiver_profile_out", "transaction_profile_out"] + }, + "schema": { + "type": "array", + "description": "A list of elements within the schema to be cataloged.", + "items": { + "$ref": "#/$defs/SchemaObject" + } + }, + "support": { + "$ref": "#/$defs/Support" + }, + "price": { + "$ref": "#/$defs/Pricing" + }, + "team": { + "oneOf": [ + { + "$ref": "#/$defs/Team", + "description": "Team information object with members array (v3.1.0+)." + }, + { + "type": "array", + "description": "DEPRECATED: Array of team members. Use the Team object structure instead. This array format is maintained for backward compatibility with v3.0.2 and earlier versions and will be removed in ODCS 4.0.", + "deprecated": true, + "items": { + "$ref": "#/$defs/TeamMember" + } + } + ] + }, + "roles": { + "type": "array", + "description": "A list of roles that will provide user access to the dataset.", + "items": { + "$ref": "#/$defs/Role" + } + }, + "slaDefaultElement": { + "type": "string", + "description": "DEPRECATED SINCE 3.1. WILL BE REMOVED IN ODCS 4.0. Element (using the element path notation) to do the checks on.", + "deprecated": true + }, + "slaProperties": { + "type": "array", + "description": "A list of key/value pairs for SLA specific properties. There is no limit on the type of properties (more details to come).", + "items": { + "$ref": "#/$defs/ServiceLevelAgreementProperty" + } + }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + }, + "contractCreatedTs": { + "type": "string", + "format": "date-time", + "description": "Timestamp in UTC of when the data contract was created." + } + }, + "required": ["version", "apiVersion", "kind", "id", "status"], + "additionalProperties": false, + "unevaluatedProperties": false, + "$defs": { + "ShorthandReference": { + "type": "string", + "description": "Shorthand notation using name fields (table_name.column_name)", + "pattern": "^[A-Za-z_][A-Za-z0-9_]*\\.[A-Za-z_][A-Za-z0-9_]*$" + }, + "FullyQualifiedReference": { + "type": "string", + "description": "Fully qualified notation using id fields (section/id/properties/id), optionally prefixed with external file reference", + "pattern": "^(?:(?:https?:\\/\\/)?[A-Za-z0-9._\\-\\/]+\\.yaml#)?\\/?[A-Za-z_][A-Za-z0-9_]*\\/[A-Za-z0-9_-]+(?:\\/[A-Za-z_][A-Za-z0-9_]*\\/[A-Za-z0-9_-]+)*$" + }, + "StableId": { + "type": "string", + "description": "Stable technical identifier for references. Must be unique within its containing array. Cannot contain special characters ('-', '_' allowed).", + "pattern": "^[A-Za-z0-9_-]+$" + }, + "Server": { + "type": "object", + "description": "Data source details of where data is physically stored.", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "server": { + "type": "string", + "description": "Identifier of the server." 
+ }, + "type": { + "type": "string", + "description": "Type of the server.", + "enum": [ + "api", "athena", "azure", "bigquery", "clickhouse", "databricks", "denodo", "dremio", + "duckdb", "glue", "cloudsql", "db2", "hive", "impala", "informix", "kafka", "kinesis", "local", + "mysql", "oracle", "postgresql", "postgres", "presto", "pubsub", + "redshift", "s3", "sftp", "snowflake", "sqlserver", "synapse", "trino", "vertica", "zen", "custom" + ] + }, + "description": { + "type": "string", + "description": "Description of the server." + }, + "environment": { + "type": "string", + "description": "Environment of the server.", + "examples": ["prod", "preprod", "dev", "uat"] + }, + "roles": { + "type": "array", + "description": "List of roles that have access to the server.", + "items": { + "$ref": "#/$defs/Role" + } + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + }, + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "api" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/ApiServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "athena" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/AthenaServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "azure" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/AzureServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "bigquery" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/BigQueryServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "clickhouse" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/ClickHouseServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "databricks" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DatabricksServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "denodo" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DenodoServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "dremio" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DremioServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "duckdb" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/DuckdbServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "glue" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/GlueServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "cloudsql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/GoogleCloudSqlServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "db2" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/IBMDB2Server" + } + }, + { + "if": { + "properties": { + "type": { + "const": "hive" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/HiveServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "impala" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/ImpalaServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "informix" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/InformixServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "zen" + } + }, + "required": ["type"] + }, + "then": { 
+ "$ref": "#/$defs/ServerSource/ZenServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "custom" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/CustomServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "kafka" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/KafkaServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "kinesis" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/KinesisServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "local" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/LocalServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "mysql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/MySqlServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "oracle" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/OracleServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "postgresql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PostgresServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "postgres" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PostgresServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "presto" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PrestoServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "pubsub" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/PubSubServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "redshift" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/RedshiftServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "s3" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/S3Server" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sftp" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SftpServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "snowflake" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SnowflakeServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sqlserver" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SqlserverServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "synapse" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/SynapseServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "trino" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/TrinoServer" + } + }, + { + "if": { + "properties": { + "type": { + "const": "vertica" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/ServerSource/VerticaServer" + } + } + ], + "required": ["server", "type"], + "unevaluatedProperties": false + }, + "ServerSource": { + "ApiServer": { + "type": "object", + "title": "AthenaServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "The url to the API.", + "examples": [ + "https://api.example.com/v1" + ] + } + }, + "required": [ + "location" + ] + }, + "AthenaServer": { + "type": "object", + "title": "AthenaServer", + "properties": { + 
"stagingDir": { + "type": "string", + "format": "uri", + "description": "Amazon Athena automatically stores query results and metadata information for each query that runs in a query result location that you can specify in Amazon S3.", + "examples": [ + "s3://my_storage_account_name/my_container/path" + ] + }, + "schema": { + "type": "string", + "description": "Identify the schema in the data source in which your tables exist." + }, + "catalog": { + "type": "string", + "description": "Identify the name of the Data Source, also referred to as a Catalog.", + "default": "awsdatacatalog" + }, + "regionName": { + "type": "string", + "description": "The region your AWS account uses.", + "examples": ["eu-west-1"] + } + }, + "required": [ + "stagingDir", + "schema" + ] + }, + "AzureServer": { + "type": "object", + "title": "AzureServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "Fully qualified path to Azure Blob Storage or Azure Data Lake Storage (ADLS), supports globs.", + "examples": [ + "az://my_storage_account_name.blob.core.windows.net/my_container/path/*.parquet", + "abfss://my_storage_account_name.dfs.core.windows.net/my_container_name/path/*.parquet" + ] + }, + "format": { + "type": "string", + "examples": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "examples": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location", + "format" + ] + }, + "BigQueryServer": { + "type": "object", + "title": "BigQueryServer", + "properties": { + "project": { + "type": "string", + "description": "The GCP project name." + }, + "dataset": { + "type": "string", + "description": "The GCP dataset name." + } + }, + "required": [ + "project", + "dataset" + ] + }, + "ClickHouseServer": { + "type": "object", + "title": "ClickHouseServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the ClickHouse server." + }, + "port": { + "type": "integer", + "description": "The port to the ClickHouse server." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "DatabricksServer": { + "type": "object", + "title": "DatabricksServer", + "properties": { + "host": { + "type": "string", + "description": "The Databricks host", + "examples": [ + "dbc-abcdefgh-1234.cloud.databricks.com" + ] + }, + "catalog": { + "type": "string", + "description": "The name of the Hive or Unity catalog" + }, + "schema": { + "type": "string", + "description": "The schema name in the catalog" + } + }, + "required": [ + "catalog", + "schema" + ] + }, + "DenodoServer": { + "type": "object", + "title": "DenodoServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Denodo server." + }, + "port": { + "type": "integer", + "description": "The port of the Denodo server." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port" + ] + }, + "DremioServer": { + "type": "object", + "title": "DremioServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Dremio server." + }, + "port": { + "type": "integer", + "description": "The port of the Dremio server." + }, + "schema": { + "type": "string", + "description": "The name of the schema." 
+ } + }, + "required": [ + "host", + "port" + ] + }, + "DuckdbServer": { + "type": "object", + "title": "DuckdbServer", + "properties": { + "database": { + "type": "string", + "description": "Path to duckdb database file." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "database" + ] + }, + "GlueServer": { + "type": "object", + "title": "GlueServer", + "properties": { + "account": { + "type": "string", + "description": "The AWS Glue account", + "examples": [ + "1234-5678-9012" + ] + }, + "database": { + "type": "string", + "description": "The AWS Glue database name", + "examples": [ + "my_database" + ] + }, + "location": { + "type": "string", + "format": "uri", + "description": "The AWS S3 path. Must be in the form of a URL.", + "examples": [ + "s3://datacontract-example-orders-latest/data/{model}" + ] + }, + "format": { + "type": "string", + "description": "The format of the files", + "examples": [ + "parquet", + "csv", + "json", + "delta" + ] + } + }, + "required": [ + "account", + "database" + ] + }, + "GoogleCloudSqlServer": { + "type": "object", + "title": "GoogleCloudSqlServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Google Cloud Sql server." + }, + "port": { + "type": "integer", + "description": "The port of the Google Cloud Sql server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "host", + "port", + "database", + "schema" + ] + }, + "IBMDB2Server": { + "type": "object", + "title": "IBMDB2Server", + "properties": { + "host": { + "type": "string", + "description": "The host of the IBM DB2 server." + }, + "port": { + "type": "integer", + "description": "The port of the IBM DB2 server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "HiveServer": { + "type": "object", + "title": "HiveServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Hive server. " + }, + "port": { + "type": "integer", + "description": "The port to the Hive server. Defaults to 10000." + }, + "database": { + "type": "string", + "description": "The name of the Hive database." + } + }, + "required": [ + "host", + "database" + ] + }, + "ImpalaServer": { + "type": "object", + "title": "ImpalaServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Impala server." + }, + "port": { + "type": "integer", + "description": "The port to the Impala server. Defaults to 21050." + }, + "database": { + "type": "string", + "description": "The name of the Impala database." + } + }, + "required": [ + "host", + "database" + ] + }, + "InformixServer": { + "type": "object", + "title": "InformixServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Informix server. " + }, + "port": { + "type": "integer", + "description": "The port to the Informix server. Defaults to 9088." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "database" + ] + }, + "ZenServer": { + "type": "object", + "title": "ZenServer", + "properties": { + "host": { + "type": "string", + "description": "Hostname or IP address of the Zen server." 
+ }, + "port": { + "type": "integer", + "description": "Zen server SQL connections port. Defaults to 1583." + }, + "database": { + "type": "string", + "description": "Database name to connect to on the Zen server." + } + }, + "required": ["host", "database"] + }, + "CustomServer": { + "type": "object", + "title": "CustomServer", + "properties": { + "account": { + "type": "string", + "description": "Account used by the server." + }, + "catalog": { + "type": "string", + "description": "Name of the catalog." + }, + "database": { + "type": "string", + "description": "Name of the database." + }, + "dataset": { + "type": "string", + "description": "Name of the dataset." + }, + "delimiter": { + "type": "string", + "description": "Delimiter." + }, + "endpointUrl": { + "type": "string", + "description": "Server endpoint.", + "format": "uri" + }, + "format": { + "type": "string", + "description": "File format." + }, + "host": { + "type": "string", + "description": "Host name or IP address." + }, + "location": { + "type": "string", + "description": "A URL to a location.", + "format": "uri" + }, + "path": { + "type": "string", + "description": "Relative or absolute path to the data file(s)." + }, + "port": { + "type": "integer", + "description": "Port to the server. No default value is assumed for custom servers." + }, + "project": { + "type": "string", + "description": "Project name." + }, + "region": { + "type": "string", + "description": "Cloud region." + }, + "regionName": { + "type": "string", + "description": "Region name." + }, + "schema": { + "type": "string", + "description": "Name of the schema." + }, + "serviceName": { + "type": "string", + "description": "Name of the service." + }, + "stagingDir": { + "type": "string", + "description": "Staging directory." + }, + "warehouse": { + "type": "string", + "description": "Name of the cluster or warehouse." + }, + "stream": { + "type": "string", + "description": "Name of the data stream." + } + } + }, + "KafkaServer": { + "type": "object", + "title": "KafkaServer", + "description": "Kafka Server", + "properties": { + "host": { + "type": "string", + "description": "The bootstrap server of the kafka cluster." + }, + "format": { + "type": "string", + "description": "The format of the messages.", + "examples": ["json", "avro", "protobuf", "xml"], + "default": "json" + } + }, + "required": [ + "host" + ] + }, + "KinesisServer": { + "type": "object", + "title": "KinesisDataStreamsServer", + "description": "Kinesis Data Streams Server", + "properties": { + "region": { + "type": "string", + "description": "AWS region.", + "examples": [ + "eu-west-1" + ] + }, + "format": { + "type": "string", + "description": "The format of the record", + "examples": [ + "json", + "avro", + "protobuf" + ] + } + } + }, + "LocalServer": { + "type": "object", + "title": "LocalServer", + "properties": { + "path": { + "type": "string", + "description": "The relative or absolute path to the data file(s).", + "examples": [ + "./folder/data.parquet", + "./folder/*.parquet" + ] + }, + "format": { + "type": "string", + "description": "The format of the file(s)", + "examples": [ + "json", + "parquet", + "delta", + "csv" + ] + } + }, + "required": [ + "path", + "format" + ] + }, + "MySqlServer": { + "type": "object", + "title": "MySqlServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the MySql server." + }, + "port": { + "type": "integer", + "description": "The port of the MySql server." 
+ }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "OracleServer": { + "type": "object", + "title": "OracleServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the oracle server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the oracle server.", + "examples": [ + 1523 + ] + }, + "serviceName": { + "type": "string", + "description": "The name of the service.", + "examples": [ + "service" + ] + } + }, + "required": [ + "host", + "port", + "serviceName" + ] + }, + "PostgresServer": { + "type": "object", + "title": "PostgresServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Postgres server" + }, + "port": { + "type": "integer", + "description": "The port to the Postgres server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database." + } + }, + "required": [ + "host", + "port", + "database", + "schema" + ] + }, + "PrestoServer": { + "type": "object", + "title": "PrestoServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Presto server", + "examples": [ + "localhost:8080" + ] + }, + "catalog": { + "type": "string", + "description": "The name of the catalog.", + "examples": [ + "postgres" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema.", + "examples": [ + "public" + ] + } + }, + "required": [ + "host" + ] + }, + "PubSubServer": { + "type": "object", + "title": "PubSubServer", + "properties": { + "project": { + "type": "string", + "description": "The GCP project name." + } + }, + "required": [ + "project" + ] + }, + "RedshiftServer": { + "type": "object", + "title": "RedshiftServer", + "properties": { + "host": { + "type": "string", + "description": "An optional string describing the server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + }, + "region": { + "type": "string", + "description": "AWS region of Redshift server.", + "examples": ["us-east-1"] + }, + "account": { + "type": "string", + "description": "The account used by the server." + } + }, + "required": [ + "database", + "schema" + ] + }, + "S3Server": { + "type": "object", + "title": "S3Server", + "properties": { + "location": { + "type": "string", + "format": "uri", + "description": "S3 URL, starting with `s3://`", + "examples": [ + "s3://datacontract-example-orders-latest/data/{model}/*.json" + ] + }, + "endpointUrl": { + "type": "string", + "format": "uri", + "description": "The server endpoint for S3-compatible servers.", + "examples": ["https://minio.example.com"] + }, + "format": { + "type": "string", + "examples": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "examples": [ + "new_line", + "array" + ], + "description": "Only for format = json. 
How multiple json documents are delimited within one file" + } + }, + "required": [ + "location" + ] + }, + "SftpServer": { + "type": "object", + "title": "SftpServer", + "properties": { + "location": { + "type": "string", + "format": "uri", + "pattern": "^sftp://.*", + "description": "SFTP URL, starting with `sftp://`", + "examples": [ + "sftp://123.123.12.123/{model}/*.json" + ] + }, + "format": { + "type": "string", + "examples": [ + "parquet", + "delta", + "json", + "csv" + ], + "description": "File format." + }, + "delimiter": { + "type": "string", + "examples": [ + "new_line", + "array" + ], + "description": "Only for format = json. How multiple json documents are delimited within one file" + } + }, + "required": [ + "location" + ] + }, + "SnowflakeServer": { + "type": "object", + "title": "SnowflakeServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the Snowflake server" + }, + "port": { + "type": "integer", + "description": "The port to the Snowflake server." + }, + "account": { + "type": "string", + "description": "The Snowflake account used by the server." + }, + "database": { + "type": "string", + "description": "The name of the database." + }, + "schema": { + "type": "string", + "description": "The name of the schema." + }, + "warehouse": { + "type": "string", + "description": "The name of the cluster of resources that is a Snowflake virtual warehouse." + } + }, + "required": [ + "account", + "database", + "schema" + ] + }, + "SqlserverServer": { + "type": "object", + "title": "SqlserverServer", + "properties": { + "host": { + "type": "string", + "description": "The host to the database server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the database server.", + "default": 1433, + "examples": [ + 1433 + ] + }, + "database": { + "type": "string", + "description": "The name of the database.", + "examples": [ + "database" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "dbo" + ] + } + }, + "required": [ + "host", + "database", + "schema" + ] + }, + "SynapseServer": { + "type": "object", + "title": "SynapseServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Synapse server." + }, + "port": { + "type": "integer", + "description": "The port of the Synapse server." + }, + "database": { + "type": "string", + "description": "The name of the database." + } + }, + "required": [ + "host", + "port", + "database" + ] + }, + "TrinoServer": { + "type": "object", + "title": "TrinoServer", + "properties": { + "host": { + "type": "string", + "description": "The Trino host URL.", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The Trino port." + }, + "catalog": { + "type": "string", + "description": "The name of the catalog.", + "examples": [ + "hive" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "my_schema" + ] + } + }, + "required": [ + "host", + "port", + "catalog", + "schema" + ] + }, + "VerticaServer": { + "type": "object", + "title": "VerticaServer", + "properties": { + "host": { + "type": "string", + "description": "The host of the Vertica server." + }, + "port": { + "type": "integer", + "description": "The port of the Vertica server." + }, + "database": { + "type": "string", + "description": "The name of the database." 
+ }, + "schema": { + "type": "string", + "description": "The name of the schema." + } + }, + "required": [ + "host", + "port", + "database", + "schema" + ] + } + }, + "SchemaElement": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "name": { + "type": "string", + "description": "Name of the element." + }, + "physicalType": { + "type": "string", + "description": "The physical element data type in the data source.", + "examples": ["table", "view", "topic", "file"] + }, + "description": { + "type": "string", + "description": "Description of the element." + }, + "businessName": { + "type": "string", + "description": "The business name of the element." + }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + } + }, + "SchemaObject": { + "type": "object", + "properties": { + "logicalType": { + "type": "string", + "description": "The logical element data type.", + "enum": ["object"] + }, + "physicalName": { + "type": "string", + "description": "Physical name.", + "examples": ["table_1_2_0"] + }, + "dataGranularityDescription": { + "type": "string", + "description": "Granular level of the data in the object.", + "examples": ["Aggregation by country"] + }, + "properties": { + "type": "array", + "description": "A list of properties for the object.", + "items": { + "$ref": "#/$defs/SchemaProperty" + } + }, + "relationships": { + "type": "array", + "description": "A list of relationships to other properties. Each relationship must have 'from', 'to' and optionally 'type' field.", + "items": { + "$ref": "#/$defs/RelationshipSchemaLevel" + } + }, + "quality": { + "$ref": "#/$defs/DataQualityChecks" + } + }, + "allOf": [ + { + "$ref": "#/$defs/SchemaElement" + } + ], + "required": ["name"], + "unevaluatedProperties": false + }, + "SchemaBaseProperty": { + "type": "object", + "properties": { + "primaryKey": { + "type": "boolean", + "description": "Boolean value specifying whether the element is primary or not. Default is false." + }, + "primaryKeyPosition": { + "type": "integer", + "default": -1, + "description": "If element is a primary key, the position of the primary key element. Starts from 1. Example of `account_id, name` being primary key columns, `account_id` has primaryKeyPosition 1 and `name` primaryKeyPosition 2. Default to -1." + }, + "logicalType": { + "type": "string", + "description": "The logical element data type.", + "enum": ["string", "date", "timestamp", "time", "number", "integer", "object", "array", "boolean"] + }, + "logicalTypeOptions": { + "type": "object", + "description": "Additional optional metadata to describe the logical type." + }, + "physicalType": { + "type": "string", + "description": "The physical element data type in the data source. For example, VARCHAR(2), DOUBLE, INT." + }, + "physicalName": { + "type": "string", + "description": "Physical name.", + "examples": ["col_str_a"] + }, + "required": { + "type": "boolean", + "default": false, + "description": "Indicates if the element may contain Null values; possible values are true and false. Default is false." + }, + "unique": { + "type": "boolean", + "default": false, + "description": "Indicates if the element contains unique values; possible values are true and false. Default is false." + }, + "partitioned": { + "type": "boolean", + "default": false, + "description": "Indicates if the element is partitioned; possible values are true and false." 
+ }, + "partitionKeyPosition": { + "type": "integer", + "default": -1, + "description": "If element is used for partitioning, the position of the partition element. Starts from 1. Example of `country, year` being partition columns, `country` has partitionKeyPosition 1 and `year` partitionKeyPosition 2. Default to -1." + }, + "classification": { + "type": "string", + "description": "Can be anything, like confidential, restricted, and public to more advanced categorization. Some companies like PayPal, use data classification indicating the class of data in the element; expected values are 1, 2, 3, 4, or 5.", + "examples": ["confidential", "restricted", "public"] + }, + "encryptedName": { + "type": "string", + "description": "The element name within the dataset that contains the encrypted element value. For example, unencrypted element `email_address` might have an encryptedName of `email_address_encrypt`." + }, + "transformSourceObjects": { + "type": "array", + "description": "List of objects in the data source used in the transformation.", + "items": { + "type": "string" + } + }, + "transformLogic": { + "type": "string", + "description": "Logic used in the element transformation." + }, + "transformDescription": { + "type": "string", + "description": "Describes the transform logic in very simple terms." + }, + "examples": { + "type": "array", + "description": "List of sample element values.", + "items": { + "$ref": "#/$defs/AnyType" + } + }, + "criticalDataElement": { + "type": "boolean", + "default": false, + "description": "True or false indicator; If element is considered a critical data element (CDE) then true else false." + }, + "relationships": { + "type": "array", + "description": "A list of relationships to other properties. When defined at property level, the 'from' field is implicit and should not be specified.", + "items": { + "$ref": "#/$defs/RelationshipPropertyLevel" + } + }, + "quality": { + "$ref": "#/$defs/DataQualityChecks" + } + }, + "allOf": [ + { + "$ref": "#/$defs/SchemaElement" + }, + { + "if": { + "properties": { + "logicalType": { + "const": "string" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "additionalProperties": false, + "properties": { + "minLength": { + "type": "integer", + "minimum": 0, + "description": "Minimum length of the string." + }, + "maxLength": { + "type": "integer", + "minimum": 0, + "description": "Maximum length of the string." + }, + "pattern": { + "type": "string", + "description": "Regular expression pattern to define valid value. Follows regular expression syntax from ECMA-262 (https://262.ecma-international.org/5.1/#sec-15.10.1)." + }, + "format": { + "type": "string", + "examples": ["password", "byte", "binary", "email", "uuid", "uri", "hostname", "ipv4", "ipv6"], + "description": "Provides extra context about what format the string follows." + } + } + } + } + } + }, + { + "if": { + "properties": { + "logicalType": { + "const": "date" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "additionalProperties": false, + "properties": { + "format": { + "type": "string", + "examples": ["yyyy-MM-dd", "yyyy-MM-dd HH:mm:ss", "HH:mm:ss"], + "description": "Format of the date. Follows the format as prescribed by [JDK DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). For example, format 'yyyy-MM-dd'." 
+ }, + "exclusiveMaximum": { + "type": "string", + "description": "All values must be strictly less than this value (values < exclusiveMaximum)." + }, + "maximum": { + "type": "string", + "description": "All date values are less than or equal to this value (values <= maximum)." + }, + "exclusiveMinimum": { + "type": "string", + "description": "All values must be strictly greater than this value (values > exclusiveMinimum)." + }, + "minimum": { + "type": "string", + "description": "All date values are greater than or equal to this value (values >= minimum)." + } + } + } + } + } + }, + { + "if": { + "anyOf": [ + { + "properties": { + "logicalType": { + "const": "timestamp" + } + } + }, + { + "properties": { + "logicalType": { + "const": "time" + } + } + } + ] + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "additionalProperties": false, + "properties": { + "format": { + "type": "string", + "examples": ["yyyy-MM-dd", "yyyy-MM-dd HH:mm:ss", "HH:mm:ss"], + "description": "Format of the date. Follows the format as prescribed by [JDK DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). For example, format 'yyyy-MM-dd'." + }, + "exclusiveMaximum": { + "type": "string", + "description": "All values must be strictly less than this value (values < exclusiveMaximum)." + }, + "maximum": { + "type": "string", + "description": "All date values are less than or equal to this value (values <= maximum)." + }, + "exclusiveMinimum": { + "type": "string", + "description": "All values must be strictly greater than this value (values > exclusiveMinimum)." + }, + "minimum": { + "type": "string", + "description": "All date values are greater than or equal to this value (values >= minimum)." + }, + "timezone": { + "type": "boolean", + "description": "Whether the timestamp defines the timezone or not. If true, timezone information is included in the timestamp." + }, + "defaultTimezone": { + "type": "string", + "description": "The default timezone of the timestamp. If timezone is not defined, the default timezone UTC is used.", + "default": "Etc/UTC" + } + } + } + } + } + }, + { + "if": { + "anyOf": [ + { + "properties": { + "logicalType": { + "const": "integer" + } + } + } + ] + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "additionalProperties": false, + "properties": { + "multipleOf": { + "type": "number", + "exclusiveMinimum": 0, + "description": "Values must be multiples of this number. For example, multiple of 5 has valid values 0, 5, 10, -5." + }, + "maximum": { + "type": "number", + "description": "All values are less than or equal to this value (values <= maximum)." + }, + "exclusiveMaximum": { + "type": "number", + "description": "All values must be strictly less than this value (values < exclusiveMaximum)." + }, + "minimum": { + "type": "number", + "description": "All values are greater than or equal to this value (values >= minimum)." + }, + "exclusiveMinimum": { + "type": "number", + "description": "All values must be strictly greater than this value (values > exclusiveMinimum)." 
+ }, + "format": { + "type": "string", + "default": "i32", + "description": "Format of the value in terms of how many bits of space it can use and whether it is signed or unsigned (follows the Rust integer types).", + "enum": ["i8", "i16", "i32", "i64", "i128", "u8", "u16", "u32", "u64", "u128"] + } + } + } + } + } + }, + { + "if": { + "anyOf": [ + { + "properties": { + "logicalType": { + "const": "number" + } + } + } + ] + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "additionalProperties": false, + "properties": { + "multipleOf": { + "type": "number", + "exclusiveMinimum": 0, + "description": "Values must be multiples of this number. For example, multiple of 5 has valid values 0, 5, 10, -5." + }, + "maximum": { + "type": "number", + "description": "All values are less than or equal to this value (values <= maximum)." + }, + "exclusiveMaximum": { + "type": "number", + "description": "All values must be strictly less than this value (values < exclusiveMaximum)." + }, + "minimum": { + "type": "number", + "description": "All values are greater than or equal to this value (values >= minimum)." + }, + "exclusiveMinimum": { + "type": "number", + "description": "All values must be strictly greater than this value (values > exclusiveMinimum)." + }, + "format": { + "type": "string", + "default": "i32", + "description": "Format of the value in terms of how many bits of space it can use (follows the Rust float types).", + "enum": ["f32", "f64"] + } + } + } + } + } + }, + { + "if": { + "properties": { + "logicalType": { + "const": "object" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "additionalProperties": false, + "properties": { + "maxProperties": { + "type": "integer", + "minimum": 0, + "description": "Maximum number of properties." + }, + "minProperties": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Minimum number of properties." + }, + "required": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": true, + "description": "Property names that are required to exist in the object." + } + } + }, + "properties": { + "type": "array", + "description": "A list of properties for the object.", + "items": { + "$ref": "#/$defs/SchemaProperty" + } + } + } + } + }, + { + "if": { + "properties": { + "logicalType": { + "const": "array" + } + } + }, + "then": { + "properties": { + "logicalTypeOptions": { + "type": "object", + "additionalProperties": false, + "properties": { + "maxItems": { + "type": "integer", + "minimum": 0, + "description": "Maximum number of items." + }, + "minItems": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Minimum number of items" + }, + "uniqueItems": { + "type": "boolean", + "default": false, + "description": "If set to true, all items in the array are unique." + } + } + }, + "items": { + "$ref": "#/$defs/SchemaItemProperty", + "description": "List of items in an array (only applicable when `logicalType: array`)." 
+ } + } + } + } + ], + "unevaluatedProperties": false + }, + "SchemaProperty": { + "type": "object", + "$ref": "#/$defs/SchemaBaseProperty", + "required": ["name"], + "unevaluatedProperties": false + }, + "SchemaItemProperty": { + "type": "object", + "$ref": "#/$defs/SchemaBaseProperty", + "properties": { + "properties": { + "type": "array", + "description": "A list of properties for the object.", + "items": { + "$ref": "#/$defs/SchemaProperty" + } + } + }, + "unevaluatedProperties": false + }, + "Tags": { + "type": "array", + "description": "A list of tags that may be assigned to the elements (object or property); the tags keyword may appear at any level. Tags may be used to better categorize an element. For example, `finance`, `sensitive`, `employee_record`.", + "examples": ["finance", "sensitive", "employee_record"], + "items": { + "type": "string" + } + }, + "DataQuality": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + }, + "businessImpact": { + "type": "string", + "description": "Consequences of the rule failure.", + "examples": ["operational", "regulatory"] + }, + "customProperties": { + "type": "array", + "description": "Additional properties required for rule execution.", + "items": { + "$ref": "#/$defs/CustomProperty" + } + }, + "description": { + "type": "string", + "description": "Describe the quality check to be completed." + }, + "dimension": { + "type": "string", + "description": "The key performance indicator (KPI) or dimension for data quality.", + "enum": ["accuracy", "completeness", "conformity", "consistency", "coverage", "timeliness", "uniqueness"] + }, + "method": { + "type": "string", + "examples": ["reconciliation"] + }, + "name": { + "type": "string", + "description": "Name of the data quality check." + }, + "schedule": { + "type": "string", + "description": "Rule execution schedule details.", + "examples": ["0 20 * * *"] + }, + "scheduler": { + "type": "string", + "description": "The name or type of scheduler used to start the data quality check.", + "examples": ["cron"] + }, + "severity": { + "type": "string", + "description": "The severance of the quality rule.", + "examples": ["info", "warning", "error"] + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "type": { + "type": "string", + "description": "The type of quality check. 'text' is human-readable text that describes the quality of the data. 'library' is a set of maintained predefined quality attributes such as row count or unique. 'sql' is an individual SQL query that returns a value that can be compared. 
'custom' is quality attributes that are vendor-specific, such as Soda or Great Expectations.", + "enum": ["text", "library", "sql", "custom"], + "default": "library" + }, + "unit": { + "type": "string", + "description": "Unit the rule is using, popular values are `rows` or `percent`, but any value is allowed.", + "examples": ["rows", "percent"] + } + }, + "allOf": [ + { + "if": { + "anyOf": [ + { + "properties": { + "type": { + "const": "library" + } + }, + "required": ["type"] + }, + { + "properties": { + "metric": { + "type": "string" + } + }, + "required": ["metric"] + } + ] + }, + "then": { + "$ref": "#/$defs/DataQualityLibrary" + } + }, + { + "if": { + "properties": { + "type": { + "const": "sql" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/DataQualitySql" + } + }, + { + "if": { + "properties": { + "type": { + "const": "custom" + } + }, + "required": ["type"] + }, + "then": { + "$ref": "#/$defs/DataQualityCustom" + } + } + ], + "unevaluatedProperties": false + }, + "DataQualityChecks": { + "type": "array", + "description": "Data quality rules with all the relevant information for rule setup and execution.", + "items": { + "$ref": "#/$defs/DataQuality" + } + }, + "DataQualityOperators": { + "type": "object", + "description": "Common comparison operators for data quality checks.", + "oneOf": [ + { + "properties": { + "mustBe": { + "description": "Must be equal to the value to be valid. When using numbers, it is equivalent to '='." + } + }, + "required": ["mustBe"] + }, + { + "properties": { + "mustNotBe": { + "description": "Must not be equal to the value to be valid. When using numbers, it is equivalent to '!='." + } + }, + "required": ["mustNotBe"] + }, + { + "properties": { + "mustBeGreaterThan": { + "type": "number", + "description": "Must be greater than the value to be valid. It is equivalent to '>'." + } + }, + "required": ["mustBeGreaterThan"] + }, + { + "properties": { + "mustBeGreaterOrEqualTo": { + "type": "number", + "description": "Must be greater than or equal to the value to be valid. It is equivalent to '>='." + } + }, + "required": ["mustBeGreaterOrEqualTo"] + }, + { + "properties": { + "mustBeLessThan": { + "type": "number", + "description": "Must be less than the value to be valid. It is equivalent to '<'." + } + }, + "required": ["mustBeLessThan"] + }, + { + "properties": { + "mustBeLessOrEqualTo": { + "type": "number", + "description": "Must be less than or equal to the value to be valid. It is equivalent to '<='." + } + }, + "required": ["mustBeLessOrEqualTo"] + }, + { + "properties": { + "mustBeBetween": { + "type": "array", + "description": "Must be between the two numbers to be valid. Smallest number first in the array.", + "minItems": 2, + "maxItems": 2, + "uniqueItems": true, + "items": { + "type": "number" + } + } + }, + "required": ["mustBeBetween"] + }, + { + "properties": { + "mustNotBeBetween": { + "type": "array", + "description": "Must not be between the two numbers to be valid. 
Smallest number first in the array.", + "minItems": 2, + "maxItems": 2, + "uniqueItems": true, + "items": { + "type": "number" + } + } + }, + "required": ["mustNotBeBetween"] + } + ] + }, + "DataQualityLibrary": { + "type": "object", + "allOf": [ + { + "$ref": "#/$defs/DataQualityOperators" + } + ], + "properties": { + "metric": { + "type": "string", + "description": "Define a data quality check based on the predefined metrics as per ODCS.", + "enum": ["nullValues", "missingValues", "invalidValues", "duplicateValues", "rowCount"] + }, + "rule": { + "type": "string", + "deprecated": true, + "description": "Use metric instead" + }, + "arguments": { + "type": "object", + "description": "Additional arguments for the metric, if needed." + } + }, + "required": ["metric"] + }, + "DataQualitySql": { + "type": "object", + "allOf": [ + { + "$ref": "#/$defs/DataQualityOperators" + } + ], + "properties": { + "query": { + "type": "string", + "description": "Query string that adheres to the dialect of the provided server.", + "examples": ["SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL"] + } + }, + "required": ["query"] + }, + "DataQualityCustom": { + "type": "object", + "properties": { + "engine": { + "type": "string", + "description": "Name of the engine which executes the data quality checks.", + "examples": ["soda", "great-expectations", "monte-carlo", "dbt"] + }, + "implementation": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object" + } + ] + } + }, + "required": ["engine", "implementation"] + }, + "AuthoritativeDefinitions": { + "type": "array", + "description": "List of links to sources that provide more details on the dataset; examples would be a link to an external definition, a training video, a git repo, data catalog, or another tool. Authoritative definitions follow the same structure in the standard.", + "items": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "url": { + "type": "string", + "description": "URL to the authority." + }, + "type": { + "type": "string", + "description": "Type of definition for authority: v2.3 adds standard values: `businessDefinition`, `transformationImplementation`, `videoTutorial`, `tutorial`, and `implementation`.", + "examples": ["businessDefinition", "transformationImplementation", "videoTutorial", "tutorial", "implementation"] + }, + "description": { + "type": "string", + "description": "Description of the authoritative definition for humans." + } + }, + "required": ["url", "type"], + "additionalProperties": false + } + }, + "Support": { + "type": "array", + "description": "Top level for support channels.", + "items": { + "$ref": "#/$defs/SupportItem" + } + }, + "SupportItem": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "channel": { + "type": "string", + "description": "Channel name or identifier." + }, + "url": { + "type": "string", + "description": "Access URL using normal [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax) (https, mailto, etc.)." + }, + "description": { + "type": "string", + "description": "Description of the channel, free text." 
+ }, + "tool": { + "type": "string", + "description": "Name of the tool, value can be `email`, `slack`, `teams`, `discord`, `ticket`, `googlechat`, or `other`.", + "examples": ["email", "slack", "teams", "discord", "ticket", "googlechat", "other"] + }, + "scope": { + "type": "string", + "description": "Scope can be: `interactive`, `announcements`, `issues`, `notifications`.", + "examples": ["interactive", "announcements", "issues", "notifications"] + }, + "invitationUrl": { + "type": "string", + "description": "Some tools uses invitation URL for requesting or subscribing. Follows the [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax)." + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + }, + "required": ["channel"], + "additionalProperties": false + }, + "Pricing": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "priceAmount": { + "type": "number", + "description": "Subscription price per unit of measure in `priceUnit`." + }, + "priceCurrency": { + "type": "string", + "description": "Currency of the subscription price in `price.priceAmount`." + }, + "priceUnit": { + "type": "string", + "description": "The unit of measure for calculating cost. Examples megabyte, gigabyte." + } + }, + "additionalProperties": false + }, + "TeamMember": { + "type": "object", + "description": "Team member information.", + "additionalProperties": false, + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "username": { + "type": "string", + "description": "The user's username or email." + }, + "name": { + "type": "string", + "description": "The user's name." + }, + "description": { + "type": "string", + "description": "The user's description." + }, + "role": { + "type": "string", + "description": "The user's job role; Examples might be owner, data steward. There is no limit on the role." + }, + "dateIn": { + "type": "string", + "format": "date", + "description": "The date when the user joined the team." + }, + "dateOut": { + "type": "string", + "format": "date", + "description": "The date when the user ceased to be part of the team." + }, + "replacedByUsername": { + "type": "string", + "description": "The username of the user who replaced the previous user." + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "customProperties": { + "type": "array", + "description": "Custom properties block.", + "items": { + "$ref": "#/$defs/CustomProperty" + } + }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + } + }, + "required": ["username"] + }, + "Team": { + "type": "object", + "description": "Team information.", + "additionalProperties": false, + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "name": { + "type": "string", + "description": "Team name." + }, + "description": { + "type": "string", + "description": "Team description." + }, + "members": { + "type": "array", + "description": "List of members.", + "items": { + "$ref": "#/$defs/TeamMember" + } + }, + "tags": { + "$ref": "#/$defs/Tags" + }, + "customProperties": { + "type": "array", + "description": "Custom properties block.", + "items": { + "$ref": "#/$defs/CustomProperty" + } + }, + "authoritativeDefinitions": { + "$ref": "#/$defs/AuthoritativeDefinitions" + } + } + }, + "Role": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "role": { + "type": "string", + "description": "Name of the IAM role that provides access to the dataset." 
+ }, + "description": { + "type": "string", + "description": "Description of the IAM role and its permissions." + }, + "access": { + "type": "string", + "description": "The type of access provided by the IAM role." + }, + "firstLevelApprovers": { + "type": "string", + "description": "The name(s) of the first-level approver(s) of the role." + }, + "secondLevelApprovers": { + "type": "string", + "description": "The name(s) of the second-level approver(s) of the role." + }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + }, + "required": ["role"], + "additionalProperties": false + }, + "ServiceLevelAgreementProperty": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "property": { + "type": "string", + "description": "Specific property in SLA, check the periodic table. May requires units (more details to come)." + }, + "value": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "description": "Agreement value. The label will change based on the property itself." + }, + "valueExt": { + "$ref": "#/$defs/AnyNonCollectionType", + "description": "Extended agreement value. The label will change based on the property itself." + }, + "unit": { + "type": "string", + "description": "**d**, day, days for days; **y**, yr, years for years, etc. Units use the ISO standard." + }, + "element": { + "type": "string", + "description": "Element(s) to check on. Multiple elements should be extremely rare and, if so, separated by commas." + }, + "driver": { + "type": "string", + "description": "Describes the importance of the SLA from the list of: `regulatory`, `analytics`, or `operational`.", + "examples": ["regulatory", "analytics", "operational"] + }, + "description": { + "type": "string", + "description": "Description of the SLA for humans.", + "examples": ["99.9% of the time, data is available by 6 AM UTC"] + }, + "scheduler": { + "type": "string", + "description": "Name of the scheduler, can be cron or any tool your organization support.", + "examples": ["cron"] + }, + "schedule": { + "type": "string", + "description": "Configuration information for the scheduling tool, for cron a possible value is 0 20 * * *.", + "examples": ["0 20 * * *"] + } + }, + "required": ["property", "value"], + "additionalProperties": false + }, + "CustomProperties": { + "type": "array", + "description": "A list of key/value pairs for custom properties.", + "items": { + "$ref": "#/$defs/CustomProperty" + } + }, + "CustomProperty": { + "type": "object", + "properties": { + "id": { + "$ref": "#/$defs/StableId" + }, + "property": { + "type": "string", + "description": "The name of the key. Names should be in camel case–the same as if they were permanent properties in the contract." + }, + "value": { + "$ref": "#/$defs/AnyType", + "description": "The value of the key." + }, + "description": { + "type": "string", + "description": "Description of the custom property." 
+ } + }, + "required": ["property", "value"], + "additionalProperties": false + }, + "AnyType": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "boolean" + }, + { + "type": "null" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "AnyNonCollectionType": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "RelationshipBase": { + "type": "object", + "description": "Base definition for relationships between properties, typically for foreign key constraints.", + "properties": { + "type": { + "type": "string", + "description": "The type of relationship. Defaults to 'foreignKey'.", + "default": "foreignKey", + "enum": ["foreignKey"] + }, + "from": { + "oneOf": [ + { + "anyOf": [ + { + "$ref": "#/$defs/ShorthandReference" + }, + { + "$ref": "#/$defs/FullyQualifiedReference" + } + ], + "description": "Source property reference using fully qualified or shorthand notation." + }, + { + "type": "array", + "description": "Array of source properties for composite keys.", + "items": { + "anyOf": [ + { + "$ref": "#/$defs/ShorthandReference" + }, + { + "$ref": "#/$defs/FullyQualifiedReference" + } + ] + }, + "minItems": 1 + } + ], + "description": "Source property or properties." + }, + "to": { + "oneOf": [ + { + "anyOf": [ + { + "$ref": "#/$defs/ShorthandReference" + }, + { + "$ref": "#/$defs/FullyQualifiedReference" + } + ], + "description": "Target property reference using fully qualified or shorthand notation." + }, + { + "type": "array", + "description": "Array of target properties for composite keys.", + "items": { + "anyOf": [ + { + "$ref": "#/$defs/ShorthandReference" + }, + { + "$ref": "#/$defs/FullyQualifiedReference" + } + ] + }, + "minItems": 1 + } + ], + "description": "Target property or properties to reference." 
+ }, + "customProperties": { + "$ref": "#/$defs/CustomProperties" + } + } + }, + "RelationshipSchemaLevel": { + "type": "object", + "description": "Relationship definition at schema level, requiring both 'from' and 'to' fields with matching types.", + "allOf": [ + { + "$ref": "#/$defs/RelationshipBase" + }, + { + "required": ["from", "to"] + }, + { + "oneOf": [ + { + "description": "Single-column relationship - both fields must be strings", + "properties": { + "from": { + "type": "string" + }, + "to": { + "type": "string" + } + } + }, + { + "description": "Composite key relationship - both fields must be arrays with matching lengths", + "properties": { + "from": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "to": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + } + } + } + ] + } + ], + "unevaluatedProperties": false + }, + "RelationshipPropertyLevel": { + "type": "object", + "description": "Relationship definition at property level, where 'from' is implicitly the current property.", + "allOf": [ + { + "$ref": "#/$defs/RelationshipBase" + }, + { + "type": "object", + "required": ["to"] + }, + { + "not": { + "required": ["from"] + }, + "description": "The 'from' field must not be specified at property level as it is implicitly derived from the property context" + } + ], + "unevaluatedProperties": false + }, + "Relationship": { + "description": "Compatibility wrapper for relationship definitions.", + "oneOf": [ + { + "$ref": "#/$defs/RelationshipSchemaLevel" + }, + { + "$ref": "#/$defs/RelationshipPropertyLevel" + } + ] + } + } +} diff --git a/datacontract/templates/datacontract_odcs.html b/datacontract/templates/datacontract_odcs.html index cd9ab779d..84cf46b56 100644 --- a/datacontract/templates/datacontract_odcs.html +++ b/datacontract/templates/datacontract_odcs.html @@ -1,3 +1,61 @@ +{% macro render_nested_properties(properties, level) %} + + {% for property in properties or [] %} + + +
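+                    {# Render one table row per property: indent by nesting level, mark nested rows with an arrow, and recurse into nested properties/items further down #}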
+ {% for i in range(0,level)%} +   + {% endfor %} + {% if level > 0 %} + ↳ + {% endif %} + {{ property.name or "items" }}
+ {% if property.primaryKey %} + Primary Key + {% endif %} + {% if property.partitioned %} + Partitioned + {% endif %} + {% if property.criticalDataElement %} + Critical + {% endif %} + + {{ + property.businessName or "-" }} + + +
{{ property.logicalType }}
+ {% if property.physicalType and property.physicalType != + property.logicalType %} +
{{ property.physicalType }}
+ {% endif %} + + + {% if property.required %} + Yes + {% else %} + No + {% endif %} + + {{ property.description or "-" + }} + + + {% if property.properties %} + {{render_nested_properties(property.properties, level+1)}} + {% endif %} + {% if property.items %} + {{render_nested_properties([property.items], level+1)}} + {% endif %} + {% endfor %} + + +{% endmacro %} + @@ -12,7 +70,6 @@ -
- {% for schema in datacontract.schema_ %} + {% for schema in datacontract.schema_ or [] %}
@@ -225,10 +282,10 @@

{% if schema.name %} - {{ schema.name }} + {{ schema.name }} {% endif %} {% if schema.physicalName and schema.physicalName != schema.name %} - ({{ schema.physicalName }}) + ({{ schema.physicalName }}) {% endif %} {{ schema.physicalType or schema.logicalType }} @@ -265,45 +322,7 @@

- {% for property in schema.properties %} - - -
{{ property.name }}
- {% if property.primaryKey %} - Primary Key - {% endif %} - {% if property.partitioned %} - Partitioned - {% endif %} - {% if property.criticalDataElement %} - Critical - {% endif %} - - {{ - property.businessName or "-" }} - - -
{{ property.logicalType }}
- {% if property.physicalType and property.physicalType != - property.logicalType %} -
{{ property.physicalType }}
- {% endif %} - - - {% if property.required %} - Yes - {% else %} - No - {% endif %} - - {{ property.description or "-" - }} - - - {% endfor %} + {{ render_nested_properties(schema.properties, 0) }} {% if schema.quality %} @@ -403,6 +422,24 @@

Team

+ {# Check if old array format (deprecated) or new Team object format (v3.1.0+) #} + {% if datacontract.team is sequence and datacontract.team is not string %} + {# Deprecated array format #} + {% set team_members = datacontract.team %} + {% else %} + {# New team format (v3.1.0+): object with name, description, members #} + {% if datacontract.team.name or datacontract.team.description %} +
+ {% if datacontract.team.name %} +
{{ datacontract.team.name }}
+ {% endif %} + {% if datacontract.team.description %} +
{{ datacontract.team.description }}
+ {% endif %} +
+ {% endif %} + {% set team_members = datacontract.team.members or [] %} + {% endif %} @@ -429,7 +466,7 @@

Team

- {% for member in datacontract.team %} + {% for member in team_members %}
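+        {# team_members is normalized above: either the deprecated top-level array or the members list of the v3.1.0+ Team object #}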
{{ member.username }} @@ -663,4 +700,4 @@

Filters

for field_name, field in model.fields.items() %} {{ field_name|lower|e }} {{ field.description|lower|e if field.description else '' }} {% endfor %} + {% endfor %} {% + for tag in contract.spec.tags if contract.spec.tags %} + tag {{ tag|lower|e }} {% endfor %} "> @@ -117,6 +120,15 @@

 {{contract.spec.info.title}} {{contract.spec.info.description}}

+ {% if contract.spec.tags %} +
+
+ {% for tag in contract.spec.tags %} + {{tag}} + {% endfor %} +
+
+ {% endif %}
@@ -185,15 +197,15 @@