From bea8104f84f18fa22a2a7d077c07f612f276db59 Mon Sep 17 00:00:00 2001 From: Anders Westrheim Date: Thu, 10 Apr 2025 14:46:30 +0200 Subject: [PATCH 1/4] added readme, changed version number of whl file --- .github/workflows/ci.yml | 2 +- README.md | 92 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 72f7f6b..ca762c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,7 +84,7 @@ jobs: run: | # Find the .whl file using a wildcard and rename it WHL_FILE=$(find . -type f -name "*.whl" -print -quit) - mv "$WHL_FILE" dist/dataworkbench-latest-py3-none-any.whl + mv "$WHL_FILE" dist/dataworkbench-1.0-py3-none-any.whl - name: Log package version number if: github.event_name == 'push' diff --git a/README.md b/README.md index 9506790..bf1d4d3 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,100 @@ | | | | --- | --- | | Testing | [![CI](https://github.com/veracity/DataWorkbench/actions/workflows/ci.yml/badge.svg)](https://github.com/veracity/DataWorkbench/actions/workflows/ci.yml) | -| + + +# DataWorkbench ## What is it? +Veracity DataWorkbench is a Python SDK designed to bridge your Python environment with Veracity DataWorkbench services. It simplifies access to data cataloging, lineage tracking, and APIs — supporting efficient data workflows across local and cloud environments such as Databricks. -## Table of Contents +## Table of Contents - [Features](#features) +- [Installation](#installation) +- [How to use it](#how-to-use-it) +- [Configuration](#configuration) +- [Examples](#examples) +- [API Reference](#api-reference) +- [Contributing](#contributing) +- [License](#license) ## Features +- **DataCatalogue**: Register and manage datasets in the Veracity Data Workbench Data Catalogue. + +## Installation +This package is pre-installed in Veracity-hosted Databricks environments (if analytics features are enabled). 
+ +To install the latest version locally: + +```sh +pip install https://github.com/veracity/DataWorkbench/releases/latest/download/dataworkbench-1.0-py3-none-any.whl +``` +Make sure you have the required credentials and environment variables set when running outside Databricks. + + +## How to use it +In Veracity-hosted Databricks, the SDK is ready to use: + +```python +import dataworkbench +``` + +To use it on your local machine, you need to set a few environment variables so the SDK can connect to the Veracity Dataworkbench API. + +### Basic Example + +```python +from dataworkbench import DataCatalogue + +df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["letter", "number"]) + +datacatalogue = DataCatalogue() # Naming subject to change +datacatalogue.save(df, "Dataset Name", "Description", tags={"environment": ["test"]}) +``` + +## Configuration + +When using Dataworkbench locally, you need to configure the following environment variables: + +```python +# Required for local machine setup +import os + +os.environ["ApimClientId"] = "your-apim-client-id" +os.environ["ApimClientSecret"] = "your-apim-client-secret" +os.environ["ApimScope"] = "your-apim-scope" +``` + +Alternatively, create a `.env` file or use a configuration file: + +``` +# .env file example +ApimClientId=your-apim-client-id +ApimClientSecret=your-apim-client-secret +ApimScope=your-apim-scope +``` + +## Examples + +### Saving a Spark DataFrame to the Data Catalogue + +```python +from dataworkbench import DataCatalogue + +df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["letter", "number"]) + +datacatalogue = DataCatalogue() # Naming subject to change +datacatalogue.save(df, "Dataset Name", "Description", tags={"environment": ["test"]}) +``` + +## API Reference + +### DataCatalogue + +- `save(df, name, description=None, tags=None)`: Save a Spark DataFrame to the Data Workbench Data Catalogue. + + +## License + +Dataworkbench is licensed under [WHICH LICENSE](LICENSE). 
From a71fd05ba06601944c258d78dac5532c0702232e Mon Sep 17 00:00:00 2001 From: Anders Westrheim Date: Thu, 10 Apr 2025 14:55:11 +0200 Subject: [PATCH 2/4] testing coverage report --- .github/workflows/ci.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ca762c7..b1c95d7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,6 +41,13 @@ jobs: run: | pytest --junitxml=test-results-${{ matrix.python-version }}.xml --cov=dataworkbench --cov-report=xml tests/ + - name: Get Coverage report + uses: orgoro/coverage@v3.2 + with: + coverageFile: coverage.xml + token: ${{ secrets.GITHUB_TOKEN }} + if: ${{ always() }} + # Step to upload the test results as an artifact - name: Upload test results as artifact uses: actions/upload-artifact@v4 @@ -49,13 +56,6 @@ jobs: path: test-results-${{ matrix.python-version }}.xml if: ${{ always() }} - # Step to upload the coverage report as an artifact - - name: Upload coverage report as artifact - uses: actions/upload-artifact@v4 - with: - name: coverage-report-${{ matrix.python-version }} - path: coverage.xml - if: ${{ always() }} From b4751d943a050506ae1868c481993190ae59a51c Mon Sep 17 00:00:00 2001 From: Anders Westrheim Date: Thu, 10 Apr 2025 15:01:32 +0200 Subject: [PATCH 3/4] comment out the coverage report for now --- .github/workflows/ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b1c95d7..e436cfb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,12 +41,12 @@ jobs: run: | pytest --junitxml=test-results-${{ matrix.python-version }}.xml --cov=dataworkbench --cov-report=xml tests/ - - name: Get Coverage report - uses: orgoro/coverage@v3.2 - with: - coverageFile: coverage.xml - token: ${{ secrets.GITHUB_TOKEN }} - if: ${{ always() }} + # - name: Get Coverage report + # uses: orgoro/coverage@v3.2 + # with: 
+ # coverageFile: coverage.xml + # token: ${{ secrets.GITHUB_TOKEN }} + # if: ${{ always() }} # Step to upload the test results as an artifact - name: Upload test results as artifact From b05c88299560374fc0c8795e337fb1d9a90f8b98 Mon Sep 17 00:00:00 2001 From: Anders Westrheim Date: Thu, 10 Apr 2025 15:53:24 +0200 Subject: [PATCH 4/4] Fix typing --- src/dataworkbench/datacatalogue.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dataworkbench/datacatalogue.py b/src/dataworkbench/datacatalogue.py index 5cb729f..befcc6a 100644 --- a/src/dataworkbench/datacatalogue.py +++ b/src/dataworkbench/datacatalogue.py @@ -39,7 +39,7 @@ def __init__(self) -> None: self.gateway: Gateway = Gateway() self.storage_base_url: str = get_secret("StorageBaseUrl") - def __build_storage_url(self, folder_id: str) -> str: + def __build_storage_url(self, folder_id: uuid.UUID) -> str: """ Build the ABFSS URL for the target storage location. @@ -53,8 +53,8 @@ def __build_storage_url(self, folder_id: str) -> str: >>> catalogue = DataCatalogue() >>> catalogue._build_storage_url("abc123") """ - if not isinstance(folder_id, str): - raise TypeError("folder_id must be a string") + if not isinstance(folder_id, uuid.UUID): + raise TypeError("folder_id must be uuid") if not folder_id: raise ValueError("folder_id cannot be empty") @@ -116,7 +116,7 @@ def save( raise TypeError("tags must be a dictionary or None") # Generate folder_id - folder_id = str(uuid.uuid4()) + folder_id = uuid.uuid4() target_path = self.__build_storage_url(folder_id)