Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
11e082f
Automated metadata generation using genAI MVP (#1598)
pelinKuran Oct 1, 2024
6b403be
Refactor metadata generation to use prompt templates and output parser
dlpzx Oct 7, 2024
0882d58
Adjust Prompts of metadata generation
dlpzx Oct 7, 2024
a229ce0
Move prompt files into txt files. Clean-up and add examples
dlpzx Oct 28, 2024
8bfa15f
Alternative way of returning column metadata
dlpzx Oct 29, 2024
933b2ad
Added ResourceThresholdService
dlpzx Oct 31, 2024
2c3bbf9
Merge remote-tracking branch 'origin/main' into feat/automated-metada…
dlpzx Oct 31, 2024
f4e6c64
FE work and Bedrock client cross-region
dlpzx Oct 31, 2024
636e970
Merge branch 'main' into feat/automated-metadata-generation
dlpzx Jun 18, 2025
9322382
merge-conflicts
dlpzx Jun 18, 2025
586fb79
Merge branch 'refs/heads/main' into feat/automated-metadata-generation
dlpzx Jul 17, 2025
12c6e97
GenAI metadata generation - improvements
dlpzx Jul 17, 2025
cfef943
GenAI metadata generation - remove subitem descriptions
dlpzx Jul 22, 2025
ba78df7
GenAI metadata generation - improvements
dlpzx Jul 22, 2025
d138ff8
Merge branch 'main' into feat/automated-metadata-generation
dlpzx Aug 8, 2025
b3cc20f
GenAI metadata generation - resolve conflicts+add guardrails
dlpzx Aug 8, 2025
e1bd092
Merge remote-tracking branch 'origin/main' into feat/automated-metada…
dlpzx Sep 1, 2025
c5bcd75
GenAI metadata generation - improvements
dlpzx Sep 1, 2025
4dfbc88
add resource_threshold
petrkalos Oct 10, 2025
d986e20
apply resource_thresholds
petrkalos Oct 10, 2025
d1486e2
upgrade boto3 across
petrkalos Oct 10, 2025
7f3383d
fix migration filename
petrkalos Oct 13, 2025
32e679a
fix argument name
petrkalos Oct 13, 2025
4449b04
Merge branch 'refs/heads/main' into feat/automated-metadata-generation
petrkalos Oct 15, 2025
310dfe9
reset alembic down_revision
petrkalos Oct 15, 2025
d1aeb13
Merge branch 'main' into feat/automated-metadata-generation
petrkalos Oct 15, 2025
f9a782a
sync alembic revisions
petrkalos Oct 15, 2025
ef3faf8
print db history on PR
petrkalos Oct 15, 2025
cae812c
fix alembic doc format
petrkalos Oct 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/alembic-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,7 @@ jobs:
cache: 'pip'
- name: Drop tables
run: make drop-tables
- name: Show history
run: make history-db
- name: Upgrade tables
run: make upgrade-db
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ upgrade-db: upgrade-pip install-backend
export PYTHONPATH=./backend && \
alembic -c backend/alembic.ini upgrade head

history-db: upgrade-pip install-backend
pip install 'alembic'
export PYTHONPATH=./backend && \
alembic -c backend/alembic.ini history

generate-migrations: upgrade-pip install-backend
pip install 'alembic'
export PYTHONPATH=./backend && \
Expand Down
4 changes: 2 additions & 2 deletions backend/dataall/base/cdkproxy/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
aws-cdk-lib==2.208.0
boto3==1.35.26
boto3-stubs==1.35.26
boto3==1.40.48
boto3-stubs==1.40.48
cdk-nag==2.7.2
fastapi == 0.116.1
PyYAML==6.0.2
Expand Down
23 changes: 23 additions & 0 deletions backend/dataall/base/db/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,26 @@ def __init__(self, action, message):

def __str__(self):
return f'{self.message}'


class ResourceThresholdExceeded(Exception):
def __init__(self, username, action):
self.username = username
self.action = action
self.message = f"""
An error occurred (ResourceThresholdExceeded) when calling {self.action} operation:
Requests exceeded max daily invocation count for User: {self.username}
"""

def __str__(self):
return f'{self.message}'


class ModelGuardrailException(Exception):
def __init__(self, message):
self.message = f"""
An error occurred (ModelGuardrailException) when invoking the model: {message}
"""

def __str__(self):
return f'{self.message}'
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from dataall.base.db import Base, utils
from sqlalchemy import String, Integer, Column, Date
from datetime import date


class ResourceThreshold(Base):
__tablename__ = 'resource_threshold'
actionUri = Column(String(64), primary_key=True, default=utils.uuid('resource_threshold'))
username = Column(String(64), nullable=False)
actionType = Column(String(64), nullable=False)
date = Column(Date, default=date.today, nullable=False)
count = Column(Integer, default=1, nullable=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from dataall.core.resource_threshold.db.resource_threshold_models import ResourceThreshold
from sqlalchemy import and_
from datetime import date


class ResourceThresholdRepository:
@staticmethod
def get_count_today(session, username, action_type):
amount = (
session.query(ResourceThreshold.count)
.filter(
and_(
ResourceThreshold.username == username,
ResourceThreshold.actionType == action_type,
ResourceThreshold.date == date.today(),
)
)
.scalar()
)
return amount if amount else 0

@staticmethod
def add_entry(session, username, action_type):
user_entry = ResourceThresholdRepository._get_user_entry(session, username, action_type)
if user_entry:
session.query(ResourceThreshold).filter(
and_(
ResourceThreshold.username == username,
ResourceThreshold.actionType == action_type,
)
).update({ResourceThreshold.count: 1, ResourceThreshold.date: date.today()}, synchronize_session=False)
session.commit()
else:
action_entry = ResourceThreshold(username=username, actionType=action_type)
session.add(action_entry)
session.commit()

@staticmethod
def increment_count(session, username, action_type):
session.query(ResourceThreshold).filter(
and_(
ResourceThreshold.username == username,
ResourceThreshold.actionType == action_type,
ResourceThreshold.date == date.today(),
)
).update({ResourceThreshold.count: ResourceThreshold.count + 1}, synchronize_session=False)
session.commit()

@staticmethod
def _get_user_entry(session, username, action_type):
entry = (
session.query(ResourceThreshold)
.filter(and_(ResourceThreshold.username == username, ResourceThreshold.actionType == action_type))
.first()
)
return entry
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from dataall.core.resource_threshold.db.resource_threshold_repositories import ResourceThresholdRepository
from dataall.base.db import exceptions
from functools import wraps
from dataall.base.config import config
from dataall.base.context import get_context

import logging

log = logging.getLogger(__name__)


class ResourceThresholdService:
@staticmethod
def check_invocation_count(action_type, max_daily_count_config_path):
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
context = get_context()
with context.db_engine.scoped_session() as session:
count = ResourceThresholdRepository.get_count_today(
session=session, username=context.username, action_type=action_type
)
max_count = config.get_property(max_daily_count_config_path, 10)
log.info(
f'User {context.username} has invoked {action_type} {count} times today of max {max_count}'
)
if count < max_count:
if count == 0:
ResourceThresholdRepository.add_entry(
session=session, username=context.username, action_type=action_type
)
else:
ResourceThresholdRepository.increment_count(
session=session, username=context.username, action_type=action_type
)
return func(*args, **kwargs)
else:
raise exceptions.ResourceThresholdExceeded(username=context.username, action=action_type)

return wrapper

return decorator
9 changes: 9 additions & 0 deletions backend/dataall/modules/s3_datasets/api/dataset/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from dataall.base.api.constants import GraphQLEnumMapper


class MetadataGenerationTargets(GraphQLEnumMapper):
"""Describes the s3_datasets metadata generation targets"""

Table = 'Table'
Folder = 'Folder'
S3_Dataset = 'S3_Dataset'
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
],
)


DatasetPresignedUrlInput = gql.InputType(
name='DatasetPresignedUrlInput',
arguments=[
Expand All @@ -58,6 +59,14 @@

CrawlerInput = gql.InputType(name='CrawlerInput', arguments=[gql.Argument(name='prefix', type=gql.String)])

TableSampleData = gql.InputType(
name='TableSampleData',
arguments=[
gql.Field(name='fields', type=gql.ArrayType(gql.String)),
gql.Field(name='rows', type=gql.ArrayType(gql.String)),
],
)

ImportDatasetInput = gql.InputType(
name='ImportDatasetInput',
arguments=[
Expand Down
19 changes: 14 additions & 5 deletions backend/dataall/modules/s3_datasets/api/dataset/mutations.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
from dataall.base.api import gql
from dataall.modules.s3_datasets.api.dataset.input_types import (
ModifyDatasetInput,
NewDatasetInput,
ImportDatasetInput,
)
from dataall.modules.s3_datasets.api.dataset.input_types import ModifyDatasetInput, NewDatasetInput, ImportDatasetInput
from dataall.modules.s3_datasets.api.dataset.resolvers import (
create_dataset,
update_dataset,
generate_dataset_access_token,
delete_dataset,
import_dataset,
start_crawler,
generate_metadata,
)
from dataall.modules.s3_datasets.api.dataset.enums import MetadataGenerationTargets

createDataset = gql.MutationField(
name='createDataset',
Expand Down Expand Up @@ -68,3 +66,14 @@
resolver=start_crawler,
type=gql.Ref('GlueCrawler'),
)
generateMetadata = gql.MutationField(
name='generateMetadata',
args=[
gql.Argument(name='resourceUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='targetType', type=gql.NonNullableType(MetadataGenerationTargets.toGraphQLEnum())),
gql.Argument(name='metadataTypes', type=gql.NonNullableType(gql.ArrayType(gql.String))),
gql.Argument(name='tableSampleData', type=gql.Ref('TableSampleData')),
],
type=gql.ArrayType(gql.Ref('DatasetMetadata')),
resolver=generate_metadata,
)
10 changes: 10 additions & 0 deletions backend/dataall/modules/s3_datasets/api/dataset/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
get_dataset_assume_role_url,
get_file_upload_presigned_url,
list_datasets_owned_by_env_group,
list_dataset_tables_folders,
)

getDataset = gql.QueryField(
Expand Down Expand Up @@ -45,3 +46,12 @@
resolver=list_datasets_owned_by_env_group,
test_scope='Dataset',
)
listDatasetTablesFolders = gql.QueryField(
name='listDatasetTablesFolders',
args=[
gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='filter', type=gql.Ref('DatasetFilter')),
],
type=gql.Ref('DatasetItemsSearchResult'),
resolver=list_dataset_tables_folders,
)
Loading
Loading