Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:

- name: Lint with Ruff
run: |
poetry run ruff check crategen/
poetry run ruff check crategen/ tests/

- name: Type check with Mypy
run: |
Expand All @@ -39,6 +39,6 @@ jobs:
run: |
poetry add pytest pytest-cov pytest-mock

# - name: Run tests
# run: |
# poetry run pytest --cov=crategen
- name: Run tests
run: |
poetry run pytest --cov=crategen
8 changes: 6 additions & 2 deletions crategen/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
@click.command()
@click.option('--input', prompt='Input file', help='Path to the input JSON file.')
@click.option('--output', prompt='Output file', help='Path to the output JSON file.')
@click.option('--conversion-type', prompt='Conversion type', type=click.Choice(['tes-to-wrroc', 'wes-to-wrroc']), help='Type of conversion to perform.')
@click.option('--conversion-type', prompt='Conversion type', type=click.Choice(['tes-to-wrroc', 'wes-to-wrroc', 'wrroc-to-tes', 'wrroc-to-wes']), help='Type of conversion to perform.')
def cli(input, output, conversion_type):
"""
Command Line Interface for converting TES/WES to WRROC.
Command Line Interface for converting TES/WES to WRROC and vice versa.
"""
manager = ConverterManager()

Expand All @@ -21,6 +21,10 @@ def cli(input, output, conversion_type):
result = manager.convert_tes_to_wrroc(data)
elif conversion_type == 'wes-to-wrroc':
result = manager.convert_wes_to_wrroc(data)
elif conversion_type == 'wrroc-to-tes':
result = manager.convert_wrroc_to_tes(data)
elif conversion_type == 'wrroc-to-wes':
result = manager.convert_wrroc_to_wes(data)

# Save the result to the output JSON file
with open(output, 'w') as output_file:
Expand Down
6 changes: 6 additions & 0 deletions crategen/converter_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,9 @@ def convert_tes_to_wrroc(self, tes_data):

def convert_wes_to_wrroc(self, wes_data):
return self.wes_converter.convert_to_wrroc(wes_data)

def convert_wrroc_to_tes(self, wrroc_data):
return self.tes_converter.convert_from_wrroc(wrroc_data)

def convert_wrroc_to_wes(self, wrroc_data):
return self.wes_converter.convert_from_wrroc(wrroc_data)
Comment on lines +14 to +19
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be expressive enough to call the arg just data in all cases, instead of tes_data, wes_data and wrroc_data? The input "type" is already included in the function names.

Also, please add the following in your next PR for ALL methods/functions (should have already been there, I didn't pay attention during review):

  • Type hints for args and return types (as well as for any global and local vars where the type isn't obvious, i.e., that are not assigned the result of a typed function/method); type hints are not required for test cases.
  • Google-style docstrings - please also add for classes, modules and packages; please also add for test cases.

See our contributor guide for details.

Please do not add it in this PR, otherwise it will take forever to review and merge it. But make sure it is the very next PR, as it is very important to us to have well documented and typed code.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the feedback. I will rename the arguments to data in all cases for consistency. I will also ensure that all methods/functions include type hints and Google-style docstrings in the next PR as per the contributor guide. I understand the importance of well-documented and typed code.

69 changes: 30 additions & 39 deletions crategen/converters/tes_converter.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,43 @@
from pydantic import ValidationError
from .abstract_converter import AbstractConverter
from .utils import convert_to_iso8601
from ..models import TESData
from ..validators import validate_wrroc_tes

class TESConverter(AbstractConverter):

def convert_to_wrroc(self, tes_data):
# Validate and extract data with defaults
id = tes_data.get("id", "")
name = tes_data.get("name", "")
description = tes_data.get("description", "")
executors = tes_data.get("executors", [{}])
inputs = tes_data.get("inputs", [])
outputs = tes_data.get("outputs", [])
creation_time = tes_data.get("creation_time", "")
end_time = tes_data.get("logs", [{}])[0].get("end_time", "") # Corrected to fetch from logs
def convert_to_wrroc(self, tes_data: dict) -> dict:
try:
validated_tes_data = TESData(**tes_data)
except ValidationError as e:
raise ValueError(f"Invalid TES data: {e}")

# Convert to WRROC
wrroc_data = {
"@id": id,
"name": name,
"description": description,
"instrument": executors[0].get("image", None) if executors else None,
"object": [{"@id": input.get("url", ""), "name": input.get("path", "")} for input in inputs],
"result": [{"@id": output.get("url", ""), "name": output.get("path", "")} for output in outputs],
"startTime": convert_to_iso8601(creation_time),
"endTime": convert_to_iso8601(end_time),
"@id": validated_tes_data.id,
"name": validated_tes_data.name,
"description": validated_tes_data.description,
"instrument": validated_tes_data.executors[0].image if validated_tes_data.executors else None,
"object": [{"@id": input.url, "name": input.path} for input in validated_tes_data.inputs],
"result": [{"@id": output.url, "name": output.path} for output in validated_tes_data.outputs],
"startTime": convert_to_iso8601(validated_tes_data.creation_time),
"endTime": convert_to_iso8601(validated_tes_data.logs[0].end_time) if validated_tes_data.logs else None,
}
return wrroc_data

def convert_from_wrroc(self, wrroc_data):
# Validate and extract data with defaults
id = wrroc_data.get("@id", "")
name = wrroc_data.get("name", "")
description = wrroc_data.get("description", "")
instrument = wrroc_data.get("instrument", "")
object_data = wrroc_data.get("object", [])
result_data = wrroc_data.get("result", [])
start_time = wrroc_data.get("startTime", "")
end_time = wrroc_data.get("endTime", "")
def convert_from_wrroc(self, data: dict) -> dict:
try:
data_validated = validate_wrroc_tes(data)
except ValidationError as e:
raise ValueError(f"Invalid WRROC data: {e}")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In such cases, it is good practice to chain exceptions, i.e., instead of

except Exception as exc:
  raise Exception(f"My own text: {exc}")

do

except Exception as exc:
  raise Exception("My own text") from exc

Please address this in a future PR for all these cases (but not in this PR).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will address exception chaining in all relevant cases in a future PR to follow best practices as you suggested.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. Please create an issue for that and then close this conversation.


# Convert from WRROC to TES
tes_data = {
"id": id,
"name": name,
"description": description,
"executors": [{"image": instrument}],
"inputs": [{"url": obj.get("@id", ""), "path": obj.get("name", "")} for obj in object_data],
"outputs": [{"url": res.get("@id", ""), "path": res.get("name", "")} for res in result_data],
"creation_time": start_time,
"logs": [{"end_time": end_time}], # Added to logs
"id": data_validated.id,
"name": data_validated.name,
"description": data_validated.description,
"executors": [{"image": data_validated.instrument}],
"inputs": [{"url": obj.id, "path": obj.name} for obj in data_validated.object],
"outputs": [{"url": res.id, "path": res.name} for res in data_validated.result],
"creation_time": data_validated.startTime,
"logs": [{"end_time": data_validated.endTime}],
}
return tes_data
return tes_data
61 changes: 29 additions & 32 deletions crategen/converters/wes_converter.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,43 @@
from pydantic import ValidationError
from .abstract_converter import AbstractConverter
from ..models import WESData
from .utils import convert_to_iso8601
from ..validators import validate_wrroc_wes

class WESConverter(AbstractConverter):

def convert_to_wrroc(self, wes_data):
# Validate and extract data with defaults
run_id = wes_data.get("run_id", "")
name = wes_data.get("run_log", {}).get("name", "")
state = wes_data.get("state", "")
start_time = wes_data.get("run_log", {}).get("start_time", "")
end_time = wes_data.get("run_log", {}).get("end_time", "")
outputs = wes_data.get("outputs", {})
def convert_to_wrroc(self, wes_data: dict) -> dict:
try:
wes_model = WESData(**wes_data)
except ValidationError as e:
raise ValueError(f"Invalid WES data: {e}")
outputs = wes_model.outputs

# Convert to WRROC
wrroc_data = {
"@id": run_id,
"name": name,
"status": state,
"startTime": convert_to_iso8601(start_time),
"endTime": convert_to_iso8601(end_time),
"result": [{"@id": output.get("location", ""), "name": output.get("name", "")} for output in outputs],
"@id": wes_model.run_id,
"name": wes_model.run_log.name,
"status": wes_model.state,
"startTime": convert_to_iso8601(wes_model.run_log.start_time),
"endTime": convert_to_iso8601(wes_model.run_log.end_time),
"result": [{"@id": output.location, "name": output.name} for output in outputs],
}
return wrroc_data

def convert_from_wrroc(self, wrroc_data):
# Validate and extract data with defaults
run_id = wrroc_data.get("@id", "")
name = wrroc_data.get("name", "")
start_time = wrroc_data.get("startTime", "")
end_time = wrroc_data.get("endTime", "")
state = wrroc_data.get("status", "")
result_data = wrroc_data.get("result", [])

# Convert from WRROC to WES

def convert_from_wrroc(self, data: dict) -> dict:
try:
data_validated = validate_wrroc_wes(data)
except ValidationError as e:
raise ValueError(f"Invalid WRROC data: {e}")

wes_data = {
"run_id": run_id,
"run_id": data_validated.id,
"run_log": {
"name": name,
"start_time": start_time,
"end_time": end_time,
"name": data_validated.name,
"start_time": data_validated.startTime,
"end_time": data_validated.endTime,
},
"state": state,
"outputs": [{"location": res.get("@id", ""), "name": res.get("name", "")} for res in result_data],
"state": data_validated.status,
"outputs": [{"location": res.id, "name": res.name} for res in data_validated.result],
}
return wes_data
return wes_data
105 changes: 105 additions & 0 deletions crategen/models.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When running the tool with WRROC data as input, you probably want to first validate that the input is indeed a valid WRROC entity. For that, you could use a model WRROCData (or just WRROC). This validation would be useful whenever you deal with WRROC data, e.g., in the convert_from_wrroc() methods of both the WES and TES converters. So you could consider writing an external WRROC validator function (perhaps in crategen.validators?) that you then import in the converters, so that you do not need to repeat code.

In a next step, you then want to validate that the WRROC data has all the fields required for the requested conversion. This code is specific to the converter, i.e., you may need different data for the conversion to TES than you need for the conversion to WES. For that you could define models WRROCDataTES and WRROCDataWES.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will implement an external WRROC validator function to ensure WRROC data validity and avoid code repetition. Additionally, I will define separate models for WRROCDataTES and WRROCDataWES for specific validation in the respective converters.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please double check your WRROCData, WRROCDataTES and WRROCDataWES models according to the mapping you did at the beginning of the project.

  • We want to use WRROCData to validate any WRROC entity. However, given that there are three different profiles (Process Run, Workflow Run, Provenance Run), we want to have three different models. Luckily, given that these profiles are hierarchical (with Process Run being the least and Provenance Run being the most complex), we can make use of inheritance to avoid repeating ourselves. So you can start with the least complex one (WRROCProcess) and include all of its defined properties. None of its optional and all of its required properties should be required in your model. Additional properties should be allowed (see comment further below). Then go on with the next (WRROCWorkflow) and inherit from WRROCProcess. Only add additional properties defined for the Workflow Run profile that are not defined for Process Run (because the ones defined there will be inherited). If there are properties that are required for Workflow Run that were only optional for Process Run, override them. Finally, define WRROCProvenance in the same way, this time inheriting from WRROCWorkflow. Having these models, you can then write a validator that not only checks if a given object is a WRROC entity, but it will also tell you which profile(s) it adheres to. You can do so by first validating against WRROCProvenance. If it passes, you know that (1) the object is a WRROC entity and (2) it adheres to all profiles. If it does not pass, you validate against WRROCWorkflow. If that passes, you know that (1) the object is a WRROC entity and (2) it adheres to the Workflow Run and Process Run profiles. If it does not pass, you validate against WRROCProcess. If that passes, you know that (1) the object is a WRROC entity and (2) it adheres only to the Process Run profile. It it does not pass, you know that the object is not a WRROC entity. Having these models will be very useful to validate the input and also to automatically detect which conversions are possible.
  • WRROCDataWES and WRROCDataTES should define ONLY those WRROC properties that are required to extract a WES and TES request from, respectively. We will use this to check whether we can proceed with the WRROC to WES conversion. Additional properties should be allowed (see comment further below). Given that both WRROCDataWES and WRROCDataTES are strict subsets of the core WRROC models above with respect to their properties (that is, e.g., all of the properties of WRROCDataWES will be defined somewhere in either of the WRROC profile models), you could try to avoid repetitions by using one of the approaches discussed here. However, if you do that, consider that some of the properties could be required for WRROCDataWES and WRROCDataTES, but are only optional in the WRROC profiles.

I think the models and the corresponding validators (including unit tests etc.) should be a single PR.

Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from pydantic import BaseModel
from typing import Optional

class Executor(BaseModel):
image: str
command: list[str]

class TESInputs(BaseModel):
url: str
path: str

class TESOutputs(BaseModel):
url: str
path: str

class TESLogs(BaseModel):
end_time: Optional[str] = None

class TESData(BaseModel):
id: str
name: str
description: Optional[str] = ""
executors: list[Executor]
inputs: list[TESInputs]
outputs: list[TESOutputs]
creation_time: str
logs: list[TESLogs]

class Config:
extra = "forbid"

class WESRunLog(BaseModel):
name: Optional[str] = None
start_time: Optional[str] = None
end_time: Optional[str] = None
cmd: Optional[list[str]] = None
stdout: Optional[str] = None
stderr: Optional[str] = None
exit_code: Optional[int] = None

class WESOutputs(BaseModel):
location: str
name: str

class WESRequest(BaseModel):
workflow_params: dict[str, str]
workflow_type: str
workflow_type_version: str
tags: Optional[dict[str, str]] = None

class WESData(BaseModel):
run_id: str
request: WESRequest
state: str
run_log: WESRunLog
task_logs: Optional[list[WESRunLog]] = None
outputs: list[WESOutputs]

class Config:
extra = "forbid"

class WRROCInputs(BaseModel):
id: str
name: str

class WRROCOutputs(BaseModel):
id: str
name: str

class WRROCData(BaseModel):
id: str
name: str
description: Optional[str] = ""
instrument: Optional[str] = None
object: list[WRROCInputs]
result: list[WRROCOutputs]
startTime: Optional[str] = None
endTime: Optional[str] = None

class Config:
extra = "forbid"

class WRROCDataTES(BaseModel):
id: str
name: str
description: Optional[str] = ""
instrument: Optional[str] = None
object: list[WRROCInputs]
result: list[WRROCOutputs]
startTime: Optional[str] = None
endTime: Optional[str] = None

class Config:
extra = "forbid"

class WRROCDataWES(BaseModel):
id: str
name: str
status: str
result: list[WRROCOutputs]
startTime: Optional[str] = None
endTime: Optional[str] = None

class Config:
extra = "forbid"
56 changes: 56 additions & 0 deletions crategen/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from pydantic import ValidationError
from .models import WRROCData, WRROCDataTES, WRROCDataWES

def validate_wrroc(data: dict) -> WRROCData:
"""
Validate that the input data is a valid WRROC entity.

Args:
data (dict): The input data to validate.

Returns:
WRROCData: The validated WRROC data.

Raises:
ValueError: If the data is not valid WRROC data.
"""
try:
return WRROCData(**data)
except ValidationError as e:
raise ValueError(f"Invalid WRROC data: {e}")
Comment on lines +4 to +20
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't currently seem to be used. If you want (or need) to include code to filter out additional properties not needed by WES/TES, you also need to put that code somewhere. Two options:

  1. You add the code that validates the WRROC before you pass it to the appropriate handler (WES or TES). If you do the filtering, you want to include that logic in the specific handler (WES or TES) - or you put that logic in the validators themselves. I tend to prefer this option.
  2. You call the validator within the validate_wrroc_wes() and validate_wrroc_tes() validators before you do the TES-/WES-specific validations. In that case any filtering logic, if necessary, needs to be also put in the specific validators, after the general WRROC validation and before the TES-/WES-specific validation.

If you do need to filter out additional properties, please try to abstract and write a function for that where you somehow pass the properties that you want to keep.

Copy link
Member

@uniqueg uniqueg Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An additional note about the additional properties: I really don't think we need to filter those out - they don't hurt us. We should only validate what we need to validate, and that is that everything we need is available and of the correct/expected type.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please work on the models first (see comments below and above), then rewrite your validators to account for what is written in those comments (especially the one below). But as I said, please do so in a clean, fresh PR that only changes/adds the 3 mentioned files. Make sure the code adheres to all good standards (docstrings, typing, error chaining etc), but only for the new code - the old code will be changed later.


def validate_wrroc_tes(data: dict) -> WRROCDataTES:
"""
Validate that the input data is a valid WRROC entity for TES.

Args:
data (dict): The input data to validate.

Returns:
WRROCDataTES: The validated WRROC data for TES.

Raises:
ValueError: If the data is not valid WRROC data for TES.
"""
try:
return WRROCDataTES(**data)
except ValidationError as e:
raise ValueError(f"Invalid WRROC data: {e}")

def validate_wrroc_wes(data: dict) -> WRROCDataWES:
"""
Validate that the input data is a valid WRROC entity for WES.

Args:
data (dict): The input data to validate.

Returns:
WRROCDataWES: The validated WRROC data for WES.

Raises:
ValueError: If the data is not valid WRROC data for WES.
"""
try:
return WRROCDataWES(**data)
except ValidationError as e:
raise ValueError(f"Invalid WRROC data: {e}")
Loading