diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index cea1f9c6..5415aa89 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -9,8 +9,8 @@ The architecture establishes clear layers with controlled dependencies: │ CLI Commands │ │ (validate, run, etc.) │ ├─────────────────────────────────────────────────────────────┤ -│ Application Facade │ -│ (QTypeFacade + Services) │ +│ Application │ +│ (Services) │ ├─────────────────────────────────────────────────────────────┤ │ Interpreter │ │ (execution engine) │ diff --git a/common/aws.bedrock.models.qtype.yaml b/common/aws.bedrock.models.qtype.yaml index e5a23fda..6242c9cf 100644 --- a/common/aws.bedrock.models.qtype.yaml +++ b/common/aws.bedrock.models.qtype.yaml @@ -142,10 +142,6 @@ provider: aws-bedrock - id: amazon.titan-embed-g1-text-02 provider: aws-bedrock -- id: amazon.titan-text-express-v1:0:8k - provider: aws-bedrock -- id: amazon.titan-text-express-v1 - provider: aws-bedrock - id: amazon.titan-embed-text-v1:2:8k provider: aws-bedrock - id: amazon.titan-embed-text-v1 diff --git a/common/tools.qtype.yaml b/common/tools.qtype.yaml index 06c6d817..17c1d3a4 100644 --- a/common/tools.qtype.yaml +++ b/common/tools.qtype.yaml @@ -11,124 +11,102 @@ tools: function_name: base64_decode id: qtype.application.commons.tools.base64_decode inputs: - data: - optional: false - type: text + - id: data + type: text module_path: qtype.application.commons.tools name: base64_decode outputs: - result: - optional: false - type: file + - id: base64_decode_result + type: file type: PythonFunctionTool - description: Encode bytes to a Base64 string. function_name: base64_encode id: qtype.application.commons.tools.base64_encode inputs: - data: - optional: false - type: file + - id: data + type: file module_path: qtype.application.commons.tools name: base64_encode outputs: - result: - optional: false - type: text + - id: base64_encode_result + type: text type: PythonFunctionTool - description: Calculate the difference between two timestamps. function_name: calculate_time_difference id: qtype.application.commons.tools.calculate_time_difference inputs: - end_time: - optional: false - type: datetime - start_time: - optional: false - type: datetime + - id: start_time + type: datetime + - id: end_time + type: datetime module_path: qtype.application.commons.tools name: calculate_time_difference outputs: - result: - optional: false - type: TimeDifferenceResultType + - id: calculate_time_difference_result + type: TimeDifferenceResultType type: PythonFunctionTool - description: Format a timestamp using a custom format string that can be passed to strftime. function_name: format_datetime id: qtype.application.commons.tools.format_datetime inputs: - format_string: - optional: false - type: text - timestamp: - optional: false - type: datetime + - id: timestamp + type: datetime + - id: format_string + type: text module_path: qtype.application.commons.tools name: format_datetime outputs: - result: - optional: false - type: text + - id: format_datetime_result + type: text type: PythonFunctionTool - description: Get the current UTC timestamp. 
function_name: get_current_timestamp id: qtype.application.commons.tools.get_current_timestamp - inputs: {} + inputs: [] module_path: qtype.application.commons.tools name: get_current_timestamp outputs: - result: - optional: false - type: datetime + - id: get_current_timestamp_result + type: datetime type: PythonFunctionTool - description: Parse a human-readable duration string into seconds. function_name: parse_duration_string id: qtype.application.commons.tools.parse_duration_string inputs: - duration: - optional: false - type: text + - id: duration + type: text module_path: qtype.application.commons.tools name: parse_duration_string outputs: - result: - optional: false - type: int + - id: parse_duration_string_result + type: int type: PythonFunctionTool - description: Add a specified amount of time from a given timestamp. function_name: timedelta id: qtype.application.commons.tools.timedelta inputs: - days: - optional: true - type: int - hours: - optional: true - type: int - microseconds: - optional: true - type: int - milliseconds: - optional: true - type: int - minutes: - optional: true - type: int - seconds: - optional: true - type: int - timestamp: - optional: false - type: datetime - weeks: - optional: true - type: int + - id: timestamp + type: datetime + - id: days + type: int? + - id: seconds + type: int? + - id: microseconds + type: int? + - id: milliseconds + type: int? + - id: minutes + type: int? + - id: hours + type: int? + - id: weeks + type: int? module_path: qtype.application.commons.tools name: timedelta outputs: - result: - optional: false - type: datetime + - id: timedelta_result + type: datetime type: PythonFunctionTool types: - description: Custom type for TimeDifferenceResultType diff --git a/docs/How To/Command Line Usage/load_multiple_inputs_from_files.md b/docs/How To/Command Line Usage/load_multiple_inputs_from_files.md index d869b010..a189f044 100644 --- a/docs/How To/Command Line Usage/load_multiple_inputs_from_files.md +++ b/docs/How To/Command Line Usage/load_multiple_inputs_from_files.md @@ -10,8 +10,8 @@ qtype run app.qtype.yaml --input-file inputs.csv ### Supported File Formats -- **CSV**: Columns map to input variable names -- **JSON**: Array of objects or records format +- **CSV**: Columns map to input variable names (best for primitive types) +- **JSON**: Array of objects or records format (best for nested/complex types) - **Parquet**: Efficient columnar format for large datasets - **Excel**: `.xlsx` or `.xls` files @@ -19,10 +19,25 @@ qtype run app.qtype.yaml --input-file inputs.csv When you provide `--input-file`, QType: 1. Reads the file into a pandas DataFrame -2. Each row becomes one execution of the flow -3. Column names must match flow input variable IDs -4. Processes rows with configured concurrency -5. Returns results as a DataFrame (can be saved with `--output`) +2. Automatically converts data to match input variable types +3. Each row becomes one execution of the flow +4. Column names must match flow input variable IDs +5. Processes rows with configured concurrency +6. 
Returns results as a DataFrame (can be saved with `--output`) + +### Type Conversion + +QType automatically converts file data to match your flow's input types: + +- **Primitive types** (`int`, `float`, `bool`, `text`): Converted from file values +- **Custom types**: Validated and instantiated from dict/object columns (use JSON format) +- **Domain types**: Built-in types like `ChatMessage` or `SearchResult` (use JSON format) + +**Format Selection Guide:** + +- Use **CSV** for simple data with primitive types (strings, numbers, booleans) +- Use **JSON** for complex data with custom types, nested objects, or domain types +- Use **Parquet** for large datasets with mixed types and efficient storage ## Complete Example diff --git a/docs/How To/Data Processing/read_data_from_files.md b/docs/How To/Data Processing/read_data_from_files.md index a0c52e42..fc14f383 100644 --- a/docs/How To/Data Processing/read_data_from_files.md +++ b/docs/How To/Data Processing/read_data_from_files.md @@ -1,6 +1,6 @@ # Read Data from Files -Load structured data from files using FileSource, which supports CSV, JSON, JSONL, and Parquet formats with automatic format detection based on file extension. +Load structured data from files using FileSource, which supports CSV, JSON, JSONL, and Parquet formats with automatic format detection and type conversion. ### QType YAML @@ -20,8 +20,34 @@ steps: - **path**: File path (relative to YAML file or absolute), supports local files and cloud storage (s3://, gs://, etc.) - **outputs**: Column names from the file to extract as variables (must match actual column names) - **Format detection**: Automatically determined by file extension (.csv, .json, .jsonl, .parquet) +- **Type conversion**: Automatically converts data to match variable types (primitives, domain types, custom types) - **Streaming**: Emits one FlowMessage per row, enabling downstream steps to process data in parallel +### Automatic Type Conversion + +FileSource automatically converts data from files to match your variable types: + +- **Primitive types** (`int`, `float`, `bool`, `text`): Direct conversion from file data +- **Domain types** (`ChatMessage`, `SearchResult`, etc.): Validated from dict/object columns +- **Custom types**: Your defined types are validated and instantiated from dict/object columns + +**Format Recommendations:** + +- **CSV**: Best for simple primitive types (strings, numbers, booleans) +- **JSON/JSONL**: Recommended for nested objects, custom types, and domain types +- **Parquet**: Best for large datasets with mixed types and efficient storage + +**Example with Custom Types (JSON format):** + +```json +[ + {"person": {"name": "Alice", "age": 30}, "score": 95}, + {"person": {"name": "Bob", "age": 25}, "score": 87} +] +``` + +JSON preserves nested objects, making it ideal for complex types. CSV stores everything as strings, requiring nested objects to be serialized as JSON strings within the CSV. + ## Complete Example ```yaml diff --git a/docs/How To/Language Features/use_optional_variables.md b/docs/How To/Language Features/use_optional_variables.md new file mode 100644 index 00000000..7184bee0 --- /dev/null +++ b/docs/How To/Language Features/use_optional_variables.md @@ -0,0 +1,41 @@ +# Use Optional Variables + +Mark variables as optional to handle cases where data may be missing or unset, allowing your flow to continue gracefully instead of failing. + +### QType YAML + +```yaml +variables: + - id: email + type: text? 
# Optional text variable +``` + +### Explanation + +- **`?` suffix**: Shorthand syntax to mark a variable as optional +- **Optional variables**: Can be `None` or set to a value +- **FieldExtractor**: Returns `None` for optional output variables when JSONPath finds no matches, instead of raising an error. If you make the variable non-optional, it will raise an error. + +## Complete Example + +```yaml +--8<-- "../examples/language_features/optional_variables.qtype.yaml" +``` + +**Run it:** +```bash +# When email field exists +qtype run examples/language_features/optional_variables.qtype.yaml -i '{"user_profile": {"email":"hello@domain.com"}}' +# Results: +# email: hello@domain.com + +# When email field is missing +qtype run examples/language_features/optional_variables.qtype.yaml -i '{"user_profile": "just text"}' +# Results: +# email: None +``` + +## See Also + +- [Variable Reference](../../components/Variable.md) +- [FieldExtractor Reference](../../components/FieldExtractor.md) diff --git a/docs/How To/Tools & Integration/create_tools_from_openapi_specifications.md b/docs/How To/Tools & Integration/create_tools_from_openapi_specifications.md index 5465286b..a3442ad1 100644 --- a/docs/How To/Tools & Integration/create_tools_from_openapi_specifications.md +++ b/docs/How To/Tools & Integration/create_tools_from_openapi_specifications.md @@ -31,20 +31,17 @@ tools: method: GET endpoint: /api/v3/pet/{petId} auth: swagger-petstore---openapi-30_api_key_api_key - parameters: - petId: + inputs: + - id: petId type: int - optional: false outputs: - id: - type: int - optional: true - name: - type: text - optional: false - status: + - id: id + type: int? + - id: name type: text - optional: true + - id: status + type: text? + parameters: [] ``` ### Explanation diff --git a/docs/How To/Tools & Integration/create_tools_from_python_modules.md b/docs/How To/Tools & Integration/create_tools_from_python_modules.md index 3291ad1b..bc3f670b 100644 --- a/docs/How To/Tools & Integration/create_tools_from_python_modules.md +++ b/docs/How To/Tools & Integration/create_tools_from_python_modules.md @@ -42,16 +42,13 @@ tools: module_path: myapp.utils name: calculate_age inputs: - birth_date: + - id: birth_date type: datetime - optional: false - reference_date: + - id: reference_date type: datetime - optional: false outputs: - result: + - id: calculate_age_result type: int - optional: false ``` ### Explanation diff --git a/docs/Tutorials/03-structured-data.md b/docs/Tutorials/03-structured-data.md index eaac4615..56eb8578 100644 --- a/docs/Tutorials/03-structured-data.md +++ b/docs/Tutorials/03-structured-data.md @@ -478,4 +478,4 @@ A: Not directly. Decoder maps JSON fields to individual outputs. If you need the A: Use **Decoder** when you have a JSON/XML string to parse. Use **FieldExtractor** when you already have structured data and need to extract specific fields using JSONPath (covered in advanced tutorials). **Q: Can I make properties optional?** -A: Currently all properties are required. For optional fields, you can define them in your flow logic but not include them in the Construct step. +A: Yes! Mark variables as optional using the `?` suffix (e.g., `type: text?`). Optional variables can be unset, `None`, or have a value. This is useful when extracting fields that may not always be present. See [Use Optional Variables](../How%20To/Language%20Features/use_optional_variables.md). 
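
The `?` shorthand round-trips through the DSL model itself: the before-validator strips the suffix and sets `optional=True`, and the custom serializer folds it back into the type string. A minimal sketch of that behavior, assuming the updated `Variable` model (see the `qtype/dsl/model.py` hunks below) resolves primitive type strings without any custom-type context:

```python
from qtype.dsl.model import Variable

# Parsing: the model validator strips the trailing '?' and sets
# optional=True before the type string is resolved.
email = Variable.model_validate({"id": "email", "type": "text?"})
assert email.optional is True

# Serializing: the serializer re-adds the '?' suffix, so the
# shorthand survives a load/dump round trip.
assert email.model_dump() == {"id": "email", "type": "text?"}

# A required variable serializes without the suffix.
name = Variable.model_validate({"id": "name", "type": "text"})
assert name.model_dump() == {"id": "name", "type": "text"}
```
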
diff --git a/docs/Tutorials/04-tools-and-function-calling.md b/docs/Tutorials/04-tools-and-function-calling.md index 6e26599c..08449d1f 100644 --- a/docs/Tutorials/04-tools-and-function-calling.md +++ b/docs/Tutorials/04-tools-and-function-calling.md @@ -74,7 +74,7 @@ Add a `references:` section before `flows:`: ```yaml # Import pre-built tools from the commons library references: - - !include https://raw.githubusercontent.com/bazaarvoice/qtype/refs/tags/v0.1.11/common/tools.qtype.yaml + - !include ../../common/tools.qtype.yaml ``` **What this means:** @@ -258,13 +258,12 @@ In this case: The `timedelta` tool definition (from commons library) looks like: ```yaml inputs: - timestamp: + - id: timestamp type: datetime - days: - type: int - hours: - type: int - optional: true + - id: days + type: int? + - id: hours + type: int? # ... more optional parameters ``` diff --git a/docs/components/APITool.md b/docs/components/APITool.md index 9f74683d..a9dd3001 100644 --- a/docs/components/APITool.md +++ b/docs/components/APITool.md @@ -7,4 +7,4 @@ Tool that invokes an API endpoint. - **method** (`str`): HTTP method to use (GET, POST, PUT, DELETE, etc.). - **auth** (`Reference[AuthProviderType] | str | None`): Optional AuthorizationProvider for API authentication. - **headers** (`dict[str, str]`): Optional HTTP headers to include in the request. -- **parameters** (`dict[str, ToolParameter]`): Output parameters produced by this tool. +- **parameters** (`list[Variable]`): Path and query parameters for the API call. diff --git a/docs/components/FieldExtractor.md b/docs/components/FieldExtractor.md index 47416973..ef9dce69 100644 --- a/docs/components/FieldExtractor.md +++ b/docs/components/FieldExtractor.md @@ -10,6 +10,8 @@ messages (1-to-many cardinality). If it matches a single value, it yields one output message. If it matches nothing, it raises an error. The extracted data is used to construct the output variable by passing it as keyword arguments to the output type's constructor. +If there is no match and the output variable is optional, it is set to None. +If there is no match and the output variable is required, an error is raised. Example JSONPath expressions: - `$.field_name` - Extract a single field - `$.items[*]` - Extract all items from a list @@ -17,4 +19,3 @@ Example JSONPath expressions: - **type** (`Literal`): (No documentation available.) - **json_path** (`str`): JSONPath expression to extract data from the input. Uses jsonpath-ng syntax. -- **fail_on_missing** (`bool`): Whether to raise an error if the JSONPath matches no data. If False, returns None. diff --git a/docs/components/Tool.md b/docs/components/Tool.md index 79ce9763..a5e70a2e 100644 --- a/docs/components/Tool.md +++ b/docs/components/Tool.md @@ -5,5 +5,5 @@ Base class for callable functions or external operations available to the model - **id** (`str`): Unique ID of this component. - **name** (`str`): Name of the tool function. - **description** (`str`): Description of what the tool does. -- **inputs** (`dict[str, ToolParameter]`): Input parameters required by this tool. -- **outputs** (`dict[str, ToolParameter]`): Output parameters produced by this tool. +- **inputs** (`list[Variable]`): Input parameters required by this tool. +- **outputs** (`list[Variable]`): Output parameters produced by this tool. 
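
For reference, the same shape built programmatically. This is a sketch assuming the model constructors accept keywords mirroring the YAML keys shown in this diff (the `myapp.utils.calculate_age` tool is the hypothetical example from the Python-modules how-to above):

```python
from qtype.dsl.model import PythonFunctionTool, Variable

# inputs/outputs are now ordered lists of Variables rather than
# name-keyed ToolParameter dicts; optionality lives on each Variable.
tool = PythonFunctionTool(
    id="myapp.utils.calculate_age",
    name="calculate_age",
    description="Calculate age in years between two dates.",
    module_path="myapp.utils",
    function_name="calculate_age",
    inputs=[
        Variable(id="birth_date", type="datetime"),
        Variable(id="reference_date", type="datetime"),
    ],
    outputs=[Variable(id="calculate_age_result", type="int")],
)
```
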
diff --git a/docs/components/ToolParameter.md b/docs/components/ToolParameter.md deleted file mode 100644 index 83c18d43..00000000 --- a/docs/components/ToolParameter.md +++ /dev/null @@ -1,6 +0,0 @@ -### ToolParameter - -Defines a tool input or output parameter with type and optional flag. - -- **type** (`VariableType | str`): (No documentation available.) -- **optional** (`bool`): Whether this parameter is optional diff --git a/docs/components/Variable.md b/docs/components/Variable.md index 867d7da6..fbd8e44f 100644 --- a/docs/components/Variable.md +++ b/docs/components/Variable.md @@ -4,4 +4,5 @@ Schema for a variable that can serve as input, output, or parameter within the D - **id** (`str`): Unique ID of the variable. Referenced in prompts or steps. - **type** (`VariableType | str`): Type of data expected or produced. Either a CustomType or domain specific type. +- **optional** (`bool`): Whether this variable can be unset or None. Use '?' suffix in type string as shorthand (e.g., 'text?'). - **ui** (`TextInputUI | FileUploadUI | None`): Hints for the UI if needed. diff --git a/examples/language_features/optional_variables.qtype.yaml b/examples/language_features/optional_variables.qtype.yaml new file mode 100644 index 00000000..59960661 --- /dev/null +++ b/examples/language_features/optional_variables.qtype.yaml @@ -0,0 +1,32 @@ +id: optional_variables_demo +description: Demonstrates optional variables that can handle missing data gracefully + +flows: + - id: extract_optional_field + variables: + - id: user_profile + type: text + - id: email + type: text? + + inputs: + - user_profile + outputs: + - email + + steps: + # Try to extract email - may not exist, will return None if missing + - type: FieldExtractor + id: extract_email + inputs: + - user_profile + outputs: + - email + json_path: $.email + + - type: Echo + id: show_result + inputs: + - email + outputs: + - email diff --git a/examples/research_assistant/tavily.qtype.yaml b/examples/research_assistant/tavily.qtype.yaml index 3d4e0c92..a0d37b41 100644 --- a/examples/research_assistant/tavily.qtype.yaml +++ b/examples/research_assistant/tavily.qtype.yaml @@ -2,7 +2,8 @@ auths: - id: tavily-api_bearerauth_token token: ${TAVILY-API_BEARER} type: bearer_token -description: Tools created from API specification tavily.oas.yaml +description: Tools created from API specification + examples/research_assistant/tavily.oas.yaml flows: [] id: tavily-api indexes: [] @@ -17,81 +18,54 @@ tools: headers: {} id: search inputs: - auto_parameters: - optional: true - type: boolean - country: - optional: true - type: text - days: - optional: true - type: int - end_date: - optional: true - type: text - exclude_domains: - optional: true - type: - element_type: text - include_answer: - optional: true - type: text - include_domains: - optional: true - type: - element_type: text - include_favicon: - optional: true - type: boolean - include_images: - optional: true - type: boolean - include_raw_content: - optional: true - type: text - max_results: - optional: true - type: int - query: - optional: false - type: text - search_depth: - optional: true - type: text - start_date: - optional: true - type: text - time_range: - optional: true - type: text - topic: - optional: true - type: text + - id: query + type: text + - id: search_depth + type: text? + - id: topic + type: text? + - id: time_range + type: text? + - id: start_date + type: text? + - id: end_date + type: text? + - id: days + type: int? + - id: max_results + type: int? 
+ - id: include_domains + type: list[text]? + - id: exclude_domains + type: list[text]? + - id: include_answer + type: text? + - id: include_raw_content + type: text? + - id: include_images + type: boolean? + - id: country + type: text? + - id: auto_parameters + type: boolean? + - id: include_favicon + type: boolean? method: POST name: Search the web outputs: - answer: - optional: true - type: text - follow_up_questions: - optional: true - type: - element_type: text - images: - optional: true - type: - element_type: schema_4844015520 - query: - optional: true - type: text - response_time: - optional: true - type: float - results: - optional: true - type: - element_type: schema_4844016144 - parameters: {} + - id: query + type: text? + - id: follow_up_questions + type: list[text]? + - id: answer + type: text? + - id: images + type: list[schema_4849029360]? + - id: results + type: list[schema_4849029792]? + - id: response_time + type: float? + parameters: [] type: APITool - auth: tavily-api_bearerauth_token description: 'Extract and process content from one or more URLs with various formatting @@ -100,34 +74,24 @@ tools: headers: {} id: extract inputs: - extract_depth: - optional: true - type: text - format: - optional: true - type: text - include_favicon: - optional: true - type: boolean - include_images: - optional: true - type: boolean - urls: - optional: false - type: - element_type: text + - id: urls + type: list[text] + - id: include_images + type: boolean? + - id: extract_depth + type: text? + - id: format + type: text? + - id: include_favicon + type: boolean? method: POST name: Extract content from URLs outputs: - failed_results: - optional: true - type: - element_type: schema_4844027472 - results: - optional: true - type: - element_type: schema_4844024880 - parameters: {} + - id: results + type: list[schema_4849038576]? + - id: failed_results + type: list[schema_4849041168]? + parameters: [] type: APITool - auth: tavily-api_bearerauth_token description: 'Crawl a website starting from a given URL with depth and breadth controls. @@ -136,60 +100,40 @@ tools: headers: {} id: crawl inputs: - allow_external: - optional: true - type: boolean - exclude_domains: - optional: true - type: - element_type: text - exclude_paths: - optional: true - type: - element_type: text - extract_depth: - optional: true - type: text - format: - optional: true - type: text - include_favicon: - optional: true - type: boolean - include_images: - optional: true - type: boolean - instructions: - optional: true - type: text - limit: - optional: true - type: int - max_breadth: - optional: true - type: int - max_depth: - optional: true - type: int - select_domains: - optional: true - type: - element_type: text - select_paths: - optional: true - type: - element_type: text - url: - optional: false - type: text + - id: url + type: text + - id: max_depth + type: int? + - id: max_breadth + type: int? + - id: limit + type: int? + - id: instructions + type: text? + - id: select_paths + type: list[text]? + - id: select_domains + type: list[text]? + - id: exclude_paths + type: list[text]? + - id: exclude_domains + type: list[text]? + - id: allow_external + type: boolean? + - id: include_images + type: boolean? + - id: extract_depth + type: text? + - id: format + type: text? + - id: include_favicon + type: boolean? method: POST name: Crawl websites outputs: - results: - optional: true - type: - element_type: schema_4844408352 - parameters: {} + - id: results + type: list[schema_4849520304]? 
+ parameters: [] type: APITool - auth: tavily-api_bearerauth_token description: "Map the structure and links of a website starting from a given URL. @@ -198,61 +142,44 @@ tools: headers: {} id: map inputs: - allow_external: - optional: true - type: boolean - exclude_domains: - optional: true - type: - element_type: text - exclude_paths: - optional: true - type: - element_type: text - include_images: - optional: true - type: boolean - instructions: - optional: true - type: text - limit: - optional: true - type: int - max_breadth: - optional: true - type: int - max_depth: - optional: true - type: int - select_domains: - optional: true - type: - element_type: text - select_paths: - optional: true - type: - element_type: text - url: - optional: false - type: text + - id: url + type: text + - id: max_depth + type: int? + - id: max_breadth + type: int? + - id: limit + type: int? + - id: instructions + type: text? + - id: select_paths + type: list[text]? + - id: select_domains + type: list[text]? + - id: exclude_paths + type: list[text]? + - id: exclude_domains + type: list[text]? + - id: allow_external + type: boolean? + - id: include_images + type: boolean? method: POST name: Map website structure outputs: - results: - optional: true - type: - element_type: schema_4844415936 - parameters: {} + - id: results + type: list[schema_4849527888]? + parameters: [] type: APITool types: - description: Generated from OpenAPI schema - id: schema_4844015520 + id: schema_4849029360 properties: alt_text: text? title: text? url: text? - description: Generated from OpenAPI schema - id: schema_4844016144 + id: schema_4849029792 properties: content: text? favicon: text? @@ -261,19 +188,19 @@ types: title: text? url: text? - description: Generated from OpenAPI schema - id: schema_4844024880 + id: schema_4849038576 properties: content: text? favicon: text? title: text? url: text? - description: Generated from OpenAPI schema - id: schema_4844027472 + id: schema_4849041168 properties: error: text? url: text? - description: Generated from OpenAPI schema - id: schema_4844408352 + id: schema_4849520304 properties: content: text? favicon: text? @@ -281,7 +208,7 @@ types: title: text? url: text? - description: Generated from OpenAPI schema - id: schema_4844415936 + id: schema_4849527888 properties: depth: int? links: list[text]? diff --git a/examples/tools_integration/sample_utils.py b/examples/tools_integration/sample_utils.py deleted file mode 100644 index 44eefa89..00000000 --- a/examples/tools_integration/sample_utils.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Sample utility module for testing tool generation.""" - -from datetime import datetime - - -def calculate_age(birth_date: datetime, reference_date: datetime) -> int: - """Calculate age in years between two dates. - - Args: - birth_date: The birth date - reference_date: The date to calculate age at - - Returns: - Age in complete years - """ - age = reference_date.year - birth_date.year - if (reference_date.month, reference_date.day) < ( - birth_date.month, - birth_date.day, - ): - age -= 1 - return age - - -def greeting(name: str, title: str = "Mr.") -> str: - """Generate a greeting message. - - Args: - name: The person's name - title: The person's title (optional) - - Returns: - A formatted greeting message - """ - return f"Hello, {title} {name}!" 
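
The deleted sample above is still a useful mental model for the converter behavior changed later in this diff: `tools_from_module` marks parameters that carry a default as optional (`p["default"] != inspect.Parameter.empty`) and names the single output `<function_name>_result`. A rough sketch of the mapping for `greeting`, under the assumption that `str` maps to the `text` primitive:

```python
def greeting(name: str, title: str = "Mr.") -> str:
    """Generate a greeting message."""
    return f"Hello, {title} {name}!"

# tools_from_module would emit, approximately:
#   inputs:
#     - id: name        # no default -> required
#       type: text
#     - id: title       # has a default -> optional, i.e. text?
#       type: text?
#   outputs:
#     - id: greeting_result   # f"{func_name}_result"
#       type: text
```
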
diff --git a/examples/tutorials/04_tools_and_function_calling.qtype.yaml b/examples/tutorials/04_tools_and_function_calling.qtype.yaml index e53d3386..3fb1be10 100644 --- a/examples/tutorials/04_tools_and_function_calling.qtype.yaml +++ b/examples/tutorials/04_tools_and_function_calling.qtype.yaml @@ -8,7 +8,7 @@ # Scenario: Calculate a deadline by adding days to the current time # # Run with: -# uv run qtype run -i '{"days_until_due": 3}' examples/tutorials/04_tools_and_function_calling.qtype.yaml +# qtype run -i '{"days_until_due": 3}' examples/tutorials/04_tools_and_function_calling.qtype.yaml id: deadline_calculator description: | @@ -17,7 +17,7 @@ description: | # Import pre-built tools from the commons library references: - - !include https://raw.githubusercontent.com/bazaarvoice/qtype/refs/tags/v0.1.11/common/tools.qtype.yaml + - !include ../../common/tools.qtype.yaml flows: - id: calculate_deadline @@ -50,7 +50,7 @@ flows: tool: qtype.application.commons.tools.get_current_timestamp input_bindings: {} output_bindings: - result: current_time + get_current_timestamp_result: current_time outputs: - current_time @@ -71,7 +71,7 @@ flows: timestamp: current_time days: days_until_due output_bindings: - result: deadline_time + timedelta_result: deadline_time outputs: - deadline_time @@ -84,6 +84,6 @@ flows: timestamp: deadline_time format_string: format_string output_bindings: - result: deadline_formatted + format_datetime_result: deadline_formatted outputs: - deadline_formatted diff --git a/qtype/application/__init__.py b/qtype/application/__init__.py index b15b9dbf..5cb5ee22 100644 --- a/qtype/application/__init__.py +++ b/qtype/application/__init__.py @@ -3,10 +3,8 @@ from __future__ import annotations from . import commons, converters -from .facade import QTypeFacade __all__ = [ - "QTypeFacade", "converters", "commons", ] diff --git a/qtype/application/converters/tools_from_api.py b/qtype/application/converters/tools_from_api.py index 01b8c243..0a54c875 100644 --- a/qtype/application/converters/tools_from_api.py +++ b/qtype/application/converters/tools_from_api.py @@ -26,7 +26,7 @@ BearerTokenAuthProvider, CustomType, OAuth2AuthProvider, - ToolParameter, + Variable, VariableType, ) @@ -202,9 +202,9 @@ def create_tool_parameters_from_body( existing_custom_types: dict[str, CustomType], schema_name_map: dict[int, str], default_param_name: str, -) -> dict[str, ToolParameter]: +) -> list[Variable]: """ - Convert an OpenAPI Response or RequestBody to a dictionary of ToolParameters. + Convert an OpenAPI Response or RequestBody to a list of Variables. If the body has only one content type with an Object schema, flatten its properties to individual parameters. Otherwise, create a single parameter with the body type. 
@@ -216,18 +216,18 @@ def create_tool_parameters_from_body( default_param_name: Name to use for non-flattened parameter Returns: - Dictionary of parameter name to ToolParameter objects + List of Variable objects """ # Check if we have content to analyze if not hasattr(oas, "content") or not oas.content: - return {} + return [] content = oas.content[0] input_type = to_variable_type( content, existing_custom_types, schema_name_map ) - # Convert CustomType to string ID for ToolParameter + # Convert CustomType to string ID for Variable input_type_value = ( input_type.id if isinstance(input_type, CustomType) else input_type ) @@ -240,7 +240,7 @@ def create_tool_parameters_from_body( custom_type = existing_custom_types[input_type.id] # Flatten the custom type properties to individual parameters - flattened_parameters = {} + flattened_parameters = [] for prop_name, prop_type_str in custom_type.properties.items(): # Check if the property is optional (has '?' suffix) is_optional = prop_type_str.endswith("?") @@ -248,8 +248,10 @@ def create_tool_parameters_from_body( prop_type_str.rstrip("?") if is_optional else prop_type_str ) - flattened_parameters[prop_name] = ToolParameter( - type=clean_type, optional=is_optional + flattened_parameters.append( + Variable.model_construct( + id=prop_name, type=clean_type, optional=is_optional + ) ) # remove the type from existing_custom_types to avoid confusion @@ -258,11 +260,11 @@ def create_tool_parameters_from_body( return flattened_parameters # If not flattening, create a single parameter (e.g., for simple types or arrays) - return { - default_param_name: ToolParameter( - type=input_type_value, optional=False + return [ + Variable.model_construct( + id=default_param_name, type=input_type_value, optional=False ) - } + ] def to_api_tool( @@ -297,7 +299,7 @@ def to_api_tool( ).replace("\n", " ") # Process inputs from request body and parameters - inputs = {} + inputs = [] if operation.request_body and operation.request_body.content: # Create input parameters from request body using the new function input_params = create_tool_parameters_from_body( @@ -306,27 +308,31 @@ def to_api_tool( schema_name_map, default_param_name="request", ) - inputs.update(input_params) + inputs.extend(input_params) # Add path and query parameters as inputs - parameters = {} + parameters = [] for param in operation.parameters: if param.schema: param_type = _schema_to_qtype_type( param.schema, existing_custom_types, schema_name_map ) - # Convert to appropriate type for ToolParameter + # Convert to appropriate type for Variable param_type_value = ( param_type.id if isinstance(param_type, CustomType) else param_type ) - parameters[param.name] = ToolParameter( - type=param_type_value, optional=not param.required + parameters.append( + Variable.model_construct( + id=param.name, + type=param_type_value, + optional=not param.required, + ) ) # Process outputs from responses - outputs = {} + outputs = [] # Find the success response (200-299 status codes) or default response success_response = next( (r for r in operation.responses if r.code and 200 <= r.code < 300), @@ -339,9 +345,9 @@ def to_api_tool( success_response, existing_custom_types, schema_name_map, - default_param_name="response", + default_param_name=f"{tool_id}_response", ) - outputs.update(output_params) + outputs.extend(output_params) return APITool( id=tool_id, diff --git a/qtype/application/converters/tools_from_module.py b/qtype/application/converters/tools_from_module.py index c9179922..193a940c 100644 --- 
a/qtype/application/converters/tools_from_module.py +++ b/qtype/application/converters/tools_from_module.py @@ -10,7 +10,7 @@ CustomType, ListType, PythonFunctionTool, - ToolParameter, + Variable, VariableType, ) @@ -43,14 +43,19 @@ def tools_from_module( f"No public functions found in module '{module_path}'" ) - custom_types: dict[str, CustomType] = {} + # Registry of actual Pydantic classes for validation + custom_type_registry: dict[str, Type[BaseModel]] = {} + # CustomType instances for YAML output + custom_type_models: dict[str, CustomType] = {} # Create Tool instances from functions tools = [ - _create_tool_from_function(func_name, func_info, custom_types) + _create_tool_from_function( + func_name, func_info, custom_type_registry, custom_type_models + ) for func_name, func_info in functions.items() ] - return (tools, list(custom_types.values())) + return (tools, list(custom_type_models.values())) except ImportError as e: raise ImportError(f"Cannot import module '{module_path}': {e}") from e @@ -116,7 +121,8 @@ def _get_module_functions( def _create_tool_from_function( func_name: str, func_info: dict[str, Any], - custom_types: dict[str, CustomType], + custom_type_registry: dict[str, Type[BaseModel]], + custom_type_models: dict[str, CustomType], ) -> PythonFunctionTool: """ Convert function metadata into a Tool instance. @@ -135,29 +141,38 @@ def _create_tool_from_function( else f"Function {func_name}" ) - # Create input parameters from function parameters - inputs = { - p["name"]: ToolParameter( - type=_map_python_type_to_variable_type(p["type"], custom_types), - optional=p["default"] != inspect.Parameter.empty, + # Create input parameters as list of Variables + inputs = [ + Variable.model_validate( + { + "id": p["name"], + "type": _map_python_type_to_variable_type( + p["type"], custom_type_registry, custom_type_models + ), + "optional": p["default"] != inspect.Parameter.empty, + }, + context={"custom_types": custom_type_registry}, ) for p in func_info["parameters"] - } - - # # quick hack - # for k, v in inputs.items(): - # if inspect.isclass(v.type) and issubclass(v.type, BaseModel): - # v.type = str(v.type.__name__) + ] # Create output parameter based on return type tool_id = func_info["module"] + "." + func_name output_type = _map_python_type_to_variable_type( - func_info["return_type"], custom_types + func_info["return_type"], custom_type_registry, custom_type_models ) - outputs = {"result": ToolParameter(type=output_type, optional=False)} - # outputs['result'].type = + outputs = [ + Variable.model_validate( + { + "id": f"{func_name}_result", + "type": output_type, + "optional": False, + }, + context={"custom_types": custom_type_registry}, + ) + ] return PythonFunctionTool( id=tool_id, @@ -172,7 +187,8 @@ def _create_tool_from_function( def _pydantic_to_custom_types( model_cls: Type[BaseModel], - custom_types: dict[str, CustomType], + custom_type_registry: dict[str, Type[BaseModel]], + custom_type_models: dict[str, CustomType], ) -> str: """ Converts a Pydantic BaseModel class into a QType CustomType. @@ -184,15 +200,20 @@ def _pydantic_to_custom_types( Args: model_cls: The Pydantic model class to convert. + custom_type_registry: Registry of actual Pydantic classes for validation + custom_type_models: Dictionary of CustomType models for YAML output Returns: - A dictionary mapping field names to their corresponding CustomType definitions. 
+ The model name as a string type reference """ properties = {} model_name = model_cls.__name__ - if model_name in custom_types: + if model_name in custom_type_registry: return model_name # Already processed + # Register the actual class for validation + custom_type_registry[model_name] = model_cls + for field_name, field_info in model_cls.model_fields.items(): # Use the annotation (the type hint) for the field field_type = field_info.annotation @@ -202,22 +223,27 @@ def _pydantic_to_custom_types( ) elif get_origin(field_type) is Union: # Assume the union means it's optional + # TODO: support proper unions field_type = [ t for t in get_args(field_type) if t is not type(None) ][0] - rv = _map_python_type_to_type_str(field_type, custom_types) + rv = _map_python_type_to_type_str( + field_type, custom_type_registry, custom_type_models + ) properties[field_name] = f"{rv}?" elif get_origin(field_type) is list: inner_type = get_args(field_type)[0] - rv = _map_python_type_to_type_str(inner_type, custom_types) + rv = _map_python_type_to_type_str( + inner_type, custom_type_registry, custom_type_models + ) properties[field_name] = f"list[{rv}]" else: properties[field_name] = _map_python_type_to_type_str( - field_type, custom_types + field_type, custom_type_registry, custom_type_models ) - # Add the custom type to the list - custom_types[model_name] = CustomType( + # Add the CustomType model for YAML output + custom_type_models[model_name] = CustomType( id=model_name, properties=properties, description=model_cls.__doc__ or f"Custom type for {model_name}", @@ -227,7 +253,8 @@ def _pydantic_to_custom_types( def _map_python_type_to_variable_type( python_type: Any, - custom_types: dict[str, CustomType], + custom_type_registry: dict[str, Type[BaseModel]], + custom_type_models: dict[str, CustomType], ) -> str | VariableType: """ Map Python type annotations to QType VariableType. 
@@ -248,7 +275,9 @@ def _map_python_type_to_variable_type( element_type_annotation = args[0] # Recursively map the element type element_type = _map_python_type_to_variable_type( - element_type_annotation, custom_types + element_type_annotation, + custom_type_registry, + custom_type_models, ) # Support lists of both primitive types and custom types if isinstance(element_type, PrimitiveTypeEnum): @@ -281,7 +310,9 @@ def _map_python_type_to_variable_type( return python_type.__name__ elif inspect.isclass(python_type) and issubclass(python_type, BaseModel): # If it's a Pydantic model, create or retrieve its CustomType definition - return _pydantic_to_custom_types(python_type, custom_types) + return _pydantic_to_custom_types( + python_type, custom_type_registry, custom_type_models + ) raise ValueError( f"Unsupported Python type '{python_type}' for VariableType mapping" ) @@ -289,9 +320,12 @@ def _map_python_type_to_variable_type( def _map_python_type_to_type_str( python_type: Any, - custom_types: dict[str, CustomType], + custom_type_registry: dict[str, Type[BaseModel]], + custom_type_models: dict[str, CustomType], ) -> str: - var_type = _map_python_type_to_variable_type(python_type, custom_types) + var_type = _map_python_type_to_variable_type( + python_type, custom_type_registry, custom_type_models + ) if isinstance(var_type, PrimitiveTypeEnum): return var_type.value elif inspect.isclass(python_type): diff --git a/qtype/application/facade.py b/qtype/application/facade.py deleted file mode 100644 index 306fed7c..00000000 --- a/qtype/application/facade.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Main facade for qtype operations.""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Any - -from qtype.base.types import PathLike -from qtype.semantic.model import Application as SemanticApplication -from qtype.semantic.model import DocumentType as SemanticDocumentType - -# Note: There should be _zero_ imports here at the top that import qtype.interpreter. -# That's the whole point of this facade - to avoid importing optional -# dependencies unless these methods are called. - -logger = logging.getLogger(__name__) - - -class QTypeFacade: - """ - Simplified interface for qtype operations. - - This facade provides lazy-loading wrappers for operations that require - optional dependencies (interpreter package), allowing base qtype to work - without those dependencies installed. - """ - - def telemetry(self, spec: SemanticDocumentType) -> None: - from qtype.interpreter.telemetry import register - - if isinstance(spec, SemanticApplication) and spec.telemetry: - logger.info( - f"Telemetry enabled with endpoint: {spec.telemetry.endpoint}" - ) - # Register telemetry if needed - register(spec.telemetry, self.secret_manager(spec), spec.id) - - def secret_manager(self, spec: SemanticDocumentType): - """ - Create a secret manager based on the specification. - - Args: - spec: SemanticDocumentType specification - - Returns: - Secret manager instance - """ - from qtype.interpreter.base.secrets import create_secret_manager - - if isinstance(spec, SemanticApplication): - return create_secret_manager(spec.secret_manager) - else: - raise ValueError( - "Can't create secret manager for non-Application spec" - ) - - async def execute_workflow( - self, - path: PathLike, - inputs: dict | Any, - flow_name: str | None = None, - **kwargs: Any, - ) -> Any: - """ - Execute a complete workflow from document to results. 
- - Args: - path: Path to the QType specification file - inputs: Dictionary of input values or DataFrame for batch - flow_name: Optional name of flow to execute - **kwargs: Additional dependencies for execution - - Returns: - DataFrame with results (one row per input) - """ - import pandas as pd - from opentelemetry import trace - - from qtype.interpreter.base.executor_context import ExecutorContext - from qtype.interpreter.converters import ( - dataframe_to_flow_messages, - flow_messages_to_dataframe, - ) - from qtype.interpreter.flow import run_flow - from qtype.interpreter.types import Session - from qtype.semantic.loader import load - - # Load the semantic application - semantic_model, type_registry = load(Path(path)) - assert isinstance(semantic_model, SemanticApplication) - self.telemetry(semantic_model) - - # Find the flow to execute - if flow_name: - target_flow = None - for flow in semantic_model.flows: - if flow.id == flow_name: - target_flow = flow - break - if target_flow is None: - raise ValueError(f"Flow '{flow_name}' not found") - else: - if semantic_model.flows: - target_flow = semantic_model.flows[0] - else: - raise ValueError("No flows found in application") - - logger.info(f"Executing flow {target_flow.id} from {path}") - - # Convert inputs to DataFrame (normalize single dict to 1-row DataFrame) - - if isinstance(inputs, dict): - input_df = pd.DataFrame([inputs]) - elif isinstance(inputs, pd.DataFrame): - input_df = inputs - else: - raise ValueError( - f"Inputs must be dict or DataFrame, got {type(inputs)}" - ) - - # Create session - session = Session( - session_id=kwargs.pop("session_id", "default"), - conversation_history=kwargs.pop("conversation_history", []), - ) - - # Convert DataFrame to FlowMessages - initial_messages = dataframe_to_flow_messages(input_df, session) - - # Execute the flow - secret_manager = self.secret_manager(semantic_model) - - context = ExecutorContext( - secret_manager=secret_manager, - tracer=trace.get_tracer(__name__), - ) - results = await run_flow( - target_flow, - initial_messages, - context=context, - **kwargs, - ) - - # Convert results back to DataFrame - results_df = flow_messages_to_dataframe(results, target_flow) - - return results_df - - def generate_aws_bedrock_models(self) -> list[dict[str, Any]]: - """ - Generate AWS Bedrock model definitions. - - Returns: - List of model definitions for AWS Bedrock models. - - Raises: - ImportError: If boto3 is not installed. - Exception: If AWS API call fails. - """ - import boto3 # type: ignore[import-untyped] - - logger.info("Discovering AWS Bedrock models...") - client = boto3.client("bedrock") - models = client.list_foundation_models() - - model_definitions = [] - for model_summary in models.get("modelSummaries", []): - model_definitions.append( - { - "id": model_summary["modelId"], - "provider": "aws-bedrock", - } - ) - - logger.info(f"Discovered {len(model_definitions)} AWS Bedrock models") - return model_definitions diff --git a/qtype/commands/generate.py b/qtype/commands/generate.py index 3fae4e09..8bc294e8 100644 --- a/qtype/commands/generate.py +++ b/qtype/commands/generate.py @@ -2,24 +2,48 @@ import json import logging from pathlib import Path -from typing import Optional +from typing import Any, Optional from qtype.dsl.model import Document logger = logging.getLogger(__name__) +def generate_aws_bedrock_models() -> list[dict[str, Any]]: + """Generate AWS Bedrock model definitions. + + Returns: + List of model definitions for AWS Bedrock models. 
+ + Raises: + ImportError: If boto3 is not installed. + Exception: If AWS API call fails. + """ + import boto3 # type: ignore[import-untyped] + + logger.info("Discovering AWS Bedrock models...") + client = boto3.client("bedrock") + models = client.list_foundation_models() + + model_definitions = [] + for model_summary in models.get("modelSummaries", []): + model_definitions.append( + { + "id": model_summary["modelId"], + "provider": "aws-bedrock", + } + ) + + logger.info(f"Discovered {len(model_definitions)} AWS Bedrock models") + return model_definitions + + def run_dump_commons_library(args: argparse.Namespace) -> None: """Generate commons library tools and AWS Bedrock models.""" - import logging from pathlib import Path - from qtype.application.facade import QTypeFacade from qtype.dsl.model import Model, ModelList - logger = logging.getLogger(__name__) - facade = QTypeFacade() - try: # Generate common tools using convert module functionality logger.info("Generating common tools...") @@ -38,7 +62,7 @@ def run_dump_commons_library(args: argparse.Namespace) -> None: # Generate AWS Bedrock models logger.info("Generating AWS Bedrock models...") try: - model_definitions = facade.generate_aws_bedrock_models() + model_definitions = generate_aws_bedrock_models() model_list = ModelList( root=[ diff --git a/qtype/commands/run.py b/qtype/commands/run.py index d9f3eff4..40190cc4 100644 --- a/qtype/commands/run.py +++ b/qtype/commands/run.py @@ -11,11 +11,10 @@ from pathlib import Path from typing import Any -import pandas as pd from pydantic.warnings import UnsupportedFieldAttributeWarning -from qtype.application.facade import QTypeFacade from qtype.base.exceptions import InterpreterError, LoadError, ValidationError +from qtype.interpreter.converters import read_dataframe_from_file logger = logging.getLogger(__name__) @@ -29,50 +28,124 @@ logging.getLogger(name).setLevel(logging.WARNING) -def read_data_from_file(file_path: str) -> pd.DataFrame: - """ - Reads a file into a pandas DataFrame based on its MIME type. +def register_telemetry(spec) -> None: + """Register telemetry if enabled in the spec.""" + from qtype.interpreter.telemetry import register + from qtype.semantic.model import Application as SemanticApplication + + if isinstance(spec, SemanticApplication) and spec.telemetry: + logger.info( + f"Telemetry enabled with endpoint: {spec.telemetry.endpoint}" + ) + secret_mgr = create_secret_manager_for_spec(spec) + register(spec.telemetry, secret_mgr, spec.id) + + +def create_secret_manager_for_spec(spec): + """Create a secret manager based on the specification.""" + from qtype.interpreter.base.secrets import create_secret_manager + from qtype.semantic.model import Application as SemanticApplication + + if isinstance(spec, SemanticApplication): + return create_secret_manager(spec.secret_manager) + else: + raise ValueError( + "Can't create secret manager for non-Application spec" + ) + + +async def execute_workflow( + path: Path, + inputs: dict | Any, + flow_name: str | None = None, + **kwargs: Any, +) -> Any: + """Execute a complete workflow from document to results. 
+ + Args: + path: Path to the QType specification file + inputs: Dictionary of input values or DataFrame for batch + flow_name: Optional name of flow to execute + **kwargs: Additional dependencies for execution + + Returns: + DataFrame with results (one row per input) """ - from pathlib import Path - - import magic - - mime_type = magic.Magic(mime=True).from_file(file_path) - - if mime_type == "text/csv": - # TODO: Restore na values and convert to optional once we support them https://github.com/bazaarvoice/qtype/issues/101 - df = pd.read_csv(file_path) - return df.fillna("") - elif mime_type == "text/plain": - # For text/plain, use file extension to determine format - file_ext = Path(file_path).suffix.lower() - if file_ext == ".csv": - # TODO: Restore na values and convert to optional once we support them https://github.com/bazaarvoice/qtype/issues/101 - df = pd.read_csv(file_path) - return df.fillna("") - elif file_ext == ".json": - return pd.read_json(file_path) + import pandas as pd + from opentelemetry import trace + + from qtype.interpreter.base.executor_context import ExecutorContext + from qtype.interpreter.converters import ( + dataframe_to_flow_messages, + flow_messages_to_dataframe, + ) + from qtype.interpreter.flow import run_flow + from qtype.interpreter.types import Session + from qtype.semantic.loader import load + from qtype.semantic.model import Application as SemanticApplication + + # Load the semantic application + semantic_model, type_registry = load(path) + assert isinstance(semantic_model, SemanticApplication) + register_telemetry(semantic_model) + + # Find the flow to execute + if flow_name: + target_flow = None + for flow in semantic_model.flows: + if flow.id == flow_name: + target_flow = flow + break + if target_flow is None: + raise ValueError(f"Flow '{flow_name}' not found") + else: + if semantic_model.flows: + target_flow = semantic_model.flows[0] else: - raise ValueError( - ( - f"Unsupported text/plain file extension: {file_ext}. 
" - "Supported extensions: .csv, .json" - ) - ) - elif mime_type == "application/json": - return pd.read_json(file_path) - elif mime_type in [ - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.ms-excel", - ]: - return pd.read_excel(file_path) - elif mime_type in ["application/vnd.parquet", "application/octet-stream"]: - return pd.read_parquet(file_path) + raise ValueError("No flows found in application") + + logger.info(f"Executing flow {target_flow.id} from {path}") + + # Convert inputs to DataFrame (normalize single dict to 1-row DataFrame) + if isinstance(inputs, dict): + input_df = pd.DataFrame([inputs]) + elif isinstance(inputs, pd.DataFrame): + input_df = inputs else: raise ValueError( - f"Unsupported MIME type for file {file_path}: {mime_type}" + f"Inputs must be dict or DataFrame, got {type(inputs)}" ) + # Create session + session = Session( + session_id=kwargs.pop("session_id", "default"), + conversation_history=kwargs.pop("conversation_history", []), + ) + + # Convert DataFrame to FlowMessages with type conversion + initial_messages_list = dataframe_to_flow_messages( + input_df, target_flow.inputs, session=session + ) + + # Execute the flow + secret_manager = create_secret_manager_for_spec(semantic_model) + + context = ExecutorContext( + secret_manager=secret_manager, + tracer=trace.get_tracer(__name__), + ) + results = await run_flow( + target_flow, + initial_messages_list, + context=context, + **kwargs, + ) + + # Convert results back to DataFrame + results_df = flow_messages_to_dataframe(results, target_flow) + + return results_df + def run_flow(args: Any) -> None: """Run a QType YAML spec file by executing its flows. @@ -82,7 +155,6 @@ def run_flow(args: Any) -> None: """ import asyncio - facade = QTypeFacade() spec_path = Path(args.spec) try: @@ -90,7 +162,7 @@ def run_flow(args: Any) -> None: if args.input_file: logger.info(f"Loading input data from file: {args.input_file}") - input: Any = read_data_from_file(args.input_file) + input: Any = read_dataframe_from_file(args.input_file) else: # Parse input JSON try: @@ -99,9 +171,9 @@ def run_flow(args: Any) -> None: logger.error(f"❌ Invalid JSON input: {e}") return - # Execute the workflow using the facade (now async, returns DataFrame) + # Execute the workflow using the standalone function result_df = asyncio.run( - facade.execute_workflow( + execute_workflow( spec_path, flow_name=args.flow, inputs=input, diff --git a/qtype/dsl/model.py b/qtype/dsl/model.py index d913245a..27fe15ca 100644 --- a/qtype/dsl/model.py +++ b/qtype/dsl/model.py @@ -197,8 +197,8 @@ def _resolve_type_field_validator(data: Any, info: ValidationInfo) -> Any: """ Shared validator for resolving 'type' fields in models. - This validator resolves string-based type references using the custom - type registry from the validation context. + This validator handles optional '?' syntax and resolves string-based + type references using the custom type registry from the validation context. Args: data: The data dict being validated @@ -212,6 +212,13 @@ def _resolve_type_field_validator(data: Any, info: ValidationInfo) -> Any: and "type" in data and isinstance(data["type"], str) ): + # Handle '?' suffix for optional types BEFORE type resolution + type_value = data["type"] + if type_value.endswith("?"): + # Strip '?' and mark as optional + data["type"] = type_value[:-1] + data["optional"] = True + # Get the registry of custom types from the validation context. 
custom_types = (info.context or {}).get("custom_types", {}) resolved = _resolve_variable_type(data["type"], custom_types) @@ -246,6 +253,27 @@ def get_id(item): return result +def _type_to_string(type_value: Any) -> str: + """Convert a type value to its string representation. + + Args: + type_value: The type value to convert + + Returns: + String representation of the type + """ + if isinstance(type_value, str): + return type_value + elif isinstance(type_value, PrimitiveTypeEnum): + return type_value.value + elif isinstance(type_value, ListType): + return str(type_value) + elif isinstance(type_value, type): + return type_value.__name__ + else: + return str(type_value) + + class Variable(StrictBaseModel): """Schema for a variable that can serve as input, output, or parameter within the DSL.""" @@ -259,15 +287,37 @@ class Variable(StrictBaseModel): "Type of data expected or produced. Either a CustomType or domain specific type." ), ) + optional: bool = Field( + default=False, + description=( + "Whether this variable can be unset or None. " + "Use '?' suffix in type string as shorthand (e.g., 'text?')." + ), + ) - ui: UIType | None = Field(None, description="Hints for the UI if needed.") + ui: UIType | None = Field( + default=None, description="Hints for the UI if needed." + ) @model_validator(mode="before") @classmethod def resolve_type(cls, data: Any, info: ValidationInfo) -> Any: - """Resolve string-based type references using the shared validator.""" + """Resolve string-based type references and handle optional '?' syntax.""" return _resolve_type_field_validator(data, info) + @model_serializer + def serialize_model(self): + """Serialize with '?' suffix for optional types.""" + result: dict[str, Any] = {"id": self.id} + + type_str = _type_to_string(self.type) + result["type"] = f"{type_str}?" if self.optional else type_str + + if self.ui is not None: + result["ui"] = self.ui.model_dump() + + return result + @model_validator(mode="after") def validate_ui_type(self) -> Variable: """Ensure at least one credential source is provided.""" @@ -303,37 +353,6 @@ class CustomType(StrictBaseModel): properties: dict[str, str] -class ToolParameter(BaseModel): - """Defines a tool input or output parameter with type and optional flag.""" - - type: VariableType | str - optional: bool = Field( - default=False, description="Whether this parameter is optional" - ) - - @model_validator(mode="before") - @classmethod - def resolve_type(cls, data: Any, info: ValidationInfo) -> Any: - """Resolve string-based type references using the shared validator.""" - return _resolve_type_field_validator(data, info) - - @staticmethod - def _serialize_type(value): - if isinstance(value, type): - return value.__name__ - elif hasattr(value, "__name__"): - return value.__name__ - return value - - @model_serializer - def _model_serializer(self): - # Use the default serialization, but ensure 'type' is a string - return { - "type": self._serialize_type(self.type), - "optional": self.optional, - } - - class ListType(BaseModel): """Represents a list type with a specific element type.""" @@ -496,12 +515,12 @@ class Tool(StrictBaseModel, ABC): description: str = Field( ..., description="Description of what the tool does." 
) - inputs: dict[str, ToolParameter] = Field( - default_factory=dict, + inputs: list[Variable] = Field( + default_factory=list, description="Input parameters required by this tool.", ) - outputs: dict[str, ToolParameter] = Field( - default_factory=dict, + outputs: list[Variable] = Field( + default_factory=list, description="Output parameters produced by this tool.", ) @@ -536,9 +555,9 @@ class APITool(Tool): default_factory=dict, description="Optional HTTP headers to include in the request.", ) - parameters: dict[str, ToolParameter] = Field( - default_factory=dict, - description="Output parameters produced by this tool.", + parameters: list[Variable] = Field( + default_factory=list, + description="Path and query parameters for the API call.", ) @@ -673,6 +692,9 @@ class FieldExtractor(Step): The extracted data is used to construct the output variable by passing it as keyword arguments to the output type's constructor. + If there is no match and the output variable is optional, it is set to None. + If there is no match and the output variable is required, an error is raised. + Example JSONPath expressions: - `$.field_name` - Extract a single field - `$.items[*]` - Extract all items from a list @@ -684,10 +706,6 @@ class FieldExtractor(Step): ..., description="JSONPath expression to extract data from the input. Uses jsonpath-ng syntax.", ) - fail_on_missing: bool = Field( - default=True, - description="Whether to raise an error if the JSONPath matches no data. If False, returns None.", - ) class InvokeTool(Step, ConcurrentStepMixin): diff --git a/qtype/interpreter/converters.py b/qtype/interpreter/converters.py index 1d2acc57..77a80ab9 100644 --- a/qtype/interpreter/converters.py +++ b/qtype/interpreter/converters.py @@ -2,37 +2,16 @@ from __future__ import annotations -from collections.abc import AsyncIterator -from typing import Any, cast +from pathlib import Path +from typing import Any +import fsspec import pandas as pd from pydantic import BaseModel from qtype.interpreter.types import FlowMessage, Session -from qtype.semantic.model import Flow - - -async def dataframe_to_flow_messages( - df: pd.DataFrame, session: Session -) -> AsyncIterator[FlowMessage]: - """ - Convert a DataFrame to an async generator of FlowMessages. - - Each row in the DataFrame becomes a FlowMessage with the same session. - - Args: - df: DataFrame where each row represents one set of inputs - session: Session object to use for all messages - - Yields: - FlowMessages, one per DataFrame row - """ - # Use to_dict with orient='records' - much faster than iterrows - # This returns a list of dicts directly without Series overhead - records = cast(list[dict[str, Any]], df.to_dict(orient="records")) - - for record in records: - yield FlowMessage(session=session, variables=record) +from qtype.interpreter.typing import convert_dict_to_typed_variables +from qtype.semantic.model import Flow, Variable def flow_messages_to_dataframe( @@ -77,3 +56,140 @@ def flow_messages_to_dataframe( results.append(row_data) return pd.DataFrame(results) + + +def read_dataframe_from_file( + file_path: str, +) -> pd.DataFrame: + """ + Read a file into a pandas DataFrame. + + Automatically detects file format based on MIME type and supports both + local and remote files via fsspec. Returns raw DataFrame without type + conversion. 
+ + Args: + file_path: Path to the file (local or remote, e.g., s3://bucket/file) + + Returns: + DataFrame with data from the file + + Raises: + ValueError: If file format is not supported or mime type detection fails + FileNotFoundError: If file does not exist + + Supported formats: + - CSV (.csv) + - JSON (.json) + - JSONL (.jsonl, JSON Lines) + - Parquet (.parquet) + - Excel (.xlsx, .xls) + + Examples: + >>> # Read CSV + >>> df = read_dataframe_from_file("data.csv") + >>> + >>> # Read from S3 + >>> df = read_dataframe_from_file("s3://bucket/data.parquet") + """ + import magic + + ext_to_mime = { + ".csv": "text/csv", + ".json": "application/json", + ".jsonl": "application/jsonlines", + ".parquet": "application/vnd.parquet", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".xls": "application/vnd.ms-excel", + } + # Detect MIME type - handle both local and remote files + # For remote files, we'll need to download a sample first + if file_path.startswith(("http://", "https://", "s3://", "gs://")): + # For remote files, infer from extension as fallback + extension = Path(file_path).suffix.lower() + # Map extensions to mime types + mime_type = ext_to_mime.get(extension, "application/octet-stream") + else: + # Local file - use magic to detect mime type + try: + mime_type = magic.Magic(mime=True).from_file(file_path) + except Exception as e: + # Fallback to extension-based detection + extension = Path(file_path).suffix.lower() + mime_type = ext_to_mime.get(extension, "application/octet-stream") + if mime_type == "application/octet-stream": + raise ValueError( + f"Could not determine file type for {file_path}: {e}" + ) + + # Open file with fsspec (supports local and remote files) + with fsspec.open(file_path, "rb") as file_handle: + # Read based on MIME type + if mime_type == "text/csv" or mime_type == "text/plain": + df = pd.read_csv(file_handle) # type: ignore[arg-type] + elif mime_type in ["application/json", "application/jsonlines"]: + # Check if it's JSONL by extension + if Path(file_path).suffix.lower() == ".jsonl": + df = pd.read_json( + file_handle, # type: ignore[arg-type] + lines=True, + ) + else: + df = pd.read_json(file_handle) # type: ignore[arg-type] + elif mime_type in [ + "application/vnd.parquet", + "application/octet-stream", + ]: + # Parquet is often detected as octet-stream + df = pd.read_parquet(file_handle) # type: ignore[arg-type] + elif mime_type in [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.ms-excel", + ]: + df = pd.read_excel(file_handle) # type: ignore[arg-type] + else: + raise ValueError( + f"Unsupported MIME type for file {file_path}: {mime_type}" + ) + + return df + + +def dataframe_to_flow_messages( + df: pd.DataFrame, + variables: list[Variable], + session: Session = Session(session_id="default"), +) -> list[FlowMessage]: + """ + Convert a DataFrame to FlowMessages with type conversion. + + Each row in the DataFrame becomes a FlowMessage with variables converted + to their proper types based on the Variable definitions. 
+ + Args: + df: DataFrame with raw data + variables: List of Variable definitions for type conversion + session: Session to use for all FlowMessages (default: Session(session_id="default")) + + Returns: + List of FlowMessages, one per row, with typed variables + + Examples: + >>> from qtype.semantic.model import Variable + >>> from qtype.base.types import PrimitiveTypeEnum + >>> import pandas as pd + >>> + >>> df = pd.DataFrame({"age": ["30"], "score": ["95.5"]}) + >>> vars = [ + ... Variable(id="age", type=PrimitiveTypeEnum.int), + ... Variable(id="score", type=PrimitiveTypeEnum.float), + ... ] + >>> messages = dataframe_to_flow_messages(df, vars) + """ + messages = [] + + for row_dict in df.to_dict(orient="records"): + typed_vars = convert_dict_to_typed_variables(row_dict, variables) + messages.append(FlowMessage(session=session, variables=typed_vars)) + + return messages diff --git a/qtype/interpreter/executors/agent_executor.py b/qtype/interpreter/executors/agent_executor.py index 92b0dc2e..4182824f 100644 --- a/qtype/interpreter/executors/agent_executor.py +++ b/qtype/interpreter/executors/agent_executor.py @@ -112,9 +112,8 @@ async def _process_chat( # Convert input variables to chat messages inputs = [] for input_var in self.step.inputs: - value = message.variables.get(input_var.id) - if value and isinstance(value, ChatMessage): - inputs.append(to_chat_message(value)) + value = message.get_variable(input_var.id) + inputs.append(to_chat_message(value)) # Get session ID for memory isolation session_id = message.session.session_id diff --git a/qtype/interpreter/executors/construct_executor.py b/qtype/interpreter/executors/construct_executor.py index 1e64679e..050a36e7 100644 --- a/qtype/interpreter/executors/construct_executor.py +++ b/qtype/interpreter/executors/construct_executor.py @@ -4,7 +4,7 @@ from qtype.interpreter.base.base_step_executor import StepExecutor from qtype.interpreter.base.executor_context import ExecutorContext from qtype.interpreter.types import FlowMessage -from qtype.interpreter.typing import instantiate_variable +from qtype.interpreter.typing import convert_dict_to_typed_variables from qtype.semantic.model import Construct @@ -43,21 +43,26 @@ async def process_message( isinstance(output_var.type, ListType) or len(self.step.inputs) == 1 ): - inputs = message.variables[self.step.inputs[0].id] + # Single input: pass value directly + data = { + output_var.id: message.variables[self.step.inputs[0].id] + } elif hasattr(output_var.type, "model_validate"): # This is a custom type (Pydantic model) # field_bindings maps type field names to Variables - inputs = { - field_name: message.variables[var.id] - for field_name, var in self.step.field_bindings.items() + data = { + output_var.id: { + field_name: message.variables[var.id] + for field_name, var in self.step.field_bindings.items() + } } else: raise ValueError( "Construct step must have either a single input or output of a custom type." 
) - constructed_value = instantiate_variable(output_var, inputs) - yield message.copy_with_variables( - {output_var.id: constructed_value} - ) + + # Use convert_dict_to_typed_variables to validate and convert + result = convert_dict_to_typed_variables(data, self.step.outputs) + yield message.copy_with_variables(result) except Exception as e: yield message.copy_with_error(self.step.id, e) diff --git a/qtype/interpreter/executors/doc_to_text_executor.py b/qtype/interpreter/executors/doc_to_text_executor.py index a48b2a4f..cf4ba0c3 100644 --- a/qtype/interpreter/executors/doc_to_text_executor.py +++ b/qtype/interpreter/executors/doc_to_text_executor.py @@ -49,9 +49,7 @@ async def process_message( try: # Get the input document - if input_id not in message.variables: - raise ValueError(f"Input variable '{input_id}' is missing") - doc = message.variables.get(input_id) + doc = message.get_variable(input_id) if not isinstance(doc, RAGDocument): raise ValueError( f"Input variable '{input_id}' must be a RAGDocument" diff --git a/qtype/interpreter/executors/field_extractor_executor.py b/qtype/interpreter/executors/field_extractor_executor.py index 039e9b91..485fbd6f 100644 --- a/qtype/interpreter/executors/field_extractor_executor.py +++ b/qtype/interpreter/executors/field_extractor_executor.py @@ -111,15 +111,11 @@ async def process_message( Multiple messages may be yielded if JSONPath matches multiple values. """ input_id = self.step.inputs[0].id - output_id = self.step.outputs[0].id + output_var = self.step.outputs[0] try: # Get the input value - input_value = message.variables.get(input_id) - if input_value is None: - raise ValueError( - f"Input variable '{input_id}' is not set or is None" - ) + input_value = message.get_variable(input_id) await self.stream_emitter.status( f"Extracting fields using JSONPath: {self.step.json_path}" @@ -132,17 +128,20 @@ async def process_message( matches = self.jsonpath_expr.find(input_dict) if not matches: - if self.step.fail_on_missing: + if output_var.optional: + # Yield message with None output + await self.stream_emitter.status( + "JSONPath matched 0 value(s)" + ) + yield message.copy_with_variables({output_var.id: None}) + return + else: raise ValueError( ( f"JSONPath expression '{self.step.json_path}' " f"did not match any data in input" ) ) - else: - # Yield message with None output - yield message.copy_with_variables({output_id: None}) - return await self.stream_emitter.status( f"JSONPath matched {len(matches)} value(s)" @@ -156,7 +155,9 @@ async def process_message( output_value = self._construct_output(extracted_data) # Yield message with the constructed output - yield message.copy_with_variables({output_id: output_value}) + yield message.copy_with_variables( + {output_var.id: output_value} + ) except Exception as e: # Emit error event to stream so frontend can display it diff --git a/qtype/interpreter/executors/file_source_executor.py b/qtype/interpreter/executors/file_source_executor.py index 51d026af..7565758b 100644 --- a/qtype/interpreter/executors/file_source_executor.py +++ b/qtype/interpreter/executors/file_source_executor.py @@ -1,13 +1,13 @@ from __future__ import annotations -from pathlib import Path from typing import AsyncIterator -import fsspec -import pandas as pd - from qtype.interpreter.base.base_step_executor import StepExecutor from qtype.interpreter.base.executor_context import ExecutorContext +from qtype.interpreter.converters import ( + dataframe_to_flow_messages, + read_dataframe_from_file, +) from qtype.interpreter.types import 
FlowMessage from qtype.semantic.model import ConstantPath, FileSource @@ -37,8 +37,6 @@ async def process_message( Yields: FlowMessages with the results of processing. """ - output_columns = {output.id for output in self.step.outputs} - # get the path if isinstance(self.step.path, ConstantPath): # type: ignore[attr-defined] file_path = self.step.path # type: ignore[attr-defined] @@ -55,30 +53,16 @@ async def process_message( f"Reading file from path: {file_path}" ) - # Determine file format from extension + # Get file path as string file_path_str = ( file_path.uri if isinstance(file_path, ConstantPath) else file_path ) - extension = Path(file_path_str).suffix.lower() - # Use fsspec to open the file and read with pandas - with fsspec.open(file_path_str, "rb") as file_handle: - if extension == ".csv": - df = pd.read_csv(file_handle) # type: ignore[arg-type] - elif extension == ".parquet": - df = pd.read_parquet(file_handle) # type: ignore[arg-type] - elif extension == ".json": - df = pd.read_json(file_handle) # type: ignore[arg-type] - elif extension == ".jsonl": - df = pd.read_json( - file_handle, - lines=True, # type: ignore[arg-type] - ) - else: - # Default to parquet if no extension or unknown - df = pd.read_parquet(file_handle) # type: ignore[arg-type] + # Read file into DataFrame using helper function + df = read_dataframe_from_file(file_path_str) - # confirm the outputs exist in the dataframe + # Validate that expected output columns are present + output_columns = {output.id for output in self.step.outputs} columns = set(df.columns) missing_columns = output_columns - columns if missing_columns: @@ -90,12 +74,15 @@ async def process_message( ) ) - for row in df.to_dict(orient="records"): - # Filter to only the expected output columns if they exist - row = { - str(k): v for k, v in row.items() if str(k) in output_columns - } - yield message.copy_with_variables(new_variables=row) + # Convert DataFrame to FlowMessages with type conversion + flow_messages = dataframe_to_flow_messages( + df, self.step.outputs, session=message.session + ) + + # Yield each message + for flow_message in flow_messages: + yield flow_message + await self.stream_emitter.status( f"Emitted {len(df)} rows from: {file_path_str}" ) diff --git a/qtype/interpreter/executors/invoke_embedding_executor.py b/qtype/interpreter/executors/invoke_embedding_executor.py index 2f4acb4f..e5e7c772 100644 --- a/qtype/interpreter/executors/invoke_embedding_executor.py +++ b/qtype/interpreter/executors/invoke_embedding_executor.py @@ -52,10 +52,7 @@ async def process_message( try: # Get the input value - input_value = message.variables.get(input_id) - - if input_value is None: - raise ValueError(f"Input variable '{input_id}' is missing") + input_value = message.get_variable(input_id) def _call(input_value=input_value): # Generate embedding based on input type diff --git a/qtype/interpreter/executors/invoke_tool_executor.py b/qtype/interpreter/executors/invoke_tool_executor.py index 184e6210..d86bf9e1 100644 --- a/qtype/interpreter/executors/invoke_tool_executor.py +++ b/qtype/interpreter/executors/invoke_tool_executor.py @@ -247,28 +247,26 @@ def _prepare_tool_inputs(self, message: FlowMessage) -> dict[str, Any]: """ tool_inputs = {} - for tool_param_name, step_var_id in self.step.input_bindings.items(): + for tool_param_name, step_variable in self.step.input_bindings.items(): # Get tool parameter definition - tool_param = self.step.tool.inputs.get(tool_param_name) + tool_param = next( + (p for p in self.step.tool.inputs if p.id == 
tool_param_name), + None, + ) if not tool_param: raise ValueError( f"Tool parameter '{tool_param_name}' not defined in tool" ) # Get value from message variables - value = message.variables.get(step_var_id) - - # Handle missing values - if value is None: - if not tool_param.optional: - raise ValueError( - ( - f"Required input '{step_var_id}' for tool " - f"parameter '{tool_param_name}' is missing" - ) - ) - # Skip optional parameters that are missing - continue + # Use default=None for optional params, let get_variable raise for required + if tool_param.optional: + value = message.get_variable(step_variable.id, default=None) + if value is None: + # Skip optional parameters that are unset + continue + else: + value = message.get_variable(step_variable.id) tool_inputs[tool_param_name] = value @@ -288,9 +286,12 @@ def _extract_tool_outputs(self, result: Any) -> dict[str, Any]: """ output_vars = {} - for tool_param_name, step_var_id in self.step.output_bindings.items(): + for tool_param_name, step_var in self.step.output_bindings.items(): # Get tool parameter definition - tool_param = self.step.tool.outputs.get(tool_param_name) + tool_param = next( + (p for p in self.step.tool.outputs if p.id == tool_param_name), + None, + ) if not tool_param: raise ValueError( f"Tool parameter '{tool_param_name}' not defined in tool" @@ -311,7 +312,7 @@ def _extract_tool_outputs(self, result: Any) -> dict[str, Any]: value = result if value is not None: - output_vars[step_var_id] = value + output_vars[step_var.id] = value return output_vars diff --git a/qtype/interpreter/executors/llm_inference_executor.py b/qtype/interpreter/executors/llm_inference_executor.py index 02934cf5..1fc0ef47 100644 --- a/qtype/interpreter/executors/llm_inference_executor.py +++ b/qtype/interpreter/executors/llm_inference_executor.py @@ -109,7 +109,7 @@ async def _process_chat( # Convert input variables to chat messages inputs = [] for input_var in self.step.inputs: - value = message.variables.get(input_var.id) + value = message.get_variable(input_var.id) # Convert any value type to ChatMessage, then to LlamaChatMessage chat_msg = variable_to_chat_message(value, input_var) inputs.append(to_chat_message(chat_msg)) diff --git a/qtype/interpreter/executors/prompt_template_executor.py b/qtype/interpreter/executors/prompt_template_executor.py index f746d1c8..af14aa02 100644 --- a/qtype/interpreter/executors/prompt_template_executor.py +++ b/qtype/interpreter/executors/prompt_template_executor.py @@ -51,9 +51,7 @@ async def process_message( input_map = {} for var in self.step.inputs: if var.id in format_args: - value = message.variables.get(var.id) - if value is not None: - input_map[var.id] = value + input_map[var.id] = message.get_variable(var.id) missing = format_args - input_map.keys() if missing: diff --git a/qtype/interpreter/tools/function_tool_helper.py b/qtype/interpreter/tools/function_tool_helper.py index abc257df..c0cec27e 100644 --- a/qtype/interpreter/tools/function_tool_helper.py +++ b/qtype/interpreter/tools/function_tool_helper.py @@ -14,7 +14,7 @@ from qtype.base.types import PrimitiveTypeEnum from qtype.dsl.model import ListType from qtype.dsl.types import PRIMITIVE_TO_PYTHON_TYPE -from qtype.semantic.model import APITool, PythonFunctionTool, ToolParameter +from qtype.semantic.model import APITool, PythonFunctionTool, Variable logger = logging.getLogger(__name__) @@ -29,9 +29,9 @@ class FunctionToolHelper: @staticmethod def _qtype_type_to_python_type( - param: ToolParameter, + param: Variable, ) -> type: - """Convert 
QType ToolParameter type to Python type for Pydantic. + """Convert QType Variable type to Python type for Pydantic. The param.type has already been resolved during semantic model creation, so we just need to convert it to the appropriate Python @@ -42,7 +42,7 @@ def _qtype_type_to_python_type( - Unknown → str Args: - param: The QType ToolParameter to convert. + param: The QType Variable to convert. Returns: Python type suitable for Pydantic field annotation. @@ -55,7 +55,8 @@ def _qtype_type_to_python_type( if isinstance(param.type, ListType): # Create a mock parameter with the element type to recursively # resolve it - element_param = ToolParameter( + element_param = Variable( + id="temp", type=param.type.element_type, optional=False, ) @@ -74,13 +75,13 @@ def _qtype_type_to_python_type( @staticmethod def _create_fn_schema( tool_name: str, - inputs: dict[str, ToolParameter], + inputs: list[Variable], ) -> type[BaseModel] | None: """Create a Pydantic model from QType tool input parameters. Args: tool_name: Name of the tool (used for model name). - inputs: Dictionary of input parameter names to ToolParameter. + inputs: List of input Variables. Returns: Pydantic BaseModel class representing the tool's input schema. @@ -91,17 +92,17 @@ def _create_fn_schema( # Each field is a tuple of (type_annotation, field_info) field_definitions: dict[str, Any] = {} - for param_name, param in inputs.items(): + for param in inputs: python_type = FunctionToolHelper._qtype_type_to_python_type(param) # Create field with optional annotation if param.optional: - field_definitions[param_name] = ( + field_definitions[param.id] = ( python_type | None, # type: ignore[valid-type] PydanticField(default=None), ) else: - field_definitions[param_name] = ( + field_definitions[param.id] = ( python_type, PydanticField(...), ) diff --git a/qtype/interpreter/types.py b/qtype/interpreter/types.py index a6fe003c..d5790863 100644 --- a/qtype/interpreter/types.py +++ b/qtype/interpreter/types.py @@ -1,11 +1,38 @@ from typing import Any, Dict, Literal, Optional, Protocol, Union -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_serializer from qtype.base.types import StrictBaseModel from qtype.dsl.domain_types import ChatMessage from qtype.semantic.model import Step + +class _UnsetType: + """Sentinel representing an unset variable. 
+ + Distinguishes between: + - Variable never mentioned (not in dict) + - Variable explicitly unset (UNSET value in dict) + - Variable set to None (None value in dict) + """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __repr__(self) -> str: + return "UNSET" + + def __bool__(self) -> bool: + return False + + +UNSET = _UnsetType() + + # Stream Event Types (Discriminated Union) # These events are emitted by executors during flow execution # and can be converted to Vercel UI chunks for frontend display @@ -293,8 +320,9 @@ class FlowMessage(BaseModel): """ model_config = ConfigDict( - frozen=True - ) # Enforces immutability at the model level + frozen=True, + arbitrary_types_allowed=True, # Allow UNSET sentinel + ) session: Session variables: Dict[str, Any] = Field( @@ -307,6 +335,49 @@ def is_failed(self) -> bool: """Checks if this state has encountered an error.""" return self.error is not None + def is_set(self, var_id: str) -> bool: + """Check if a variable is set (not UNSET, may be None).""" + value = self.variables.get(var_id, UNSET) + return value is not UNSET + + def get_variable(self, var_id: str, *, default: Any = UNSET) -> Any: + """Get variable value, raising if unset and no default provided. + + Args: + var_id: Variable identifier + default: Value to return if variable is unset. If not provided, + raises ValueError on unset variables. + + Returns: + Variable value (may be None if explicitly set to None) + + Raises: + ValueError: If variable is unset and no default provided + + Examples: + # Required variable - throws if unset + value = message.get_variable("user_input") + + # Optional variable - returns None if unset + value = message.get_variable("optional_field", default=None) + + # Optional with custom default + value = message.get_variable("count", default=0) + """ + value = self.variables.get(var_id, UNSET) + + if value is UNSET: + if default is UNSET: + raise ValueError( + ( + f"Required variable '{var_id}' is not set. " + f"Available variables: {list(self.variables.keys())}" + ) + ) + return default + + return value + def copy_with_error(self, step_id: str, exc: Exception) -> "FlowMessage": """Returns a copy of this state marked as failed.""" return self.model_copy( @@ -319,15 +390,29 @@ def copy_with_error(self, step_id: str, exc: Exception) -> "FlowMessage": } ) - # It's useful to have copy-on-write style helpers def copy_with_variables( self, new_variables: dict[str, Any] ) -> "FlowMessage": + """Create a new FlowMessage with updated variables. + + Note: Can set variables to UNSET to explicitly mark them as unset. 
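+
+        Illustrative (hypothetical variable id): copy_with_variables(
+        {"draft": UNSET}) returns a copy where get_variable("draft") raises
+        or falls back to its default, as if "draft" had never been set.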
+ """ new_vars = self.variables.copy() new_vars.update(new_variables) new_state = self.model_copy(update={"variables": new_vars}) return new_state + @model_serializer + def serialize_model(self): + """Custom serialization that excludes UNSET variables.""" + return { + "session": self.session, + "variables": { + k: v for k, v in self.variables.items() if v is not UNSET + }, + "error": self.error, + } + class InterpreterError(Exception): """Base exception class for ProtoGen interpreter errors.""" diff --git a/qtype/interpreter/typing.py b/qtype/interpreter/typing.py index 8029b5a3..61d95deb 100644 --- a/qtype/interpreter/typing.py +++ b/qtype/interpreter/typing.py @@ -1,9 +1,9 @@ from __future__ import annotations import uuid -from typing import Any, Type, get_origin +from typing import Any, Type -from pydantic import BaseModel, Field, TypeAdapter, create_model +from pydantic import BaseModel, Field, create_model from qtype.dsl.model import ListType, PrimitiveTypeEnum from qtype.dsl.model import Variable as DSLVariable @@ -138,37 +138,36 @@ def flow_results_to_output_container( return output_container(outputs=outputs, errors=errors) -def instantiate_variable(variable: DSLVariable, value: Any) -> Any: +def convert_dict_to_typed_variables( + data: dict[str, Any], variables: list[Variable] +) -> dict[str, Any]: """ - Unified contract to ensure data matches its QType definition. - Handles CustomTypes, DomainTypes, and Primitives. + Convert a dictionary of raw values to properly typed variables. + + Uses Pydantic model validation to convert all values at once based on + Variable type declarations. This is more efficient than converting each + field individually. + + Args: + data: Dictionary with raw values (e.g., from DataFrame row) + variables: List of Variable definitions with type information + + Returns: + Dictionary with values converted to their declared types + + Raises: + ValidationError: If values cannot be converted to declared types """ - target_type, _ = _get_variable_type(variable) + # Create a Pydantic model from the variable definitions + model_class = create_model( + "TypedVariables", + __base__=BaseModel, + **_fields_from_variables(variables), + ) - # 1. Handle the 'Parameterized Generic' Check (The isinstance fix) - # We check if target_type is a generic (like list[T]) vs a simple class. - origin = get_origin(target_type) + # Validate and convert the data using Pydantic + validated = model_class.model_validate(data) - if origin is None: - # It's a simple type (int, RAGChunk, etc.) - if isinstance(value, target_type): - return value - else: - # It's a generic (list[str], etc.). - # We skip the identity check and let TypeAdapter handle it. - pass - - # 2. Handle Pydantic Models (Custom/Domain Types) - if hasattr(target_type, "model_validate"): - return target_type.model_validate(value) # type: ignore[misc] - - # 3. 
Handle Primitives & Complex Python Types (List, Optional, Union) - try: - # TypeAdapter is the "V2 way" to validate things that aren't - # full Pydantic models (like List[int] or Optional[str]) - return TypeAdapter(target_type).validate_python(value) - except Exception: - # Fallback to your original manual cast if TypeAdapter is overkill - if isinstance(target_type, type): - return target_type(value) - raise ValueError(f"Unsupported target type: {target_type}") + # Return as dict but preserve actual typed instances (not serialized) + # Use __dict__ to get the actual field values without serialization + return dict(validated) diff --git a/qtype/semantic/generate.py b/qtype/semantic/generate.py index bb9c062d..bef20737 100644 --- a/qtype/semantic/generate.py +++ b/qtype/semantic/generate.py @@ -144,8 +144,7 @@ def generate_semantic_model(args: argparse.Namespace) -> None: CustomType, DecoderFormat, ListType, - PrimitiveTypeEnum, - ToolParameter + PrimitiveTypeEnum ) from qtype.dsl.model import Variable as DSLVariable # noqa: F401 from qtype.dsl.model import VariableType # noqa: F401 @@ -158,10 +157,8 @@ def generate_semantic_model(args: argparse.Namespace) -> None: f.write( dedent(''' class Variable(DSLVariable, BaseModel): - """Semantic version of DSL Variable with ID references resolved.""" - value: Any | None = Field(None, description="The value of the variable") - def is_set(self) -> bool: - return self.value is not None + """Semantic version of DSL Variable.""" + pass ''').lstrip() ) diff --git a/qtype/semantic/model.py b/qtype/semantic/model.py index c3891176..4b79cdd6 100644 --- a/qtype/semantic/model.py +++ b/qtype/semantic/model.py @@ -32,19 +32,15 @@ DecoderFormat, ListType, PrimitiveTypeEnum, - ToolParameter, ) from qtype.dsl.model import Variable as DSLVariable # noqa: F401 from qtype.semantic.base_types import ImmutableModel class Variable(DSLVariable, BaseModel): - """Semantic version of DSL Variable with ID references resolved.""" + """Semantic version of DSL Variable.""" - value: Any | None = Field(None, description="The value of the variable") - - def is_set(self) -> bool: - return self.value is not None + pass class AuthorizationProvider(ImmutableModel): @@ -66,12 +62,12 @@ class Tool(ImmutableModel): description: str = Field( ..., description="Description of what the tool does." ) - inputs: dict[str, ToolParameter] = Field( - default_factory=dict, + inputs: list[Variable] = Field( + default_factory=list, description="Input parameters required by this tool.", ) - outputs: dict[str, ToolParameter] = Field( - default_factory=dict, + outputs: list[Variable] = Field( + default_factory=list, description="Output parameters produced by this tool.", ) @@ -421,9 +417,9 @@ class APITool(Tool): default_factory=dict, description="Optional HTTP headers to include in the request.", ) - parameters: dict[str, ToolParameter] = Field( - default_factory=dict, - description="Output parameters produced by this tool.", + parameters: list[Variable] = Field( + default_factory=list, + description="Path and query parameters for the API call.", ) @@ -556,6 +552,9 @@ class FieldExtractor(Step): The extracted data is used to construct the output variable by passing it as keyword arguments to the output type's constructor. + If there is no match and the output variable is optional, it is set to None. + If there is no match and the output variable is required, an error is raised. 
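+    For example, an output declared as `text?` would be emitted as None when
+    the JSONPath matches nothing, while a plain `text` output raises instead.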
+ Example JSONPath expressions: - `$.field_name` - Extract a single field - `$.items[*]` - Extract all items from a list @@ -567,10 +566,6 @@ class FieldExtractor(Step): ..., description="JSONPath expression to extract data from the input. Uses jsonpath-ng syntax.", ) - fail_on_missing: bool = Field( - True, - description="Whether to raise an error if the JSONPath matches no data. If False, returns None.", - ) class InvokeEmbedding(Step, ConcurrentStepMixin): diff --git a/schema/qtype.schema.json b/schema/qtype.schema.json index a8614343..c59cd85f 100644 --- a/schema/qtype.schema.json +++ b/schema/qtype.schema.json @@ -68,20 +68,20 @@ "type": "string" }, "inputs": { - "additionalProperties": { - "$ref": "#/$defs/ToolParameter" - }, "description": "Input parameters required by this tool.", + "items": { + "$ref": "#/$defs/Variable" + }, "title": "Inputs", - "type": "object" + "type": "array" }, "outputs": { - "additionalProperties": { - "$ref": "#/$defs/ToolParameter" - }, "description": "Output parameters produced by this tool.", + "items": { + "$ref": "#/$defs/Variable" + }, "title": "Outputs", - "type": "object" + "type": "array" }, "type": { "const": "APITool", @@ -125,12 +125,12 @@ "type": "object" }, "parameters": { - "additionalProperties": { - "$ref": "#/$defs/ToolParameter" + "description": "Path and query parameters for the API call.", + "items": { + "$ref": "#/$defs/Variable" }, - "description": "Output parameters produced by this tool.", "title": "Parameters", - "type": "object" + "type": "array" } }, "required": [ @@ -1969,7 +1969,7 @@ }, "FieldExtractor": { "additionalProperties": false, - "description": "Extracts specific fields from input data using JSONPath expressions.\n\nThis step uses JSONPath syntax to extract data from structured inputs\n(Pydantic models, dicts, lists). The input is first converted to a dict\nusing model_dump() if it's a Pydantic model, then the JSONPath expression\nis evaluated.\n\nIf the JSONPath matches multiple values, the step yields multiple output\nmessages (1-to-many cardinality). If it matches a single value, it yields\none output message. If it matches nothing, it raises an error.\n\nThe extracted data is used to construct the output variable by passing it\nas keyword arguments to the output type's constructor.\n\nExample JSONPath expressions:\n- `$.field_name` - Extract a single field\n- `$.items[*]` - Extract all items from a list\n- `$.items[?(@.price > 10)]` - Filter items by condition", + "description": "Extracts specific fields from input data using JSONPath expressions.\n\nThis step uses JSONPath syntax to extract data from structured inputs\n(Pydantic models, dicts, lists). The input is first converted to a dict\nusing model_dump() if it's a Pydantic model, then the JSONPath expression\nis evaluated.\n\nIf the JSONPath matches multiple values, the step yields multiple output\nmessages (1-to-many cardinality). If it matches a single value, it yields\none output message. 
If it matches nothing, it raises an error.\n\nThe extracted data is used to construct the output variable by passing it\nas keyword arguments to the output type's constructor.\n\nIf there is no match and the output variable is optional, it is set to None.\nIf there is no match and the output variable is required, an error is raised.\n\nExample JSONPath expressions:\n- `$.field_name` - Extract a single field\n- `$.items[*]` - Extract all items from a list\n- `$.items[?(@.price > 10)]` - Filter items by condition", "properties": { "cache_config": { "anyOf": [ @@ -2028,12 +2028,6 @@ "description": "JSONPath expression to extract data from the input. Uses jsonpath-ng syntax.", "title": "Json Path", "type": "string" - }, - "fail_on_missing": { - "default": true, - "description": "Whether to raise an error if the JSONPath matches no data. If False, returns None.", - "title": "Fail On Missing", - "type": "boolean" } }, "required": [ @@ -3259,20 +3253,20 @@ "type": "string" }, "inputs": { - "additionalProperties": { - "$ref": "#/$defs/ToolParameter" - }, "description": "Input parameters required by this tool.", + "items": { + "$ref": "#/$defs/Variable" + }, "title": "Inputs", - "type": "object" + "type": "array" }, "outputs": { - "additionalProperties": { - "$ref": "#/$defs/ToolParameter" - }, "description": "Output parameters produced by this tool.", + "items": { + "$ref": "#/$defs/Variable" + }, "title": "Outputs", - "type": "object" + "type": "array" }, "type": { "const": "PythonFunctionTool", @@ -3687,37 +3681,6 @@ "title": "ToolList", "type": "array" }, - "ToolParameter": { - "description": "Defines a tool input or output parameter with type and optional flag.", - "properties": { - "type": { - "anyOf": [ - { - "$ref": "#/$defs/PrimitiveTypeEnum" - }, - {}, - { - "$ref": "#/$defs/ListType" - }, - { - "type": "string" - } - ], - "title": "Type" - }, - "optional": { - "default": false, - "description": "Whether this parameter is optional", - "title": "Optional", - "type": "boolean" - } - }, - "required": [ - "type" - ], - "title": "ToolParameter", - "type": "object" - }, "TypeList": { "description": "Schema for a standalone list of type definitions.", "items": { @@ -3751,6 +3714,12 @@ "description": "Type of data expected or produced. Either a CustomType or domain specific type.", "title": "Type" }, + "optional": { + "default": false, + "description": "Whether this variable can be unset or None. Use '?' suffix in type string as shorthand (e.g., 'text?').", + "title": "Optional", + "type": "boolean" + }, "ui": { "anyOf": [ { diff --git a/tests/application/test_tools_from_module.py b/tests/application/test_tools_from_module.py index acacae01..baa31443 100644 --- a/tests/application/test_tools_from_module.py +++ b/tests/application/test_tools_from_module.py @@ -115,22 +115,32 @@ def greet(name: str, greeting: str = "Hello") -> str: assert add_tool.description == "Add two numbers." 
assert add_tool.inputs is not None assert len(add_tool.inputs) == 2 - assert "a" in add_tool.inputs - assert "b" in add_tool.inputs - assert add_tool.inputs["a"].type == PrimitiveTypeEnum.int - assert add_tool.inputs["b"].type == PrimitiveTypeEnum.int - assert not add_tool.inputs["a"].optional - assert not add_tool.inputs["b"].optional + + # Find variables by id in the list + add_a = next((v for v in add_tool.inputs if v.id == "a"), None) + add_b = next((v for v in add_tool.inputs if v.id == "b"), None) + assert add_a is not None + assert add_b is not None + assert add_a.type == PrimitiveTypeEnum.int + assert add_b.type == PrimitiveTypeEnum.int + assert not add_a.optional + assert not add_b.optional greet_tool = next(t for t in tools if t.name == "greet") assert greet_tool.inputs is not None assert len(greet_tool.inputs) == 2 - assert "name" in greet_tool.inputs - assert "greeting" in greet_tool.inputs - assert greet_tool.inputs["name"].type == PrimitiveTypeEnum.text - assert greet_tool.inputs["greeting"].type == PrimitiveTypeEnum.text - assert not greet_tool.inputs["name"].optional - assert greet_tool.inputs["greeting"].optional + + # Find variables by id in the list + greet_name = next((v for v in greet_tool.inputs if v.id == "name"), None) + greet_greeting = next( + (v for v in greet_tool.inputs if v.id == "greeting"), None + ) + assert greet_name is not None + assert greet_greeting is not None + assert greet_name.type == PrimitiveTypeEnum.text + assert greet_greeting.type == PrimitiveTypeEnum.text + assert not greet_name.optional + assert greet_greeting.optional def test_tools_from_module_errors(): @@ -195,10 +205,11 @@ def bad_func(x: int): def test_create_tool_from_function(mock_func_info, docstring, expected): """Test tool creation from function metadata.""" mock_func_info["docstring"] = docstring - custom_types = {} + custom_type_registry = {} + custom_type_models = {} tool = _create_tool_from_function( - "test_func", mock_func_info, custom_types + "test_func", mock_func_info, custom_type_registry, custom_type_models ) assert isinstance(tool, PythonFunctionTool) @@ -206,9 +217,12 @@ def test_create_tool_from_function(mock_func_info, docstring, expected): assert tool.description == expected assert tool.inputs is not None assert len(tool.inputs) == 1 - assert "x" in tool.inputs - assert tool.inputs["x"].type == PrimitiveTypeEnum.text - assert not tool.inputs["x"].optional + + # Find variable by id in the list + input_x = next((v for v in tool.inputs if v.id == "x"), None) + assert input_x is not None + assert input_x.type == PrimitiveTypeEnum.text + assert not input_x.optional def test_create_tool_no_parameters(): @@ -222,12 +236,17 @@ def test_create_tool_no_parameters(): "module": "test_module", } - tool = _create_tool_from_function("no_params", func_info, {}) - assert tool.inputs == {} + tool = _create_tool_from_function("no_params", func_info, {}, {}) + assert tool.inputs == [] assert len(tool.outputs) > 0 - assert "result" in tool.outputs - assert tool.outputs["result"].type == PrimitiveTypeEnum.text - assert not tool.outputs["result"].optional + + # Find result variable in outputs list + result_output = next( + (v for v in tool.outputs if v.id == "no_params_result"), None + ) + assert result_output is not None + assert result_output.type == PrimitiveTypeEnum.text + assert not result_output.optional @pytest.mark.parametrize( @@ -240,13 +259,17 @@ def test_create_tool_no_parameters(): ) def test_pydantic_to_custom_types(model_cls, expected_props): """Test Pydantic model 
conversion to custom types.""" - custom_types = {} - result = _pydantic_to_custom_types(model_cls, custom_types) + custom_type_registry = {} + custom_type_models = {} + result = _pydantic_to_custom_types( + model_cls, custom_type_registry, custom_type_models + ) assert result == model_cls.__name__ - assert model_cls.__name__ in custom_types + assert model_cls.__name__ in custom_type_registry + assert model_cls.__name__ in custom_type_models - custom_type = custom_types[model_cls.__name__] + custom_type = custom_type_models[model_cls.__name__] assert custom_type.properties == expected_props @@ -257,11 +280,15 @@ def test_pydantic_already_processed(): from qtype.dsl.model import CustomType mock_custom_type = cast(CustomType, Mock()) - custom_types = {"SampleModel": mock_custom_type} - result = _pydantic_to_custom_types(SampleModel, custom_types) + custom_type_registry = {"SampleModel": Mock()} + custom_type_models = {"SampleModel": mock_custom_type} + result = _pydantic_to_custom_types( + SampleModel, custom_type_registry, custom_type_models + ) assert result == "SampleModel" - assert len(custom_types) == 1 + assert len(custom_type_registry) == 1 + assert len(custom_type_models) == 1 def test_pydantic_no_annotation(): @@ -271,7 +298,7 @@ def test_pydantic_no_annotation(): mock_model.model_fields = {"bad_field": Mock(annotation=None)} with pytest.raises(TypeError, match="must have a type hint"): - _pydantic_to_custom_types(mock_model, {}) # type: ignore + _pydantic_to_custom_types(mock_model, {}, {}) # type: ignore @pytest.mark.parametrize( @@ -284,23 +311,27 @@ def test_pydantic_no_annotation(): ) def test_map_python_type_to_variable_type_primitives(python_type, expected): """Test mapping primitive types.""" - result = _map_python_type_to_variable_type(python_type, {}) + result = _map_python_type_to_variable_type(python_type, {}, {}) assert result == expected def test_map_python_type_pydantic_model(): """Test mapping Pydantic model.""" - custom_types = {} - result = _map_python_type_to_variable_type(SampleModel, custom_types) + custom_type_registry = {} + custom_type_models = {} + result = _map_python_type_to_variable_type( + SampleModel, custom_type_registry, custom_type_models + ) assert result == "SampleModel" - assert "SampleModel" in custom_types + assert "SampleModel" in custom_type_registry + assert "SampleModel" in custom_type_models def test_map_python_type_unsupported(): """Test error for unsupported type.""" with pytest.raises(ValueError, match="Unsupported Python type"): - _map_python_type_to_variable_type(dict, {}) + _map_python_type_to_variable_type(dict, {}, {}) @pytest.mark.parametrize( @@ -313,5 +344,5 @@ def test_map_python_type_unsupported(): ) def test_map_python_type_to_type_str(python_type, expected): """Test type to string mapping.""" - result = _map_python_type_to_type_str(python_type, {}) + result = _map_python_type_to_type_str(python_type, {}, {}) assert result == expected diff --git a/tests/document-specs/invalid_tool_list_duplicate_ids.qtype.yaml b/tests/document-specs/invalid_tool_list_duplicate_ids.qtype.yaml index 3df7f515..d5911bf3 100644 --- a/tests/document-specs/invalid_tool_list_duplicate_ids.qtype.yaml +++ b/tests/document-specs/invalid_tool_list_duplicate_ids.qtype.yaml @@ -3,8 +3,8 @@ name: Tool1 description: Test tool endpoint: https://api.test.com - inputs: {} - outputs: {} + inputs: [] + outputs: [] - id: tool1 type: PythonFunctionTool @@ -12,5 +12,5 @@ description: Another test tool function_name: test_func module_path: test.module - inputs: {} - 
outputs: {} + inputs: [] + outputs: [] diff --git a/tests/document-specs/mixed_types.json b/tests/document-specs/mixed_types.json new file mode 100644 index 00000000..ade9d2c6 --- /dev/null +++ b/tests/document-specs/mixed_types.json @@ -0,0 +1,17 @@ +[ + { + "person": {"name": "Alice Johnson", "age": 30, "email": "alice@example.com"}, + "result": {"content": "Found information about Python", "doc_id": "doc-123", "score": 0.95}, + "rank": 1 + }, + { + "person": {"name": "Bob Smith", "age": 25, "email": "bob@example.com"}, + "result": {"content": "Information about testing frameworks", "doc_id": "doc-456", "score": 0.87}, + "rank": 2 + }, + { + "person": {"name": "Carol Davis", "age": 35, "email": "carol@example.com"}, + "result": {"content": "Documentation on async programming", "doc_id": "doc-789", "score": 0.72}, + "rank": 3 + } +] diff --git a/tests/document-specs/referenced_tool_library.qtype.yaml b/tests/document-specs/referenced_tool_library.qtype.yaml index 6139e571..a49d8139 100644 --- a/tests/document-specs/referenced_tool_library.qtype.yaml +++ b/tests/document-specs/referenced_tool_library.qtype.yaml @@ -14,6 +14,6 @@ tools: endpoint: https://example.com method: GET auth: lib_bearer - inputs: {} - outputs: {} - parameters: {} + inputs: [] + outputs: [] + parameters: [] diff --git a/tests/document-specs/valid_tool_list.qtype.yaml b/tests/document-specs/valid_tool_list.qtype.yaml index 08ff1ee6..c0b4f8a7 100644 --- a/tests/document-specs/valid_tool_list.qtype.yaml +++ b/tests/document-specs/valid_tool_list.qtype.yaml @@ -5,11 +5,11 @@ endpoint: https://api.weather.com/v1/current method: GET inputs: - location: + - id: location type: text optional: false outputs: - temperature: + - id: temperature type: float optional: false @@ -20,13 +20,13 @@ function_name: calculate_sum module_path: my_tools.math_utils inputs: - a: + - id: a type: int optional: false - b: + - id: b type: int optional: false outputs: - result: + - id: result type: int optional: false diff --git a/tests/dsl/test_list_type.py b/tests/dsl/test_list_type.py index 3fb8a9b1..159c973f 100644 --- a/tests/dsl/test_list_type.py +++ b/tests/dsl/test_list_type.py @@ -54,14 +54,14 @@ def test_list_type_yaml_loading(): endpoint: https://api.example.com/test method: POST inputs: - urls: + - id: urls type: list[text] optional: false - query: + - id: query type: text optional: false outputs: - result: + - id: result type: text optional: false """ @@ -77,7 +77,8 @@ def test_list_type_yaml_loading(): assert len(document.root) == 1 tool = document.root[0] - urls_param = tool.inputs["urls"] + urls_param = next((v for v in tool.inputs if v.id == "urls"), None) + assert urls_param is not None assert isinstance(urls_param.type, ListType) assert urls_param.type.element_type == PrimitiveTypeEnum.text assert not urls_param.optional @@ -91,11 +92,13 @@ def test_list_type_with_python_functions(): from qtype.dsl.model import ListType # Test list[str] -> ListType - result = _map_python_type_to_variable_type(list[str], {}) + result = _map_python_type_to_variable_type(list[str], {}, {}) assert isinstance(result, ListType) assert result.element_type == PrimitiveTypeEnum.text # Test list[int] -> ListType - result = _map_python_type_to_variable_type(list[int], {}) + result = _map_python_type_to_variable_type(list[int], {}, {}) + assert isinstance(result, ListType) + assert result.element_type == PrimitiveTypeEnum.int assert isinstance(result, ListType) assert result.element_type == PrimitiveTypeEnum.int diff --git 
a/tests/dsl/test_optional_variables.py b/tests/dsl/test_optional_variables.py new file mode 100644 index 00000000..f03eadd6 --- /dev/null +++ b/tests/dsl/test_optional_variables.py @@ -0,0 +1,82 @@ +"""Tests for optional variable syntax and behavior.""" + +from qtype.dsl.model import Variable + + +def test_variable_with_question_mark_syntax(): + """Test that '?' suffix marks variable as optional.""" + var = Variable(id="optional_var", type="text?") + + assert var.optional is True + assert var.type == "text" + + +def test_variable_without_question_mark(): + """Test that variables without '?' are required by default.""" + var = Variable(id="required_var", type="text") + + assert var.optional is False + + +def test_variable_explicit_optional_flag(): + """Test explicit optional=True works.""" + var = Variable(id="optional_var", type="text", optional=True) + + assert var.optional is True + assert var.type == "text" + + +def test_variable_serialization_with_optional(): + """Test that optional variables serialize with '?' suffix.""" + var = Variable(id="opt_var", type="text?") + serialized = var.model_dump() + + assert serialized["type"] == "text?" + assert serialized["id"] == "opt_var" + + +def test_variable_serialization_without_optional(): + """Test that required variables serialize without '?' suffix.""" + var = Variable(id="req_var", type="text") + serialized = var.model_dump() + + assert serialized["type"] == "text" + assert serialized["id"] == "req_var" + + +def test_optional_with_list_type(): + """Test optional syntax works with list types.""" + var = Variable(id="optional_list", type="list[text]?") + + assert var.optional is True + # Type should be parsed as list[text] + serialized = var.model_dump() + assert serialized["type"] == "list[text]?" + + +# TODO: support nested options? Seems overkill for now..? +# def test_optional_within_list_type(): +# """Test optional syntax on list element types.""" +# var = Variable(id="optional_item_list", type="list[text?]")

# assert var.optional is False +# # Type should be parsed as list[text] +# serialized = var.model_dump() +# assert serialized["type"] == "list[text?]" + + +def test_optional_with_number_type(): + """Test optional syntax works with numeric types.""" + var = Variable(id="optional_num", type="int?") + + assert var.optional is True + assert var.type == "int" + + +def test_question_mark_overrides_explicit_optional(): + """Test that the '?' suffix overrides an explicit optional=False.""" + # The validator should strip '?' and set optional=True + var = Variable(id="var", type="text?", optional=False) + + # The '?' 
in type should take precedence + assert var.optional is True diff --git a/tests/interpreter/test_aggregate_executor.py b/tests/interpreter/test_aggregate_executor.py index af5121a1..fa5da3f7 100644 --- a/tests/interpreter/test_aggregate_executor.py +++ b/tests/interpreter/test_aggregate_executor.py @@ -25,7 +25,7 @@ def aggregate_step(): id="aggregate-step", type="Aggregate", inputs=[], - outputs=[Variable(id="stats", type="AggregateStats", value=None)], + outputs=[Variable(id="stats", type="AggregateStats")], ) diff --git a/tests/interpreter/test_cache.py b/tests/interpreter/test_cache.py index a81316ad..dcab4e68 100644 --- a/tests/interpreter/test_cache.py +++ b/tests/interpreter/test_cache.py @@ -27,10 +27,10 @@ class CacheTestStep(Step): type: str = "CacheTestStep" inputs: list[Variable] = [ - Variable(id="input", type=PrimitiveTypeEnum.text, value=None) + Variable(id="input", type=PrimitiveTypeEnum.text) ] outputs: list[Variable] = [ - Variable(id="result", type=PrimitiveTypeEnum.text, value=None) + Variable(id="result", type=PrimitiveTypeEnum.text) ] @@ -137,7 +137,7 @@ async def test_cache_includes_referenced_variables_outside_inputs(): # First execution with file1 input_df1 = pd.DataFrame([{"file_path": file1_path}]) messages1 = dataframe_to_flow_messages( - input_df1, Session(session_id="test") + input_df1, flow.inputs, session=Session(session_id="test") ) results1 = await run_flow(flow, messages1, context=context) result_df1 = flow_messages_to_dataframe(results1, flow) @@ -146,7 +146,7 @@ async def test_cache_includes_referenced_variables_outside_inputs(): # Second execution with file2 should NOT use cached result from file1 input_df2 = pd.DataFrame([{"file_path": file2_path}]) messages2 = dataframe_to_flow_messages( - input_df2, Session(session_id="test") + input_df2, flow.inputs, session=Session(session_id="test") ) results2 = await run_flow(flow, messages2, context=context) result_df2 = flow_messages_to_dataframe(results2, flow) diff --git a/tests/interpreter/test_collect_executor.py b/tests/interpreter/test_collect_executor.py index d5c72738..70648eef 100644 --- a/tests/interpreter/test_collect_executor.py +++ b/tests/interpreter/test_collect_executor.py @@ -16,8 +16,8 @@ async def test_collect_emits_single_list_with_common_ancestors( ): """Test that Collect emits one list and propagates only common ancestors.""" # Create variables - input_var = Variable(id="item", type="text", value=None) - output_var = Variable(id="items", type="list[text]", value=None) + input_var = Variable(id="item", type="text") + output_var = Variable(id="items", type="list[text]") # Create Collect step collect_step = Collect( diff --git a/tests/interpreter/test_construct_executor.py b/tests/interpreter/test_construct_executor.py index 6264d12a..c380c4f0 100644 --- a/tests/interpreter/test_construct_executor.py +++ b/tests/interpreter/test_construct_executor.py @@ -62,9 +62,7 @@ async def test_construct_primitive_type(executor_context): async def test_construct_list_type(executor_context): """Test constructing a list type.""" # Create variables - input_var = Variable( - id="items_input", type=PrimitiveTypeEnum.text, value=None - ) + input_var = Variable(id="items_input", type=PrimitiveTypeEnum.text) output_var = Variable( id="items_list", type=ListType(element_type=PrimitiveTypeEnum.int), diff --git a/tests/interpreter/test_dataframe_to_flow_messages.py b/tests/interpreter/test_dataframe_to_flow_messages.py new file mode 100644 index 00000000..8c7ea597 --- /dev/null +++ 
b/tests/interpreter/test_dataframe_to_flow_messages.py
@@ -0,0 +1,190 @@
+"""Tests for dataframe_to_flow_messages converter."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+
+from qtype.base.types import PrimitiveTypeEnum
+from qtype.dsl.domain_types import SearchResult
+from qtype.interpreter.converters import dataframe_to_flow_messages
+from qtype.semantic.loader import load
+from qtype.semantic.model import Variable
+
+
+def test_dataframe_to_flow_messages_with_primitives():
+    """Test converting DataFrame with primitive types to FlowMessages."""
+    # Create DataFrame with primitive types
+    df = pd.DataFrame(
+        {
+            "id": ["1", "2", "3"],
+            "name": ["Alice", "Bob", "Carol"],
+            "age": ["30", "25", "35"],
+            "score": ["95.5", "87.3", "92.1"],
+        }
+    )
+
+    # Define variables
+    variables = [
+        Variable(id="id", type=PrimitiveTypeEnum.int),
+        Variable(id="name", type=PrimitiveTypeEnum.text),
+        Variable(id="age", type=PrimitiveTypeEnum.int),
+        Variable(id="score", type=PrimitiveTypeEnum.float),
+    ]
+
+    # Convert to FlowMessages
+    messages = dataframe_to_flow_messages(df, variables)
+
+    # Verify we got 3 messages
+    assert len(messages) == 3
+
+    # Check first message
+    first_vars = messages[0].variables
+    assert first_vars["id"] == 1
+    assert isinstance(first_vars["id"], int)
+    assert first_vars["name"] == "Alice"
+    assert isinstance(first_vars["name"], str)
+    assert first_vars["age"] == 30
+    assert isinstance(first_vars["age"], int)
+    assert first_vars["score"] == 95.5
+    assert isinstance(first_vars["score"], float)
+
+    # Check second message
+    second_vars = messages[1].variables
+    assert second_vars["id"] == 2
+    assert second_vars["name"] == "Bob"
+    assert second_vars["age"] == 25
+    assert second_vars["score"] == 87.3
+
+    # Check third message
+    third_vars = messages[2].variables
+    assert third_vars["id"] == 3
+    assert third_vars["name"] == "Carol"
+    assert third_vars["age"] == 35
+    assert third_vars["score"] == 92.1
+
+
+def test_dataframe_to_flow_messages_with_domain_type():
+    """Test converting DataFrame with domain type cells to FlowMessages.
+
+    This simulates reading from parquet where SearchResult instances were
+    persisted as dicts via model_dump().
+    """
+    # Create SearchResult instances and dump to dicts (as would happen in parquet)
+    result1 = SearchResult(
+        content="Found relevant information about Python",
+        doc_id="doc-123",
+        score=0.95,
+    )
+    result2 = SearchResult(
+        content="Another search result about testing",
+        doc_id="doc-456",
+        score=0.87,
+    )
+    result3 = SearchResult(
+        content="Third result with lower relevance",
+        doc_id="doc-789",
+        score=0.72,
+    )
+
+    # Create DataFrame with dict cells (as read from parquet)
+    df = pd.DataFrame(
+        {
+            "result": [
+                result1.model_dump(),
+                result2.model_dump(),
+                result3.model_dump(),
+            ]
+        }
+    )
+
+    # Define variables - result should be converted to SearchResult
+    variables = [Variable(id="result", type=SearchResult)]
+
+    # Convert to FlowMessages
+    messages = dataframe_to_flow_messages(df, variables)
+
+    # Verify we got 3 messages
+    assert len(messages) == 3
+
+    # Check first message - should have SearchResult instance
+    first_result = messages[0].variables["result"]
+    assert isinstance(first_result, SearchResult)
+    assert first_result.content == "Found relevant information about Python"
+    assert first_result.doc_id == "doc-123"
+    assert first_result.score == 0.95
+
+    # Check second message
+    second_result = messages[1].variables["result"]
+    assert isinstance(second_result, SearchResult)
+    assert second_result.content == "Another search result about testing"
+    assert second_result.doc_id == "doc-456"
+    assert second_result.score == 0.87
+
+    # Check third message
+    third_result = messages[2].variables["result"]
+    assert isinstance(third_result, SearchResult)
+    assert third_result.content == "Third result with lower relevance"
+    assert third_result.doc_id == "doc-789"
+    assert third_result.score == 0.72
+
+
+def test_dataframe_to_flow_messages_with_custom_type():
+    """Test converting DataFrame with custom type cells to FlowMessages.
+
+    This simulates reading from CSV/parquet where custom type instances were
+    persisted as dicts via model_dump().
+    """
+    # Load the custom type from spec
+    spec_path = Path("tests/specs/file_source_custom_types.qtype.yaml")
+    semantic_model, type_registry = load(spec_path)
+
+    # Get the Person type from the registry
+    Person = type_registry["Person"]
+
+    # Create Person instances and dump to dicts (as would happen in CSV/parquet)
+    person1 = Person(name="Alice Johnson", age=30, email="alice@example.com")
+    person2 = Person(name="Bob Smith", age=25, email="bob@example.com")
+    person3 = Person(name="Carol Davis", age=35, email="carol@example.com")
+
+    # Create DataFrame with dict cells (as read from CSV/parquet)
+    df = pd.DataFrame(
+        {
+            "person": [
+                person1.model_dump(),
+                person2.model_dump(),
+                person3.model_dump(),
+            ]
+        }
+    )
+
+    # Define variables - person should be converted to Person type
+    variables = [Variable(id="person", type=Person)]
+
+    # Convert to FlowMessages
+    messages = dataframe_to_flow_messages(df, variables)
+
+    # Verify we got 3 messages
+    assert len(messages) == 3
+
+    # Check first message - should have Person instance
+    first_person = messages[0].variables["person"]
+    assert isinstance(first_person, Person)
+    assert first_person.name == "Alice Johnson"
+    assert first_person.age == 30
+    assert first_person.email == "alice@example.com"
+
+    # Check second message
+    second_person = messages[1].variables["person"]
+    assert isinstance(second_person, Person)
+    assert second_person.name == "Bob Smith"
+    assert second_person.age == 25
+    assert second_person.email == "bob@example.com"
+
+    # Check third message
+    third_person = messages[2].variables["person"]
+    assert isinstance(third_person, Person)
+    assert third_person.name == "Carol Davis"
+    assert third_person.age == 35
+    assert third_person.email == "carol@example.com"
diff --git a/tests/interpreter/test_echo_executor.py b/tests/interpreter/test_echo_executor.py
index 11e61259..36a40b13 100644
--- a/tests/interpreter/test_echo_executor.py
+++ b/tests/interpreter/test_echo_executor.py
@@ -12,8 +12,8 @@ async def test_echo_passes_through_variables(executor_context):
     """Test that Echo step passes through all input variables to outputs."""
     # Create variables
-    var1 = Variable(id="test_var1", type="text", value=None)
-    var2 = Variable(id="test_var2", type="int", value=None)
+    var1 = Variable(id="test_var1", type="text")
+    var2 = Variable(id="test_var2", type="int")
 
     # Create Echo step
     echo_step = Echo(
@@ -53,8 +53,8 @@ async def test_echo_handles_different_order(executor_context):
     """Test that Echo works with inputs and outputs in different order."""
     # Create variables in different order
-    var1 = Variable(id="var_a", type="text", value=None)
-    var2 = Variable(id="var_b", type="int", value=None)
+    var1 = Variable(id="var_a", type="text")
+    var2 = Variable(id="var_b", type="int")
 
     # Create Echo step with different order for outputs
     echo_step = Echo(
@@ -94,8 +94,8 @@ async def test_echo_with_missing_variable(executor_context):
     """Test that Echo handles missing variables gracefully."""
     # Create variables
-    var1 = Variable(id="present_var", type="text", value=None)
-    var2 = Variable(id="missing_var", type="int", value=None)
+    var1 = Variable(id="present_var", type="text")
+    var2 = Variable(id="missing_var", type="int")
 
     # Create Echo step
     echo_step = Echo(
diff --git a/tests/interpreter/test_explode_executor.py b/tests/interpreter/test_explode_executor.py
index 5319d7a4..0cff6a48 100644
--- a/tests/interpreter/test_explode_executor.py
+++ b/tests/interpreter/test_explode_executor.py
@@ -16,8 +16,8 @@ async def test_explode_emits_n_messages_with_matching_references(
 ):
     """Test that Explode emits N messages with matching references."""
     # Create variables
-    input_var = Variable(id="items", type="list[text]", value=None)
-    output_var = Variable(id="item", type="text", value=None)
+    input_var = Variable(id="items", type="list[text]")
+    output_var = Variable(id="item", type="text")
 
     # Create Explode step
     explode_step = Explode(
diff --git a/tests/interpreter/test_field_extractor_executor.py b/tests/interpreter/test_field_extractor_executor.py
index 3d548d61..8b12d977 100644
--- a/tests/interpreter/test_field_extractor_executor.py
+++ b/tests/interpreter/test_field_extractor_executor.py
@@ -18,12 +18,8 @@ async def test_field_extractor_single_value(
 ):
     """Test extracting a single field from input."""
     # Create a FieldExtractor step
-    input_var = Variable(
-        id="input_data", type=PrimitiveTypeEnum.text, value=None
-    )
-    output_var = Variable(
-        id="output_name", type=PrimitiveTypeEnum.text, value=None
-    )
+    input_var = Variable(id="input_data", type=PrimitiveTypeEnum.text)
+    output_var = Variable(id="output_name", type=PrimitiveTypeEnum.text)
 
     step = FieldExtractor(
         id="extract_name",
@@ -59,10 +55,8 @@ async def test_field_extractor_multiple_values(
 ):
     """Test extracting multiple values (1-to-many)."""
     # Create a FieldExtractor step
-    input_var = Variable(
-        id="input_data", type=PrimitiveTypeEnum.text, value=None
-    )
-    output_var = Variable(id="item", type=PrimitiveTypeEnum.text, value=None)
+    input_var = Variable(id="input_data", type=PrimitiveTypeEnum.text)
+    output_var = Variable(id="item", type=PrimitiveTypeEnum.text)
 
     step = FieldExtractor(
         id="extract_items",
@@ -100,10 +94,8 @@ async def test_field_extractor_with_pydantic_model(
 ):
     """Test extracting from a Pydantic model input."""
     # Create a FieldExtractor step
-    input_var = Variable(id="doc", type=RAGDocument, value=None)
-    output_var = Variable(
-        id="file_name", type=PrimitiveTypeEnum.text, value=None
-    )
+    input_var = Variable(id="doc", type=RAGDocument)
+    output_var = Variable(id="file_name", type=PrimitiveTypeEnum.text)
 
     step = FieldExtractor(
         id="extract_filename",
@@ -146,12 +138,8 @@ async def test_field_extractor_no_match(
 ):
     """Test that an error is raised when JSONPath doesn't match."""
     # Create a FieldExtractor step
-    input_var = Variable(
-        id="input_data", type=PrimitiveTypeEnum.text, value=None
-    )
-    output_var = Variable(
-        id="output_value", type=PrimitiveTypeEnum.text, value=None
-    )
+    input_var = Variable(id="input_data", type=PrimitiveTypeEnum.text)
+    output_var = Variable(id="output_value", type=PrimitiveTypeEnum.text)
 
     step = FieldExtractor(
         id="extract_missing",
diff --git a/tests/interpreter/test_file_source_executor.py b/tests/interpreter/test_file_source_executor.py
new file mode 100644
index 00000000..3ba7f344
--- /dev/null
+++ b/tests/interpreter/test_file_source_executor.py
@@ -0,0 +1,81 @@
+"""Tests for FileSource executor."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from qtype.dsl.domain_types import SearchResult
+from qtype.interpreter.executors.file_source_executor import FileSourceExecutor
+from qtype.interpreter.types import FlowMessage, Session
+from qtype.semantic.loader import load
+
+
+@pytest.mark.asyncio
+async def test_file_source_loads_custom_types(executor_context):
+    """Test that FileSource can load JSON with mixed types: custom, domain,
+    and primitive."""
+    # Load the spec with custom Person type
+    spec_path = Path("tests/specs/file_source_custom_types.qtype.yaml")
+    semantic_model, type_registry = load(spec_path)
+
+    # Get the Person type and the flow
+    Person = type_registry["Person"]
+    flow = semantic_model.flows[0]
+    file_source_step = flow.steps[0]
+
+    # Create FileSource executor
+    executor = FileSourceExecutor(file_source_step, executor_context)
+
+    # Create an initial flow message
+    initial_message = FlowMessage(
+        session=Session(session_id="test"), variables={}
+    )
+
+    # Execute the FileSource step
+    results = []
+    async for result_message in executor.process_message(initial_message):
+        results.append(result_message)
+
+    # Validate we got 3 results (one per row in the JSON)
+    assert len(results) == 3
+
+    # Validate first row has all three types correctly
+    first_vars = results[0].variables
+
+    # Check Person (custom type)
+    first_person = first_vars["person"]
+    assert isinstance(first_person, Person)
+    assert first_person.name == "Alice Johnson"
+    assert first_person.age == 30
+    assert first_person.email == "alice@example.com"
+
+    # Check SearchResult (domain type)
+    first_result = first_vars["result"]
+    assert isinstance(first_result, SearchResult)
+    assert first_result.content == "Found information about Python"
+    assert first_result.doc_id == "doc-123"
+    assert abs(first_result.score - 0.95) < 0.01
+
+    # Check rank (primitive int type)
+    first_rank = first_vars["rank"]
+    assert isinstance(first_rank, int)
+    assert first_rank == 1
+
+    # Validate second row
+    second_vars = results[1].variables
+    assert isinstance(second_vars["person"], Person)
+    assert second_vars["person"].name == "Bob Smith"
+    assert isinstance(second_vars["result"], SearchResult)
+    assert second_vars["result"].doc_id == "doc-456"
+    assert isinstance(second_vars["rank"], int)
+    assert second_vars["rank"] == 2
+
+    # Validate third row
+    third_vars = results[2].variables
+    assert isinstance(third_vars["person"], Person)
+    assert third_vars["person"].name == "Carol Davis"
+    assert isinstance(third_vars["result"], SearchResult)
+    assert third_vars["result"].doc_id == "doc-789"
+    assert isinstance(third_vars["rank"], int)
+    assert third_vars["rank"] == 3
diff --git a/tests/interpreter/test_function_tool_helper.py b/tests/interpreter/test_function_tool_helper.py
index dfa38282..f8c22f19 100644
--- a/tests/interpreter/test_function_tool_helper.py
+++ b/tests/interpreter/test_function_tool_helper.py
@@ -58,34 +58,33 @@ async def test_create_python_function_tool_and_call():
 def test_qtype_type_to_python_type_list():
     """Test that list types are correctly converted to typed lists."""
     from qtype.base.types import PrimitiveTypeEnum
-    from qtype.dsl.model import ListType, ToolParameter
+    from qtype.dsl.model import ListType, Variable
 
     # Test list[text] -> list[str]
-    text_list_param = ToolParameter(
+    text_list_var = Variable(
+        id="test_param",
         type=ListType(element_type=PrimitiveTypeEnum.text),
         optional=False,
     )
-    python_type = FunctionToolHelper._qtype_type_to_python_type(
-        text_list_param
-    )
+    python_type = FunctionToolHelper._qtype_type_to_python_type(text_list_var)
     assert python_type == list[str]
 
     # Test list[int] -> list[int]
-    int_list_param = ToolParameter(
+    int_list_var = Variable(
+        id="test_param",
         type=ListType(element_type=PrimitiveTypeEnum.int),
         optional=False,
     )
-    python_type = FunctionToolHelper._qtype_type_to_python_type(int_list_param)
+    python_type = FunctionToolHelper._qtype_type_to_python_type(int_list_var)
     assert python_type == list[int]
 
     # Test list[boolean] -> list[bool]
-    bool_list_param = ToolParameter(
+    bool_list_var = Variable(
+        id="test_param",
         type=ListType(element_type=PrimitiveTypeEnum.boolean),
         optional=False,
     )
-    python_type = FunctionToolHelper._qtype_type_to_python_type(
-        bool_list_param
-    )
+    python_type = FunctionToolHelper._qtype_type_to_python_type(bool_list_var)
     assert python_type == list[bool]
@@ -115,11 +114,18 @@ async def test_create_python_function_tool_with_custom_type():
     )
 
     # Verify the output type is the custom TimeDifferenceResultType
-    assert "result" in time_diff_tool.outputs
-    output_param = time_diff_tool.outputs["result"]
+    result_output = next(
+        (
+            v
+            for v in time_diff_tool.outputs
+            if v.id == "calculate_time_difference_result"
+        ),
+        None,
+    )
+    assert result_output is not None
     # The type should be resolved to a BaseModel subclass
-    assert isinstance(output_param.type, type)
-    assert issubclass(output_param.type, BaseModel)
+    assert isinstance(result_output.type, type)
+    assert issubclass(result_output.type, BaseModel)
 
     # Create the helper and generate the FunctionTool
     helper = ToolHelper()
diff --git a/tests/specs/file_source_custom_types.qtype.yaml b/tests/specs/file_source_custom_types.qtype.yaml
new file mode 100644
index 00000000..9870c740
--- /dev/null
+++ b/tests/specs/file_source_custom_types.qtype.yaml
@@ -0,0 +1,32 @@
+id: test-file-source-custom-types
+description: Test FileSource with Custom Types
+types:
+  - id: Person
+    properties:
+      name: text
+      age: int
+      email: text
+flows:
+  - type: Flow
+    id: read_persons
+    inputs: []
+    variables:
+      - id: person
+        type: Person
+      - id: result
+        type: SearchResult
+      - id: rank
+        type: int
+    outputs:
+      - person
+      - result
+      - rank
+    steps:
+      - id: source
+        type: FileSource
+        path:
+          uri: tests/document-specs/mixed_types.json
+        outputs:
+          - person
+          - result
+          - rank
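
Note: the spec above reads `tests/document-specs/mixed_types.json`, which is not included in this diff. The sketch below shows one way the fixture could be generated; it is inferred from the assertions in `test_file_source_executor.py`, and any value the test does not assert (Bob's and Carol's ages, emails, result contents, and scores) is an illustrative placeholder, not the actual fixture content.

```python
"""Sketch of the assumed mixed_types.json fixture (not part of this diff)."""

import json
from pathlib import Path

rows = [
    {
        # Row 1: every value below is asserted by test_file_source_loads_custom_types.
        "person": {"name": "Alice Johnson", "age": 30, "email": "alice@example.com"},
        "result": {
            "content": "Found information about Python",
            "doc_id": "doc-123",
            "score": 0.95,
        },
        "rank": 1,
    },
    {
        # Row 2: the test asserts only person.name, result.doc_id, and rank;
        # the remaining values are placeholder guesses.
        "person": {"name": "Bob Smith", "age": 25, "email": "bob@example.com"},
        "result": {"content": "placeholder", "doc_id": "doc-456", "score": 0.5},
        "rank": 2,
    },
    {
        # Row 3: same caveat as row 2.
        "person": {"name": "Carol Davis", "age": 35, "email": "carol@example.com"},
        "result": {"content": "placeholder", "doc_id": "doc-789", "score": 0.5},
        "rank": 3,
    },
]

# Write the fixture where the spec's FileSource step expects it.
out = Path("tests/document-specs/mixed_types.json")
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(json.dumps(rows, indent=2))
```

Each top-level key maps to one of the flow's variables (`person`, `result`, `rank`), which is how the FileSource step can hydrate a custom type, a domain type, and a primitive from the same row.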