From 947105687ab3fcbc067f5cca45e3807df055802a Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Tue, 10 Feb 2026 12:04:30 -0800 Subject: [PATCH 1/6] mcp test --- py-src/data_formulator/mcp_server.py | 587 +++++++++++++++++++++++++++ 1 file changed, 587 insertions(+) create mode 100644 py-src/data_formulator/mcp_server.py diff --git a/py-src/data_formulator/mcp_server.py b/py-src/data_formulator/mcp_server.py new file mode 100644 index 00000000..7c976bff --- /dev/null +++ b/py-src/data_formulator/mcp_server.py @@ -0,0 +1,587 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Data Formulator MCP Server + +Exposes Data Formulator's AI-powered data visualization capabilities +as an MCP (Model Context Protocol) server with the following tools: + +1. visualize_data: Given data + instruction → transformed data + chart (PNG) +2. explore_data: Multi-turn iterative exploration → rounds of response + data + chart + +Usage: + # Run as stdio MCP server (for MCP clients like Claude Desktop, VS Code, etc.) + python -m data_formulator.mcp_server + + # Or with uvx + uvx mcp run data_formulator.mcp_server + +Environment variables: + OPENAI_API_KEY / ANTHROPIC_API_KEY / etc. - API keys for LLM providers + DF_MCP_MODEL_ENDPOINT - LLM provider (default: "openai") + DF_MCP_MODEL_NAME - Model name (default: "gpt-4o") + DF_MCP_API_KEY - API key (overrides provider-specific key) + DF_MCP_API_BASE - Custom API base URL (optional) + DATALAKE_ROOT - Workspace root directory (optional) +""" + +import os +import sys +import json +import base64 +import logging +import tempfile +from pathlib import Path +from typing import Any + +import pandas as pd + +from dotenv import load_dotenv + +# Load environment variables +load_dotenv(os.path.join(Path(__file__).parent.parent.parent, 'api-keys.env')) +load_dotenv(os.path.join(Path(__file__).parent, 'api-keys.env')) + +from mcp.server.fastmcp import FastMCP + +from data_formulator.agents.client_utils import Client +from data_formulator.agents.agent_data_rec import DataRecAgent +from data_formulator.agents.agent_data_transform import DataTransformationAgent +from data_formulator.agents.agent_exploration import ExplorationAgent +from data_formulator.datalake.workspace import Workspace, WorkspaceWithTempData +from data_formulator.workflows.create_vl_plots import ( + assemble_vegailte_chart, + spec_to_base64, + detect_field_type, + create_chart_spec, +) +from data_formulator.workflows.exploration_flow import create_chart_spec_from_data + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _get_model_config() -> dict[str, str]: + """Build model config from environment variables.""" + endpoint = os.getenv("DF_MCP_MODEL_ENDPOINT", "openai") + model = os.getenv("DF_MCP_MODEL_NAME", "gpt-4o") + + # Resolve API key: explicit > provider-specific + api_key = os.getenv("DF_MCP_API_KEY", "") + if not api_key: + api_key = os.getenv(f"{endpoint.upper()}_API_KEY", "") + + api_base = os.getenv("DF_MCP_API_BASE", os.getenv(f"{endpoint.upper()}_API_BASE", "")) + api_version = os.getenv("DF_MCP_API_VERSION", os.getenv(f"{endpoint.upper()}_API_VERSION", "")) + + return { + "endpoint": endpoint, + "model": model, + "api_key": api_key, + "api_base": api_base, + "api_version": api_version, + } + + +def _get_client() -> Client: + """Create an LLM client from environment config.""" + return 
Client.from_config(_get_model_config()) + + +def _get_workspace(session_id: str = "mcp_session") -> Workspace: + """Create or reuse a workspace for the MCP session.""" + return Workspace(session_id) + + +def _parse_data_input(data: str, data_format: str = "auto") -> pd.DataFrame: + """ + Parse data from a string (JSON or CSV) into a DataFrame. + + Args: + data: Raw data string (JSON array or CSV text) + data_format: "json", "csv", or "auto" (detect automatically) + + Returns: + pandas DataFrame + """ + if data_format == "auto": + stripped = data.strip() + if stripped.startswith("[") or stripped.startswith("{"): + data_format = "json" + else: + data_format = "csv" + + if data_format == "json": + parsed = json.loads(data) + if isinstance(parsed, dict): + parsed = [parsed] + return pd.DataFrame(parsed) + else: + from io import StringIO + return pd.read_csv(StringIO(data)) + + +def _make_chart_image( + rows: list[dict], + chart_type: str, + chart_encodings: dict[str, str], +) -> str | None: + """Create a base64 PNG from data rows + chart spec. Returns data URL or None.""" + try: + df = pd.DataFrame(rows) + if df.empty: + return None + + encodings = {} + for channel, field in chart_encodings.items(): + if field and field in df.columns: + field_type = detect_field_type(df[field]) + encodings[channel] = {"field": field, "type": field_type} + + spec = assemble_vegailte_chart(df, chart_type, encodings) + if spec: + return spec_to_base64(spec) + except Exception as e: + logger.warning(f"Chart creation failed: {e}") + return None + + +# --------------------------------------------------------------------------- +# MCP Server +# --------------------------------------------------------------------------- + +mcp = FastMCP( + "Data Formulator", + description=( + "AI-powered data visualization server. " + "Transform data, generate charts, and explore datasets interactively." + ), +) + + +@mcp.tool() +def visualize_data( + data: str, + instruction: str, + data_format: str = "auto", + table_name: str = "input_data", + chart_type: str = "", + x: str = "", + y: str = "", + color: str = "", + size: str = "", + facet: str = "", + max_repair_attempts: int = 1, +) -> dict[str, Any]: + """ + Transform data and generate a visualization based on a natural language instruction. + + Given tabular data (JSON or CSV) and a natural language instruction, this tool: + 1. Uses an AI agent to understand the intent and generate transformation code + 2. Executes the transformation to produce the output data + 3. Creates a chart (PNG) from the transformed data + + Use this for one-shot data analysis tasks like: + - "Show average sales by region as a bar chart" + - "Create a scatter plot of price vs rating colored by category" + - "Forecast the next 6 months of revenue" + + Args: + data: Tabular data as a JSON array of objects or CSV text. + instruction: Natural language description of what visualization to create. + data_format: "json", "csv", or "auto" (default: auto-detect). + table_name: Name for the input table (default: "input_data"). + chart_type: Optional chart type hint ("bar", "point", "line", "area", "heatmap", + "group_bar", "boxplot", "worldmap", "usmap"). Leave empty to let the AI decide. + x: Optional field name for x-axis encoding. + y: Optional field name for y-axis encoding. + color: Optional field name for color encoding. + size: Optional field name for size encoding. + facet: Optional field name for facet encoding. + max_repair_attempts: Max retries if code execution fails (default: 1). 
+ + Returns: + A dictionary with: + - status: "ok" or "error" + - instruction_summary: Short description of what was done + - chart_type: The chart type used + - chart_encodings: Mapping of visual channels to fields + - transformed_data: List of row dicts (first 50 rows) + - transformed_data_full_count: Total row count + - chart_image_base64: Base64 PNG data URL of the chart (or null) + - code: The Python transformation code generated + - reasoning: The AI's reasoning about the transformation + """ + try: + # Parse input data + df = _parse_data_input(data, data_format) + rows = json.loads(df.to_json(orient="records", date_format="iso")) + + input_tables = [{"name": table_name, "rows": rows}] + + # Build chart encodings from optional hints + chart_encodings = {} + if x: chart_encodings["x"] = x + if y: chart_encodings["y"] = y + if color: chart_encodings["color"] = color + if size: chart_encodings["size"] = size + if facet: chart_encodings["facet"] = facet + + # Decide mode: recommendation (no encodings) vs transform (has encodings) + mode = "recommendation" if not chart_encodings else "transform" + + # Set up workspace + agent + client = _get_client() + workspace = _get_workspace() + temp_data = [{"name": table_name, "rows": rows}] + + with WorkspaceWithTempData(workspace, temp_data) as ws: + if mode == "recommendation": + agent = DataRecAgent(client=client, workspace=ws) + results = agent.run(input_tables, instruction, n=1) + else: + agent = DataTransformationAgent(client=client, workspace=ws) + goal = {"goal": instruction, "chart_type": chart_type, "chart_encodings": chart_encodings} + results = agent.run( + input_tables, + json.dumps(goal), + [], # no previous messages + ) + + # Repair loop + attempts = 0 + while results[0]["status"] == "error" and attempts < max_repair_attempts: + error_msg = results[0]["content"] + repair_instruction = ( + f"We ran into the following problem executing the code, please fix it:\n\n" + f"{error_msg}\n\n" + f"Please think step by step, reflect on why the error happens, and fix the code." 
+ ) + prev_dialog = results[0]["dialog"] + + if mode == "recommendation": + results = agent.followup(input_tables, prev_dialog, [], repair_instruction, n=1) + else: + results = agent.followup(input_tables, prev_dialog, [], repair_instruction, n=1) + attempts += 1 + + # Process result + result = results[0] + if result["status"] != "ok": + return { + "status": "error", + "message": result.get("content", "Unknown error"), + "code": result.get("code", ""), + } + + transformed_data = result["content"] + refined_goal = result.get("refined_goal", {}) + code = result.get("code", "") + + out_rows = transformed_data.get("rows", []) + out_chart_type = refined_goal.get("chart_type", chart_type or "bar") + out_encodings = refined_goal.get("chart_encodings", chart_encodings) + + # Generate chart image + chart_image = _make_chart_image(out_rows, out_chart_type, out_encodings) + + return { + "status": "ok", + "instruction_summary": refined_goal.get("display_instruction", instruction), + "chart_type": out_chart_type, + "chart_encodings": out_encodings, + "transformed_data": out_rows[:50], + "transformed_data_full_count": len(out_rows), + "chart_image_base64": chart_image, + "code": code, + "reasoning": { + "mode": refined_goal.get("mode", mode), + "recommendation": refined_goal.get("recommendation", ""), + "output_fields": refined_goal.get("output_fields", []), + }, + } + + except Exception as e: + logger.exception("visualize_data failed") + return {"status": "error", "message": str(e)} + + +@mcp.tool() +def explore_data( + data: str, + question: str, + data_format: str = "auto", + table_name: str = "input_data", + max_iterations: int = 3, + max_repair_attempts: int = 1, +) -> dict[str, Any]: + """ + Iteratively explore a dataset through multiple rounds of AI-driven analysis. + + Given tabular data and a high-level exploration question, this tool: + 1. Breaks the question into a multi-step analysis plan + 2. For each step: transforms data, creates a chart, and decides the next step + 3. Returns all exploration steps with their data and charts + + Use this for open-ended data exploration like: + - "What are the key trends and patterns in this sales data?" + - "Explore the factors that affect student performance" + - "Analyze the relationship between weather and energy consumption" + + Args: + data: Tabular data as a JSON array of objects or CSV text. + question: High-level exploration question or topic. + data_format: "json", "csv", or "auto" (default: auto-detect). + table_name: Name for the input table (default: "input_data"). + max_iterations: Maximum number of exploration rounds (default: 3). + max_repair_attempts: Max code repair retries per step (default: 1). 
+ + Returns: + A dictionary with: + - status: "ok" or "error" + - question: The original exploration question + - steps: List of exploration step results, each containing: + - iteration: Step number + - question: The question addressed in this step + - chart_type: Chart type used + - chart_encodings: Visual channel mappings + - transformed_data: Rows of transformed data (first 50) + - chart_image_base64: Base64 PNG of the chart (or null) + - code: Python transformation code + - instruction_summary: Short description of what was done + - summary: Final summary of exploration findings + - total_steps: Number of steps completed + """ + try: + # Parse input data + df = _parse_data_input(data, data_format) + rows = json.loads(df.to_json(orient="records", date_format="iso")) + + input_tables = [{"name": table_name, "rows": rows}] + + client = _get_client() + workspace = _get_workspace() + temp_data = [{"name": table_name, "rows": rows}] + + steps = [] + + with WorkspaceWithTempData(workspace, temp_data) as ws: + rec_agent = DataRecAgent(client=client, workspace=ws) + exploration_agent = ExplorationAgent(client=client, workspace=ws) + + completed_steps_for_agent = [] + current_question = question + current_plan: list[str] = [] + previous_dialog: list[dict] = [] + previous_data: dict = {} + + for iteration in range(1, max_iterations + 1): + # Step 1: Transform data for current question + if previous_dialog: + latest_sample = previous_data.get("rows", []) if isinstance(previous_data, dict) else [] + transform_results = rec_agent.followup( + input_tables=input_tables, + new_instruction=current_question, + latest_data_sample=latest_sample, + dialog=previous_dialog, + ) + else: + transform_results = rec_agent.run( + input_tables=input_tables, + description=current_question, + ) + + # Repair loop + attempt = 0 + while transform_results and transform_results[0]["status"] != "ok" and attempt < max_repair_attempts: + attempt += 1 + error_msg = transform_results[0]["content"] + dialog = transform_results[0]["dialog"] + repair_instr = ( + f"We ran into the following problem executing the code, please fix it:\n\n" + f"{error_msg}\n\nPlease think step by step and fix the code." 
+ ) + transform_results = rec_agent.followup( + input_tables=input_tables, + new_instruction=repair_instr, + latest_data_sample=[], + dialog=dialog, + ) + + if not transform_results or transform_results[0]["status"] != "ok": + error_msg = transform_results[0]["content"] if transform_results else "Transform failed" + steps.append({ + "iteration": iteration, + "question": current_question, + "status": "error", + "message": error_msg, + }) + break + + result = transform_results[0] + transformed_data = result["content"] + refined_goal = result.get("refined_goal", {}) + code = result.get("code", "") + previous_dialog = result.get("dialog", []) + previous_data = transformed_data + + out_rows = transformed_data.get("rows", []) + out_chart_type = refined_goal.get("chart_type", "bar") + out_encodings = refined_goal.get("chart_encodings", {}) + + # Create chart + chart_image = _make_chart_image(out_rows, out_chart_type, out_encodings) + + step_result = { + "iteration": iteration, + "question": current_question, + "status": "ok", + "chart_type": out_chart_type, + "chart_encodings": out_encodings, + "transformed_data": out_rows[:50], + "transformed_data_full_count": len(out_rows), + "chart_image_base64": chart_image, + "code": code, + "instruction_summary": refined_goal.get("display_instruction", current_question), + } + steps.append(step_result) + + # Track for exploration agent + completed_steps_for_agent.append({ + "question": current_question, + "code": code, + "data": { + "rows": out_rows[:20], + "name": transformed_data.get("virtual", {}).get("table_name", f"step_{iteration}"), + }, + "visualization": chart_image, + }) + + # Step 2: Decide next step via exploration agent + if iteration >= max_iterations: + break + + try: + followup_results = exploration_agent.suggest_followup( + input_tables=input_tables, + completed_steps=completed_steps_for_agent, + next_steps=current_plan, + ) + + if followup_results and followup_results[0]["status"] == "ok": + plan = followup_results[0]["content"] + if plan.get("status") in ("present", "warning"): + # Agent decided to stop and present findings + break + next_steps = plan.get("next_steps", []) + if next_steps: + current_question = next_steps[0] + current_plan = next_steps[1:] + else: + break + else: + break + except Exception as e: + logger.warning(f"Exploration planning failed: {e}") + break + + # Build summary + summary_parts = [] + for s in steps: + if s.get("status") == "ok": + summary_parts.append(f"Step {s['iteration']}: {s.get('instruction_summary', s['question'])}") + + return { + "status": "ok", + "question": question, + "steps": steps, + "summary": "\n".join(summary_parts) if summary_parts else "No steps completed.", + "total_steps": len(steps), + } + + except Exception as e: + logger.exception("explore_data failed") + return {"status": "error", "message": str(e), "steps": []} + + +@mcp.tool() +def create_chart( + data: str, + chart_type: str, + x: str = "", + y: str = "", + color: str = "", + size: str = "", + facet: str = "", + data_format: str = "auto", +) -> dict[str, Any]: + """ + Create a chart directly from data and field mappings (no AI, no transformation). + + This is a fast, deterministic tool for creating standard charts when you already + know exactly which fields to use and how to map them. + + Args: + data: Tabular data as a JSON array of objects or CSV text. + chart_type: One of "bar", "point", "line", "area", "heatmap", + "group_bar", "boxplot". + x: Field name for x-axis. + y: Field name for y-axis. 
+ color: Optional field name for color encoding. + size: Optional field name for size encoding. + facet: Optional field name for faceting. + data_format: "json", "csv", or "auto". + + Returns: + A dictionary with: + - status: "ok" or "error" + - chart_image_base64: Base64 PNG data URL + - chart_type: The chart type used + - fields_used: List of fields mapped to channels + """ + try: + df = _parse_data_input(data, data_format) + + # Build encoding dict + fields = [] + if x: fields.append(x) + if y: fields.append(y) + if color: fields.append(color) + if size: fields.append(size) + if facet: fields.append(facet) + + if not fields: + return {"status": "error", "message": "At least one field (x or y) is required."} + + spec = create_chart_spec(df, fields, chart_type) + if spec: + image = spec_to_base64(spec) + return { + "status": "ok", + "chart_image_base64": image, + "chart_type": chart_type, + "fields_used": fields, + } + else: + return {"status": "error", "message": "Failed to create chart specification."} + + except Exception as e: + logger.exception("create_chart failed") + return {"status": "error", "message": str(e)} + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main(): + """Run the MCP server (stdio transport).""" + logging.basicConfig(level=logging.WARNING, stream=sys.stderr) + mcp.run() + + +if __name__ == "__main__": + main() From dc2130dfb36149fd506db86b44be8c4d921e9acd Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Tue, 10 Feb 2026 12:07:03 -0800 Subject: [PATCH 2/6] add mcp dependency --- pyproject.toml | 1 + uv.lock | 117 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 34599461..e52ffb10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "yfinance", "connectorx>=0.4.5", "pyarrow>=23.0.0", + "mcp>=1.26.0", ] [project.urls] diff --git a/uv.lock b/uv.lock index 0df5b6a4..473ea937 100644 --- a/uv.lock +++ b/uv.lock @@ -731,6 +731,7 @@ dependencies = [ { name = "google-cloud-bigquery" }, { name = "jupyter" }, { name = "litellm" }, + { name = "mcp" }, { name = "numpy" }, { name = "openai" }, { name = "pandas" }, @@ -769,6 +770,7 @@ requires-dist = [ { name = "google-cloud-bigquery" }, { name = "jupyter" }, { name = "litellm" }, + { name = "mcp", specifier = ">=1.26.0" }, { name = "numpy" }, { name = "openai" }, { name = "pandas" }, @@ -1418,6 +1420,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "httpx-sse" +version = "0.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload-time = "2025-10-10T21:48:22.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload-time = 
"2025-10-10T21:48:21.158Z" }, +] + [[package]] name = "huggingface-hub" version = "1.3.7" @@ -2160,6 +2171,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, ] +[[package]] +name = "mcp" +version = "1.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "httpx" }, + { name = "httpx-sse" }, + { name = "jsonschema" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, + { name = "pyjwt", extra = ["crypto"] }, + { name = "python-multipart" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "sse-starlette" }, + { name = "starlette" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, + { name = "uvicorn", marker = "sys_platform != 'emscripten'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/6d/62e76bbb8144d6ed86e202b5edd8a4cb631e7c8130f3f4893c3f90262b10/mcp-1.26.0.tar.gz", hash = "sha256:db6e2ef491eecc1a0d93711a76f28dec2e05999f93afd48795da1c1137142c66", size = 608005, upload-time = "2026-01-24T19:40:32.468Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/d9/eaa1f80170d2b7c5ba23f3b59f766f3a0bb41155fbc32a69adfa1adaaef9/mcp-1.26.0-py3-none-any.whl", hash = "sha256:904a21c33c25aa98ddbeb47273033c435e595bbacfdb177f4bd87f6dceebe1ca", size = 233615, upload-time = "2026-01-24T19:40:30.652Z" }, +] + [[package]] name = "mistune" version = "3.2.0" @@ -3022,6 +3058,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, ] +[[package]] +name = "pydantic-settings" +version = "2.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/4b/ac7e0aae12027748076d72a8764ff1c9d82ca75a7a52622e67ed3f765c54/pydantic_settings-2.12.0.tar.gz", hash = "sha256:005538ef951e3c2a68e1c08b292b5f2e71490def8589d4221b95dab00dafcfd0", size = 194184, upload-time = "2025-11-10T14:25:47.013Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -3207,6 +3257,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/e5/fecf13f06e5e5f67e8837d777d1bc43fac0ed2b77a676804df5c34744727/python_json_logger-4.0.0-py3-none-any.whl", hash = "sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2", size = 15548, upload-time = "2025-10-06T04:15:17.553Z" }, ] +[[package]] +name = "python-multipart" +version = "0.0.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = 
"sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, +] + [[package]] name = "pytz" version = "2025.2" @@ -3216,6 +3275,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, ] +[[package]] +name = "pywin32" +version = "311" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/af/449a6a91e5d6db51420875c54f6aff7c97a86a3b13a0b4f1a5c13b988de3/pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151", size = 8697031, upload-time = "2025-07-14T20:13:13.266Z" }, + { url = "https://files.pythonhosted.org/packages/51/8f/9bb81dd5bb77d22243d33c8397f09377056d5c687aa6d4042bea7fbf8364/pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503", size = 9508308, upload-time = "2025-07-14T20:13:15.147Z" }, + { url = "https://files.pythonhosted.org/packages/44/7b/9c2ab54f74a138c491aba1b1cd0795ba61f144c711daea84a88b63dc0f6c/pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2", size = 8703930, upload-time = "2025-07-14T20:13:16.945Z" }, + { url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" }, + { url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" }, + { url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" }, + { url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" }, + { url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = 
"sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" }, + { url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" }, + { url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" }, + { url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" }, +] + [[package]] name = "pywinpty" version = "3.0.2" @@ -3816,6 +3894,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" }, ] +[[package]] +name = "sse-starlette" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "starlette" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/8d/00d280c03ffd39aaee0e86ec81e2d3b9253036a0f93f51d10503adef0e65/sse_starlette-3.2.0.tar.gz", hash = "sha256:8127594edfb51abe44eac9c49e59b0b01f1039d0c7461c6fd91d4e03b70da422", size = 27253, upload-time = "2026-01-17T13:11:05.62Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/7f/832f015020844a8b8f7a9cbc103dd76ba8e3875004c41e08440ea3a2b41a/sse_starlette-3.2.0-py3-none-any.whl", hash = "sha256:5876954bd51920fc2cd51baee47a080eb88a37b5b784e615abb0b283f801cdbf", size = 12763, upload-time = "2026-01-17T13:11:03.775Z" }, +] + [[package]] name = "stack-data" version = "0.6.3" @@ -3830,6 +3921,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, ] +[[package]] +name = "starlette" +version = "0.52.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/68/79977123bb7be889ad680d79a40f339082c1978b5cfcf62c2d8d196873ac/starlette-0.52.1.tar.gz", hash = "sha256:834edd1b0a23167694292e94f597773bc3f89f362be6effee198165a35d62933", size = 2653702, upload-time = "2026-01-18T13:34:11.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" }, +] + [[package]] name = "terminado" version = "0.18.1" @@ -4046,6 +4150,19 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, ] +[[package]] +name = "uvicorn" +version = "0.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/d1/8f3c683c9561a4e6689dd3b1d345c815f10f86acd044ee1fb9a4dcd0b8c5/uvicorn-0.40.0.tar.gz", hash = "sha256:839676675e87e73694518b5574fd0f24c9d97b46bea16df7b8c05ea1a51071ea", size = 81761, upload-time = "2025-12-21T14:16:22.45Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/d8/2083a1daa7439a66f3a48589a57d576aa117726762618f6bb09fe3798796/uvicorn-0.40.0-py3-none-any.whl", hash = "sha256:c6c8f55bc8bf13eb6fa9ff87ad62308bbbc33d0b67f84293151efe87e0d5f2ee", size = 68502, upload-time = "2025-12-21T14:16:21.041Z" }, +] + [[package]] name = "vega-datasets" version = "0.9.0" From b65c917d1394c4710a84be5264f72d92b071ebb3 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Tue, 10 Feb 2026 12:08:33 -0800 Subject: [PATCH 3/6] test file --- py-src/data_formulator/mcp_server.py | 55 +++- py-src/tests/test_mcp_server.py | 469 +++++++++++++++++++++++++++ 2 files changed, 520 insertions(+), 4 deletions(-) create mode 100644 py-src/tests/test_mcp_server.py diff --git a/py-src/data_formulator/mcp_server.py b/py-src/data_formulator/mcp_server.py index 7c976bff..d1501ed0 100644 --- a/py-src/data_formulator/mcp_server.py +++ b/py-src/data_formulator/mcp_server.py @@ -10,12 +10,59 @@ 1. visualize_data: Given data + instruction → transformed data + chart (PNG) 2. explore_data: Multi-turn iterative exploration → rounds of response + data + chart -Usage: - # Run as stdio MCP server (for MCP clients like Claude Desktop, VS Code, etc.) +Setup: + # Install with uv (recommended) + uv pip install -e ".[mcp]" # from project root + # or install mcp separately + uv pip install mcp + +Running the MCP server: + # Option 1: Run directly with uv + uv run python -m data_formulator.mcp_server + + # Option 2: Run with python (after installing) python -m data_formulator.mcp_server - # Or with uvx - uvx mcp run data_formulator.mcp_server + # Option 3: Run the module file directly + uv run py-src/data_formulator/mcp_server.py + +Configure in Claude Desktop (claude_desktop_config.json): + { + "mcpServers": { + "data-formulator": { + "command": "uv", + "args": [ + "--directory", "/path/to/data-formulator", + "run", "python", "-m", "data_formulator.mcp_server" + ], + "env": { + "OPENAI_API_KEY": "sk-...", + "DF_MCP_MODEL_ENDPOINT": "openai", + "DF_MCP_MODEL_NAME": "gpt-4o" + } + } + } + } + +Configure in VS Code (settings.json): + { + "mcp": { + "servers": { + "data-formulator": { + "command": "uv", + "args": [ + "--directory", "/path/to/data-formulator", + "run", "python", "-m", "data_formulator.mcp_server" + ], + "env": { + "OPENAI_API_KEY": "sk-...", + "DF_MCP_MODEL_ENDPOINT": "openai", + "DF_MCP_MODEL_NAME": "gpt-4o" + } + } + } + } + } Environment variables: OPENAI_API_KEY / ANTHROPIC_API_KEY / etc. 
- API keys for LLM providers diff --git a/py-src/tests/test_mcp_server.py b/py-src/tests/test_mcp_server.py new file mode 100644 index 00000000..27071c98 --- /dev/null +++ b/py-src/tests/test_mcp_server.py @@ -0,0 +1,469 @@ +#!/usr/bin/env python3 +""" +Sample script: Using Data Formulator as an MCP Server + +This script demonstrates how to use Data Formulator's MCP server for: + 1. Data Recommendation / Visualization (one-shot) + 2. Iterative Data Exploration (multi-turn) + +There are TWO ways to use the MCP tools demonstrated here: + + (A) Direct invocation — import and call the tool functions directly + (no MCP client/server needed, great for scripting & testing) + + (B) MCP client — connect to the MCP server over stdio and call tools + via the MCP protocol (how real MCP hosts like Claude Desktop use it) + +Prerequisites: + # Install dependencies with uv (from project root): + uv pip install -e ".[mcp]" + # or: + uv pip install mcp pandas vl-convert-python + + # Set your LLM API key: + export OPENAI_API_KEY="sk-..." # or ANTHROPIC_API_KEY, etc. + export DF_MCP_MODEL_ENDPOINT="openai" # openai | anthropic | azure | gemini | ollama + export DF_MCP_MODEL_NAME="gpt-4o" # model name + +Usage: + # Run all demos with uv (recommended): + uv run python py-src/tests/test_mcp_server.py + + # Run all demos (direct invocation, no server process needed): + python py-src/tests/test_mcp_server.py + + # Run only one demo: + uv run python py-src/tests/test_mcp_server.py --demo 1 # one-shot visualization + uv run python py-src/tests/test_mcp_server.py --demo 2 # iterative exploration + uv run python py-src/tests/test_mcp_server.py --demo 3 # MCP client over stdio +""" + +import argparse +import asyncio +import json +import base64 +import os +import sys +from pathlib import Path + +# --------------------------------------------------------------------------- +# Setup paths so we can import data_formulator +# --------------------------------------------------------------------------- +SCRIPT_DIR = Path(__file__).parent +PROJECT_ROOT = SCRIPT_DIR.parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) + +OUTPUT_DIR = SCRIPT_DIR / "mcp_demo_output" +OUTPUT_DIR.mkdir(exist_ok=True) + + +# ╔═════════════════════════════════════════════════════════════════════════╗ +# ║ SAMPLE DATA ║ +# ╚═════════════════════════════════════════════════════════════════════════╝ + +SAMPLE_CSV = """Country,Year,GDP_Billion,Population_Million,CO2_Emission_MT +United States,2018,20580,327,5280 +United States,2019,21430,329,5130 +United States,2020,20940,331,4570 +United States,2021,23320,332,5010 +United States,2022,25460,333,5060 +China,2018,13890,1393,10060 +China,2019,14280,1398,10170 +China,2020,14720,1402,10670 +China,2021,17730,1405,11470 +China,2022,17960,1406,11400 +Germany,2018,3970,83,759 +Germany,2019,3890,83,702 +Germany,2020,3890,83,644 +Germany,2021,4220,83,675 +Germany,2022,4070,84,666 +India,2018,2710,1353,2480 +India,2019,2870,1366,2600 +India,2020,2660,1380,2440 +India,2021,3180,1393,2710 +India,2022,3390,1407,2830 +Japan,2018,4970,126,1160 +Japan,2019,5080,126,1140 +Japan,2020,5040,126,1060 +Japan,2021,4940,125,1070 +Japan,2022,4230,125,1050 +Brazil,2018,1870,210,460 +Brazil,2019,1870,211,470 +Brazil,2020,1440,212,440 +Brazil,2021,1650,213,490 +Brazil,2022,1920,214,490""" + +SAMPLE_JSON = json.dumps([ + {"Student": "Alice", "Math": 92, "Science": 88, "English": 95, "History": 78, "Grade": "A"}, + {"Student": "Bob", "Math": 76, "Science": 82, "English": 71, "History": 89, "Grade": "B"}, + {"Student": "Charlie", 
"Math": 88, "Science": 91, "English": 84, "History": 92, "Grade": "A"}, + {"Student": "Diana", "Math": 65, "Science": 70, "English": 90, "History": 85, "Grade": "B"}, + {"Student": "Eve", "Math": 95, "Science": 97, "English": 92, "History": 88, "Grade": "A"}, + {"Student": "Frank", "Math": 58, "Science": 62, "English": 68, "History": 72, "Grade": "C"}, + {"Student": "Grace", "Math": 84, "Science": 79, "English": 88, "History": 91, "Grade": "B"}, + {"Student": "Henry", "Math": 91, "Science": 85, "English": 79, "History": 83, "Grade": "A"}, + {"Student": "Iris", "Math": 73, "Science": 68, "English": 82, "History": 76, "Grade": "B"}, + {"Student": "Jack", "Math": 87, "Science": 93, "English": 86, "History": 80, "Grade": "A"}, +]) + + +# ╔═════════════════════════════════════════════════════════════════════════╗ +# ║ HELPERS ║ +# ╚═════════════════════════════════════════════════════════════════════════╝ + +def save_chart(base64_data_url: str | None, filename: str) -> None: + """Save a base64 data URL (data:image/png;base64,...) to a PNG file.""" + if not base64_data_url: + print(f" ⚠ No chart image for {filename}") + return + try: + b64 = base64_data_url.split(",", 1)[1] if "," in base64_data_url else base64_data_url + filepath = OUTPUT_DIR / filename + filepath.write_bytes(base64.b64decode(b64)) + print(f" ✅ Chart saved: {filepath}") + except Exception as e: + print(f" ❌ Failed to save chart {filename}: {e}") + + +def save_json_result(data: dict, filename: str) -> None: + """Save a dict to a JSON file.""" + filepath = OUTPUT_DIR / filename + filepath.write_text(json.dumps(data, indent=2, default=str)) + print(f" 📄 Result saved: {filepath}") + + +def print_section(title: str) -> None: + width = 70 + print() + print("=" * width) + print(f" {title}") + print("=" * width) + + +# ╔═════════════════════════════════════════════════════════════════════════╗ +# ║ DEMO 1: One-shot Visualization (Data Recommendation) ║ +# ╚═════════════════════════════════════════════════════════════════════════╝ + +def demo_1_one_shot_visualization(): + """ + Demonstrates the `visualize_data` tool: + Input: data (CSV or JSON) + natural language instruction + Output: transformed data + chart image (PNG) + reasoning + """ + print_section("DEMO 1: One-shot Data Visualization") + + # Import the MCP tool function directly + from data_formulator.mcp_server import visualize_data + + # --- Example 1a: CSV data, let AI recommend a visualization --- + print("\n📊 Example 1a: GDP trends (CSV, AI-recommended chart)") + print(" Instruction: 'Show GDP trends over time for each country as a line chart'") + + result = visualize_data( + data=SAMPLE_CSV, + instruction="Show GDP trends over time for each country as a line chart", + data_format="csv", + table_name="world_economy", + ) + + print(f" Status: {result['status']}") + if result["status"] == "ok": + print(f" Summary: {result['instruction_summary']}") + print(f" Chart type: {result['chart_type']}") + print(f" Encodings: {result['chart_encodings']}") + print(f" Data rows: {result['transformed_data_full_count']}") + print(f" Code:\n{result['code'][:300]}...") + save_chart(result.get("chart_image_base64"), "demo1a_gdp_trends.png") + save_json_result(result, "demo1a_result.json") + else: + print(f" Error: {result.get('message', 'Unknown')}") + + # --- Example 1b: JSON data, with encoding hints --- + print("\n📊 Example 1b: Student scores (JSON, with encoding hints)") + print(" Instruction: 'Compare students by their average score across all subjects'") + + result = visualize_data( + 
data=SAMPLE_JSON, + instruction="Compare students by their average score across all subjects", + data_format="json", + table_name="student_scores", + ) + + print(f" Status: {result['status']}") + if result["status"] == "ok": + print(f" Summary: {result['instruction_summary']}") + print(f" Chart type: {result['chart_type']}") + print(f" Encodings: {result['chart_encodings']}") + print(f" Output fields: {result['reasoning']['output_fields']}") + save_chart(result.get("chart_image_base64"), "demo1b_student_avg.png") + save_json_result(result, "demo1b_result.json") + else: + print(f" Error: {result.get('message', 'Unknown')}") + + # --- Example 1c: CO2 per capita analysis --- + print("\n📊 Example 1c: CO2 per capita (CSV, computed metric)") + print(" Instruction: 'Calculate CO2 emissions per capita and show as a grouped bar chart by country and year'") + + result = visualize_data( + data=SAMPLE_CSV, + instruction="Calculate CO2 emissions per capita (CO2 / Population) and show as a grouped bar chart by country for the latest year", + data_format="csv", + table_name="world_economy", + ) + + print(f" Status: {result['status']}") + if result["status"] == "ok": + print(f" Summary: {result['instruction_summary']}") + print(f" Chart type: {result['chart_type']}") + print(f" Data preview: {result['transformed_data'][:3]}") + save_chart(result.get("chart_image_base64"), "demo1c_co2_per_capita.png") + save_json_result(result, "demo1c_result.json") + else: + print(f" Error: {result.get('message', 'Unknown')}") + + +# ╔═════════════════════════════════════════════════════════════════════════╗ +# ║ DEMO 2: Iterative Exploration (Multi-turn Workflow) ║ +# ╚═════════════════════════════════════════════════════════════════════════╝ + +def demo_2_iterative_exploration(): + """ + Demonstrates the `explore_data` tool: + Input: data + high-level question + Output: multiple rounds of analysis, each with data + chart + reasoning + + The AI agent: + 1. Breaks the question into sub-questions + 2. For each sub-question: transforms data → creates chart → interprets result + 3. Decides the next question based on findings + 4. Presents a summary when exploration is complete + """ + print_section("DEMO 2: Iterative Data Exploration") + + from data_formulator.mcp_server import explore_data + + # --- Example 2a: Explore world economy data --- + print("\n🔍 Example 2a: Explore world economy trends") + print(" Question: 'Explore the relationship between GDP growth, population, and CO2 emissions'") + print(" Max iterations: 3") + print(" (This may take a minute as the AI performs multiple analysis rounds...)\n") + + result = explore_data( + data=SAMPLE_CSV, + question="Explore the relationship between GDP growth, population, and CO2 emissions across countries. 
What patterns emerge?", + data_format="csv", + table_name="world_economy", + max_iterations=3, + ) + + print(f" Status: {result['status']}") + print(f" Total steps completed: {result['total_steps']}") + + if result["status"] == "ok": + print(f"\n 📋 Summary:\n {result['summary']}") + + for step in result["steps"]: + i = step["iteration"] + print(f"\n --- Step {i} ---") + print(f" Question: {step['question']}") + if step.get("status") == "ok": + print(f" Chart type: {step['chart_type']}") + print(f" Encodings: {step['chart_encodings']}") + print(f" Data rows: {step.get('transformed_data_full_count', 'N/A')}") + save_chart(step.get("chart_image_base64"), f"demo2a_step{i}.png") + else: + print(f" Error: {step.get('message', 'Unknown')}") + + save_json_result(result, "demo2a_exploration.json") + + # --- Example 2b: Explore student performance --- + print("\n\n🔍 Example 2b: Explore student performance patterns") + print(" Question: 'Analyze student performance across subjects and identify strengths/weaknesses'") + print(" Max iterations: 3\n") + + result = explore_data( + data=SAMPLE_JSON, + question="Analyze student performance across subjects. Which subjects are hardest? Do grades correlate with specific subjects?", + data_format="json", + table_name="student_scores", + max_iterations=3, + ) + + print(f" Status: {result['status']}") + print(f" Total steps: {result['total_steps']}") + + if result["status"] == "ok": + print(f"\n 📋 Summary:\n {result['summary']}") + + for step in result["steps"]: + i = step["iteration"] + print(f"\n --- Step {i} ---") + print(f" Question: {step['question']}") + if step.get("status") == "ok": + print(f" Chart type: {step['chart_type']}") + print(f" Summary: {step.get('instruction_summary', '')}") + save_chart(step.get("chart_image_base64"), f"demo2b_step{i}.png") + + save_json_result(result, "demo2b_exploration.json") + + +# ╔═════════════════════════════════════════════════════════════════════════╗ +# ║ DEMO 3: MCP Client over stdio (full MCP protocol) ║ +# ╚═════════════════════════════════════════════════════════════════════════╝ + +async def demo_3_mcp_client(): + """ + Demonstrates connecting to the Data Formulator MCP server as a proper + MCP client over stdio transport. + + This is how real MCP hosts (Claude Desktop, VS Code Copilot, etc.) would + connect to the server. + """ + print_section("DEMO 3: MCP Client over stdio") + + try: + from mcp import ClientSession, StdioServerParameters + from mcp.client.stdio import stdio_client + except ImportError: + print(" ⚠ MCP client SDK not installed. Install with: pip install mcp") + print(" Skipping Demo 3.") + return + + # The server script to run + server_script = str(PROJECT_ROOT / "py-src" / "data_formulator" / "mcp_server.py") + + server_params = StdioServerParameters( + command=sys.executable, + args=[server_script], + env={ + **os.environ, # inherit env (API keys, etc.) 
+ "PYTHONPATH": str(PROJECT_ROOT / "py-src"), + }, + ) + + print(" 🔌 Connecting to Data Formulator MCP server...") + + async with stdio_client(server_params) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + # List available tools + tools = await session.list_tools() + print(f" 📦 Available tools: {[t.name for t in tools.tools]}") + + # --- Call visualize_data via MCP --- + print("\n 📊 Calling visualize_data via MCP protocol...") + + viz_result = await session.call_tool( + "visualize_data", + arguments={ + "data": SAMPLE_CSV, + "instruction": "Show GDP per capita trends over time for each country", + "data_format": "csv", + "table_name": "world_economy", + }, + ) + + # MCP returns content as TextContent or other content types + for content in viz_result.content: + if hasattr(content, "text"): + result = json.loads(content.text) + print(f" Status: {result.get('status')}") + if result.get("status") == "ok": + print(f" Summary: {result.get('instruction_summary')}") + print(f" Chart type: {result.get('chart_type')}") + save_chart(result.get("chart_image_base64"), "demo3_mcp_viz.png") + save_json_result(result, "demo3_mcp_viz.json") + + # --- Call explore_data via MCP --- + print("\n 🔍 Calling explore_data via MCP protocol...") + print(" (This may take a minute...)") + + explore_result = await session.call_tool( + "explore_data", + arguments={ + "data": SAMPLE_CSV, + "question": "What are the key economic trends across countries?", + "data_format": "csv", + "table_name": "world_economy", + "max_iterations": 2, + }, + ) + + for content in explore_result.content: + if hasattr(content, "text"): + result = json.loads(content.text) + print(f" Status: {result.get('status')}") + print(f" Steps: {result.get('total_steps')}") + if result.get("status") == "ok": + for step in result.get("steps", []): + i = step["iteration"] + print(f" Step {i}: {step.get('instruction_summary', step.get('question'))}") + save_chart(step.get("chart_image_base64"), f"demo3_mcp_explore_step{i}.png") + save_json_result(result, "demo3_mcp_explore.json") + + print("\n ✅ MCP client demo complete!") + + +# ╔═════════════════════════════════════════════════════════════════════════╗ +# ║ MAIN ║ +# ╚═════════════════════════════════════════════════════════════════════════╝ + +def main(): + parser = argparse.ArgumentParser( + description="Demo: Data Formulator as an MCP Server", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python test_mcp_server.py # Run demos 1 & 2 (direct invocation) + python test_mcp_server.py --demo 1 # One-shot visualization only + python test_mcp_server.py --demo 2 # Iterative exploration only + python test_mcp_server.py --demo 3 # MCP client over stdio + python test_mcp_server.py --demo all # Run all demos including MCP client + """, + ) + parser.add_argument( + "--demo", + choices=["1", "2", "3", "all"], + default=None, + help="Which demo to run (default: 1 and 2)", + ) + args = parser.parse_args() + + print("🚀 Data Formulator MCP Server Demo") + print(f" Output directory: {OUTPUT_DIR}") + print(f" Model endpoint: {os.getenv('DF_MCP_MODEL_ENDPOINT', 'openai')}") + print(f" Model name: {os.getenv('DF_MCP_MODEL_NAME', 'gpt-4o')}") + + # Check for API key + endpoint = os.getenv("DF_MCP_MODEL_ENDPOINT", "openai") + api_key = os.getenv("DF_MCP_API_KEY", os.getenv(f"{endpoint.upper()}_API_KEY", "")) + if not api_key: + print(f"\n⚠️ No API key found! 
Set one of:") + print(f" export DF_MCP_API_KEY='your-key'") + print(f" export {endpoint.upper()}_API_KEY='your-key'") + print(f" (or set them in api-keys.env)") + sys.exit(1) + + if args.demo == "1": + demo_1_one_shot_visualization() + elif args.demo == "2": + demo_2_iterative_exploration() + elif args.demo == "3": + asyncio.run(demo_3_mcp_client()) + elif args.demo == "all": + demo_1_one_shot_visualization() + demo_2_iterative_exploration() + asyncio.run(demo_3_mcp_client()) + else: + # Default: run demos 1 and 2 + demo_1_one_shot_visualization() + demo_2_iterative_exploration() + + print("\n" + "=" * 70) + print(f" ✅ Demo complete! Check outputs in: {OUTPUT_DIR}") + print("=" * 70) + + +if __name__ == "__main__": + main() From af95c28a1b2ffa07e36ee1727f01a7666c6c410f Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Tue, 10 Feb 2026 13:40:08 -0800 Subject: [PATCH 4/6] updates --- py-src/data_formulator/datalake/workspace.py | 133 +++++---- py-src/data_formulator/mcp_server.py | 293 +++++++++++++++---- py-src/tests/test_mcp_server.py | 213 +++++++------- pyproject.toml | 1 + uv.lock | 2 + 5 files changed, 424 insertions(+), 218 deletions(-) diff --git a/py-src/data_formulator/datalake/workspace.py b/py-src/data_formulator/datalake/workspace.py index 776fdd3f..75a5cd33 100644 --- a/py-src/data_formulator/datalake/workspace.py +++ b/py-src/data_formulator/datalake/workspace.py @@ -579,77 +579,112 @@ class WorkspaceWithTempData: Context manager that temporarily adds temp data (list of {name, rows}) to a workspace as parquet files, yields the same workspace, and removes those files on exit. - OPTIMIZATION: Temp files are written directly to disk WITHOUT metadata updates. - This eliminates metadata file locking contention when multiple temp tables are - created concurrently. Since temp files are ephemeral (exist only during the context), - they don't need to be tracked in workspace.yaml. - - Python code can still access temp files via relative paths (e.g., pd.read_parquet()) - because the sandbox execution runs with workspace._path as the current working directory. - - Use when the client sends in-memory data (e.g. language == "python"): wrap the - workspace so temp tables are visible for the block and then cleaned up. + Two modes controlled by ``register_metadata``: + + **register_metadata=False (default)** + Temp files are written directly to disk WITHOUT metadata updates. + This eliminates metadata file locking contention when multiple temp tables + are created concurrently (the Flask/web-app path). Files use a ``.temp_`` + prefix so they can be identified for crash-recovery cleanup. + Python code can still access them via relative paths because the sandbox + runs with ``workspace._path`` as the working directory. + + **register_metadata=True** + Files are written with their plain sanitised name **and** registered in + ``workspace.yaml`` via ``add_table_metadata``. On exit they are removed + with ``delete_table`` (which cleans up both the file and the metadata + entry). Use this when downstream code needs ``read_data_as_df`` / + ``generate_data_summary`` to resolve tables by name – e.g. the MCP server. 
""" - def __init__(self, workspace: Workspace, temp_data: Optional[list[dict[str, Any]]] = None): + def __init__( + self, + workspace: Workspace, + temp_data: Optional[list[dict[str, Any]]] = None, + register_metadata: bool = False, + ): self._workspace = workspace self._temp_data = temp_data if temp_data else None - self._temp_files: list[Path] = [] # Track file paths for cleanup (not table names) + self._register_metadata = register_metadata + # When register_metadata=False we track file *paths* for cleanup. + # When register_metadata=True we track table *names* for delete_table(). + self._temp_files: list[Path] = [] + self._temp_table_names: list[str] = [] def __enter__(self) -> Workspace: if not self._temp_data: return self._workspace + from datetime import datetime from data_formulator.datalake.parquet_utils import sanitize_table_name for item in self._temp_data: base_name = item.get("name", "table") safe_name = sanitize_table_name(base_name) - # Use .temp_ prefix to distinguish from persistent tables - # This also helps with crash recovery - stale temp files can be identified and cleaned up - temp_filename = f".temp_{safe_name}.parquet" - file_path = self._workspace._path / temp_filename - - # Handle name conflicts by checking filesystem directly (no metadata read needed) - counter = 1 - while file_path.exists(): - temp_filename = f".temp_{safe_name}_{counter}.parquet" - file_path = self._workspace._path / temp_filename - counter += 1 - - # CRITICAL: Write parquet directly - NO metadata update - # This is the key optimization that eliminates metadata file locking contention. - # Temp files don't need metadata tracking since they're ephemeral and only - # live for the duration of this context. - # - # Python code can still access them via relative paths since the sandbox - # runs with workspace._path as cwd, e.g.: - # pd.read_parquet('.temp_sales.parquet') - # conn.execute("SELECT * FROM read_parquet('.temp_sales.parquet')") rows = item.get("rows", []) df = pd.DataFrame(rows) if rows else pd.DataFrame() - df.to_parquet(file_path) - self._temp_files.append(file_path) - logger.debug( - f"Added temp file {file_path.name} to workspace " - f"({len(df)} rows, no metadata update)" - ) + if self._register_metadata: + # ---- metadata-aware path ---- + filename = f"{safe_name}.parquet" + file_path = self._workspace._path / filename + df.to_parquet(file_path) + + from data_formulator.datalake.metadata import TableMetadata + + meta = TableMetadata( + name=safe_name, + source_type="upload", + filename=filename, + file_type="parquet", + created_at=datetime.now(), + row_count=len(df), + ) + self._workspace.add_table_metadata(meta) + self._temp_table_names.append(safe_name) + logger.debug( + f"Added table {safe_name} to workspace with metadata " + f"({len(df)} rows)" + ) + else: + # ---- fast path (no metadata) ---- + temp_filename = f".temp_{safe_name}.parquet" + file_path = self._workspace._path / temp_filename + + counter = 1 + while file_path.exists(): + temp_filename = f".temp_{safe_name}_{counter}.parquet" + file_path = self._workspace._path / temp_filename + counter += 1 + + df.to_parquet(file_path) + self._temp_files.append(file_path) + logger.debug( + f"Added temp file {file_path.name} to workspace " + f"({len(df)} rows, no metadata update)" + ) return self._workspace def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - # Delete temp files directly - NO metadata update - # This is safe because we never added them to metadata in the first place - for file_path in 
self._temp_files: - try: - file_path.unlink(missing_ok=True) - logger.debug(f"Removed temp file {file_path.name}") - except Exception as e: - logger.warning(f"Failed to remove temp file {file_path}: {e}") - - self._temp_files.clear() + if self._register_metadata: + # delete_table removes both the parquet file and the metadata entry + for name in self._temp_table_names: + try: + self._workspace.delete_table(name) + logger.debug(f"Deleted table {name} (file + metadata)") + except Exception as e: + logger.warning(f"Failed to delete table {name}: {e}") + self._temp_table_names.clear() + else: + for file_path in self._temp_files: + try: + file_path.unlink(missing_ok=True) + logger.debug(f"Removed temp file {file_path.name}") + except Exception as e: + logger.warning(f"Failed to remove temp file {file_path}: {e}") + self._temp_files.clear() # ============================================================================== diff --git a/py-src/data_formulator/mcp_server.py b/py-src/data_formulator/mcp_server.py index d1501ed0..72c26ae8 100644 --- a/py-src/data_formulator/mcp_server.py +++ b/py-src/data_formulator/mcp_server.py @@ -7,8 +7,10 @@ Exposes Data Formulator's AI-powered data visualization capabilities as an MCP (Model Context Protocol) server with the following tools: -1. visualize_data: Given data + instruction → transformed data + chart (PNG) -2. explore_data: Multi-turn iterative exploration → rounds of response + data + chart +1. list_demo_data: List predefined demo datasets with URLs +2. visualize_data: Given data URLs + instruction → transformed data + chart (PNG) +3. explore_data: Multi-turn iterative exploration → rounds of response + data + chart +4. create_chart: Create a chart directly from data URLs + field mappings Setup: # Install with uv (recommended) @@ -27,6 +29,27 @@ uv run py-src/data_formulator/mcp_server.py Configure in Claude Desktop (claude_desktop_config.json): + + Azure OpenAI with Azure AD auth (recommended for Microsoft users): + { + "mcpServers": { + "data-formulator": { + "command": "uv", + "args": [ + "--directory", "/path/to/data-formulator", + "run", "python", "-m", "data_formulator.mcp_server" + ], + "env": { + "DF_MCP_MODEL_ENDPOINT": "azure", + "DF_MCP_MODEL_NAME": "gpt-4o", + "DF_MCP_API_BASE": "https://YOUR_RESOURCE.openai.azure.com/", + "DF_MCP_API_VERSION": "2025-04-01-preview" + } + } + } + } + + OpenAI (with API key): { "mcpServers": { "data-formulator": { @@ -45,6 +68,29 @@ } Configure in VS Code (settings.json): + + Azure OpenAI with Azure AD auth (recommended for Microsoft users): + { + "mcp": { + "servers": { + "data-formulator": { + "command": "uv", + "args": [ + "--directory", "/path/to/data-formulator", + "run", "python", "-m", "data_formulator.mcp_server" + ], + "env": { + "DF_MCP_MODEL_ENDPOINT": "azure", + "DF_MCP_MODEL_NAME": "gpt-4o", + "DF_MCP_API_BASE": "https://YOUR_RESOURCE.openai.azure.com/", + "DF_MCP_API_VERSION": "2025-04-01-preview" + } + } + } + } + } + + OpenAI (with API key): { "mcp": { "servers": { @@ -65,12 +111,26 @@ } Environment variables: - OPENAI_API_KEY / ANTHROPIC_API_KEY / etc. - API keys for LLM providers - DF_MCP_MODEL_ENDPOINT - LLM provider (default: "openai") + DF_MCP_MODEL_ENDPOINT - LLM provider: "azure" | "openai" | "anthropic" | "gemini" | "ollama" + (default: "azure") DF_MCP_MODEL_NAME - Model name (default: "gpt-4o") - DF_MCP_API_KEY - API key (overrides provider-specific key) - DF_MCP_API_BASE - Custom API base URL (optional) + DF_MCP_API_BASE - API base URL (required for azure, e.g. 
"https://YOUR_RESOURCE.openai.azure.com/") + DF_MCP_API_VERSION - API version for Azure (default: "2025-04-01-preview") + DF_MCP_API_KEY - API key (optional for Azure AD auth; required for OpenAI/Anthropic) + OPENAI_API_KEY - Fallback API key for OpenAI endpoint + ANTHROPIC_API_KEY - Fallback API key for Anthropic endpoint DATALAKE_ROOT - Workspace root directory (optional) + + Azure AD auth (no API key needed): + When using DF_MCP_MODEL_ENDPOINT=azure with no API key set, the server + automatically uses DefaultAzureCredential for token-based auth. + Make sure you are logged in via `az login` or have a managed identity. + + Other providers: + export DF_MCP_MODEL_ENDPOINT="openai" && export OPENAI_API_KEY="sk-..." + export DF_MCP_MODEL_ENDPOINT="anthropic" && export ANTHROPIC_API_KEY="sk-ant-..." + export DF_MCP_MODEL_ENDPOINT="gemini" && export GEMINI_API_KEY="..." + export DF_MCP_MODEL_ENDPOINT="ollama" # no key needed, runs locally """ import os @@ -79,10 +139,13 @@ import base64 import logging import tempfile +from io import StringIO, BytesIO from pathlib import Path from typing import Any +from urllib.parse import urlparse import pandas as pd +import requests from dotenv import load_dotenv @@ -104,6 +167,7 @@ create_chart_spec, ) from data_formulator.workflows.exploration_flow import create_chart_spec_from_data +from data_formulator.example_datasets_config import EXAMPLE_DATASETS logger = logging.getLogger(__name__) @@ -113,7 +177,7 @@ def _get_model_config() -> dict[str, str]: """Build model config from environment variables.""" - endpoint = os.getenv("DF_MCP_MODEL_ENDPOINT", "openai") + endpoint = os.getenv("DF_MCP_MODEL_ENDPOINT", "azure") model = os.getenv("DF_MCP_MODEL_NAME", "gpt-4o") # Resolve API key: explicit > provider-specific @@ -143,32 +207,83 @@ def _get_workspace(session_id: str = "mcp_session") -> Workspace: return Workspace(session_id) -def _parse_data_input(data: str, data_format: str = "auto") -> pd.DataFrame: +def _detect_format_from_url(url: str) -> str: + """Detect data format from URL file extension.""" + path = urlparse(url).path.lower() + if path.endswith(".csv"): + return "csv" + elif path.endswith(".tsv"): + return "tsv" + elif path.endswith(".json"): + return "json" + elif path.endswith(".jsonl"): + return "jsonl" + elif path.endswith(".xlsx") or path.endswith(".xls"): + return "xlsx" + return "csv" # default to CSV + + +def _load_data_from_url(url: str, data_format: str = "auto") -> pd.DataFrame: """ - Parse data from a string (JSON or CSV) into a DataFrame. + Download and parse tabular data from a URL. + + Supported formats: csv, tsv, json, jsonl, xlsx. + If data_format is "auto", the format is detected from the URL extension. Args: - data: Raw data string (JSON array or CSV text) - data_format: "json", "csv", or "auto" (detect automatically) + url: URL pointing to a data file (csv, tsv, json, jsonl, or xlsx). + data_format: "csv", "tsv", "json", "jsonl", "xlsx", or "auto". 
Returns: pandas DataFrame """ if data_format == "auto": - stripped = data.strip() - if stripped.startswith("[") or stripped.startswith("{"): - data_format = "json" - else: - data_format = "csv" + data_format = _detect_format_from_url(url) + + resp = requests.get(url, timeout=60) + resp.raise_for_status() if data_format == "json": - parsed = json.loads(data) + parsed = resp.json() if isinstance(parsed, dict): parsed = [parsed] return pd.DataFrame(parsed) - else: - from io import StringIO - return pd.read_csv(StringIO(data)) + elif data_format == "jsonl": + lines = resp.text.strip().split("\n") + records = [json.loads(line) for line in lines if line.strip()] + return pd.DataFrame(records) + elif data_format == "tsv": + return pd.read_csv(StringIO(resp.text), sep="\t") + elif data_format in ("xlsx", "xls"): + return pd.read_excel(BytesIO(resp.content)) + else: # csv + return pd.read_csv(StringIO(resp.text)) + + +def _load_multiple_urls(data_urls: list[str], table_names: list[str] | None = None) -> list[dict]: + """ + Load multiple data URLs and return a list of table dicts. + + Args: + data_urls: List of URLs to load. + table_names: Optional list of names for each table. + If not provided, names are derived from the URL filename. + + Returns: + List of {"name": str, "rows": list[dict]} dicts. + """ + tables = [] + for i, url in enumerate(data_urls): + df = _load_data_from_url(url) + rows = json.loads(df.to_json(orient="records", date_format="iso")) + if table_names and i < len(table_names): + name = table_names[i] + else: + # Derive name from URL filename (strip extension) + filename = urlparse(url).path.split("/")[-1] + name = filename.rsplit(".", 1)[0] if "." in filename else filename + tables.append({"name": name, "rows": rows}) + return tables def _make_chart_image( @@ -196,25 +311,76 @@ def _make_chart_image( return None + + + # --------------------------------------------------------------------------- # MCP Server # --------------------------------------------------------------------------- mcp = FastMCP( "Data Formulator", - description=( + instructions=( "AI-powered data visualization server. " - "Transform data, generate charts, and explore datasets interactively." + "Transform data, generate charts, and explore datasets interactively. " + "Use list_demo_data to browse available demo datasets, then pass their " + "URLs to visualize_data, explore_data, or create_chart." ), ) +@mcp.tool() +def list_demo_data() -> dict[str, Any]: + """ + List predefined demo datasets available for visualization and exploration. + + Returns a curated list of datasets with their URLs, formats, descriptions, + and sample data. Use the returned URLs as input to visualize_data, + explore_data, or create_chart. + + Returns: + A dictionary with: + - status: "ok" + - datasets: List of dataset entries, each containing: + - name: Human-readable dataset name + - source: Data source (e.g. "vegadatasets", "tidytuesday") + - description: Short description of the dataset + - tables: List of tables, each with: + - url: URL to download the data file + - format: File format ("csv", "json", etc.) 
+ - sample: A few sample rows (string or list) to preview the data + """ + datasets = [] + for ds in EXAMPLE_DATASETS: + entry = { + "name": ds["name"], + "source": ds.get("source", ""), + "description": ds.get("description", ""), + "tables": [], + } + for table in ds.get("tables", []): + t = { + "url": table["url"], + "format": table.get("format", "csv"), + } + # Include a short sample preview + sample = table.get("sample", "") + if isinstance(sample, list): + t["sample"] = sample[:5] # first 5 rows + elif isinstance(sample, str): + lines = sample.strip().split("\n") + t["sample"] = "\n".join(lines[:6]) # header + 5 rows + entry["tables"].append(t) + datasets.append(entry) + + return {"status": "ok", "datasets": datasets} + + @mcp.tool() def visualize_data( - data: str, + data_urls: list[str], instruction: str, - data_format: str = "auto", - table_name: str = "input_data", + table_names: list[str] | None = None, chart_type: str = "", x: str = "", y: str = "", @@ -226,10 +392,13 @@ def visualize_data( """ Transform data and generate a visualization based on a natural language instruction. - Given tabular data (JSON or CSV) and a natural language instruction, this tool: - 1. Uses an AI agent to understand the intent and generate transformation code - 2. Executes the transformation to produce the output data - 3. Creates a chart (PNG) from the transformed data + Given one or more data URLs and a natural language instruction, this tool: + 1. Downloads the data from the URLs (supports csv, tsv, json, jsonl, xlsx) + 2. Uses an AI agent to understand the intent and generate transformation code + 3. Executes the transformation to produce the output data + 4. Creates a chart (PNG) from the transformed data + + Use list_demo_data to discover available demo datasets and their URLs. Use this for one-shot data analysis tasks like: - "Show average sales by region as a bar chart" @@ -237,10 +406,11 @@ def visualize_data( - "Forecast the next 6 months of revenue" Args: - data: Tabular data as a JSON array of objects or CSV text. + data_urls: List of URLs pointing to data files (csv, tsv, json, jsonl, xlsx). + The format is auto-detected from the file extension. instruction: Natural language description of what visualization to create. - data_format: "json", "csv", or "auto" (default: auto-detect). - table_name: Name for the input table (default: "input_data"). + table_names: Optional list of names for each table (one per URL). + If not provided, names are derived from the URL filename. chart_type: Optional chart type hint ("bar", "point", "line", "area", "heatmap", "group_bar", "boxplot", "worldmap", "usmap"). Leave empty to let the AI decide. x: Optional field name for x-axis encoding. 
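Illustrative sketch (not a patch hunk): the reworked visualize_data can be exercised in-process with a URL instead of inline data. The dataset URL below is one of the demo URLs used in the updated test script later in this series; the instruction text is arbitrary, and the result keys match the ones printed by that script:

    from data_formulator.mcp_server import visualize_data

    DISASTERS_URL = "https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/disasters.csv"

    result = visualize_data(
        data_urls=[DISASTERS_URL],   # format auto-detected from the .csv extension
        instruction="Show total deaths by disaster type over time as a line chart",
        table_names=["disasters"],   # optional; otherwise derived from the URL filename
    )
    print(result["status"], result.get("chart_type"), result.get("chart_encodings"))
    png = result.get("chart_image_base64")   # base64 PNG data URL when a chart was rendered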
@@ -263,11 +433,8 @@ def visualize_data( - reasoning: The AI's reasoning about the transformation """ try: - # Parse input data - df = _parse_data_input(data, data_format) - rows = json.loads(df.to_json(orient="records", date_format="iso")) - - input_tables = [{"name": table_name, "rows": rows}] + # Load data from URLs + input_tables = _load_multiple_urls(data_urls, table_names) # Build chart encodings from optional hints chart_encodings = {} @@ -283,9 +450,9 @@ def visualize_data( # Set up workspace + agent client = _get_client() workspace = _get_workspace() - temp_data = [{"name": table_name, "rows": rows}] - with WorkspaceWithTempData(workspace, temp_data) as ws: + # Use register_metadata=True so agents can resolve tables via read_data_as_df + with WorkspaceWithTempData(workspace, input_tables, register_metadata=True) as ws: if mode == "recommendation": agent = DataRecAgent(client=client, workspace=ws) results = agent.run(input_tables, instruction, n=1) @@ -358,20 +525,22 @@ def visualize_data( @mcp.tool() def explore_data( - data: str, + data_urls: list[str], question: str, - data_format: str = "auto", - table_name: str = "input_data", + table_names: list[str] | None = None, max_iterations: int = 3, max_repair_attempts: int = 1, ) -> dict[str, Any]: """ Iteratively explore a dataset through multiple rounds of AI-driven analysis. - Given tabular data and a high-level exploration question, this tool: - 1. Breaks the question into a multi-step analysis plan - 2. For each step: transforms data, creates a chart, and decides the next step - 3. Returns all exploration steps with their data and charts + Given one or more data URLs and a high-level exploration question, this tool: + 1. Downloads the data from the URLs (supports csv, tsv, json, jsonl, xlsx) + 2. Breaks the question into a multi-step analysis plan + 3. For each step: transforms data, creates a chart, and decides the next step + 4. Returns all exploration steps with their data and charts + + Use list_demo_data to discover available demo datasets and their URLs. Use this for open-ended data exploration like: - "What are the key trends and patterns in this sales data?" @@ -379,10 +548,11 @@ def explore_data( - "Analyze the relationship between weather and energy consumption" Args: - data: Tabular data as a JSON array of objects or CSV text. + data_urls: List of URLs pointing to data files (csv, tsv, json, jsonl, xlsx). + The format is auto-detected from the file extension. question: High-level exploration question or topic. - data_format: "json", "csv", or "auto" (default: auto-detect). - table_name: Name for the input table (default: "input_data"). + table_names: Optional list of names for each table (one per URL). + If not provided, names are derived from the URL filename. max_iterations: Maximum number of exploration rounds (default: 3). max_repair_attempts: Max code repair retries per step (default: 1). 
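The register_metadata flag used in the hunk above changes how WorkspaceWithTempData cleans up after itself. A minimal sketch of both paths, assuming a throwaway session id and a one-row payload (read_data_as_df and delete_table are the names referenced in the workspace.py docstring earlier in this patch; this is illustrative, not a patch hunk):

    from data_formulator.datalake.workspace import Workspace, WorkspaceWithTempData

    ws = Workspace("scratch_session")
    tables = [{"name": "sales", "rows": [{"region": "EU", "total": 10}]}]

    # Fast path (default): writes .temp_sales.parquet, never touches workspace.yaml;
    # the table is only reachable via its relative path inside the sandbox cwd.
    with WorkspaceWithTempData(ws, tables) as w:
        pass

    # Metadata path (the MCP server's choice): writes sales.parquet and registers it,
    # so agents can resolve it by name; delete_table("sales") runs on __exit__.
    with WorkspaceWithTempData(ws, tables, register_metadata=True) as w:
        df = w.read_data_as_df("sales")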
@@ -403,19 +573,15 @@ def explore_data( - total_steps: Number of steps completed """ try: - # Parse input data - df = _parse_data_input(data, data_format) - rows = json.loads(df.to_json(orient="records", date_format="iso")) - - input_tables = [{"name": table_name, "rows": rows}] + # Load data from URLs + input_tables = _load_multiple_urls(data_urls, table_names) client = _get_client() workspace = _get_workspace() - temp_data = [{"name": table_name, "rows": rows}] - steps = [] - with WorkspaceWithTempData(workspace, temp_data) as ws: + # Use register_metadata=True so agents can resolve tables via read_data_as_df + with WorkspaceWithTempData(workspace, input_tables, register_metadata=True) as ws: rec_agent = DataRecAgent(client=client, workspace=ws) exploration_agent = ExplorationAgent(client=client, workspace=ws) @@ -534,7 +700,6 @@ def explore_data( except Exception as e: logger.warning(f"Exploration planning failed: {e}") break - # Build summary summary_parts = [] for s in steps: @@ -556,23 +721,24 @@ def explore_data( @mcp.tool() def create_chart( - data: str, + data_url: str, chart_type: str, x: str = "", y: str = "", color: str = "", size: str = "", facet: str = "", - data_format: str = "auto", ) -> dict[str, Any]: """ - Create a chart directly from data and field mappings (no AI, no transformation). + Create a chart directly from a data URL and field mappings (no AI, no transformation). This is a fast, deterministic tool for creating standard charts when you already know exactly which fields to use and how to map them. + Use list_demo_data to discover available demo datasets and their URLs. + Args: - data: Tabular data as a JSON array of objects or CSV text. + data_url: URL pointing to a data file (csv, tsv, json, jsonl, xlsx). chart_type: One of "bar", "point", "line", "area", "heatmap", "group_bar", "boxplot". x: Field name for x-axis. @@ -580,7 +746,6 @@ def create_chart( color: Optional field name for color encoding. size: Optional field name for size encoding. facet: Optional field name for faceting. - data_format: "json", "csv", or "auto". Returns: A dictionary with: @@ -590,7 +755,7 @@ def create_chart( - fields_used: List of fields mapped to channels """ try: - df = _parse_data_input(data, data_format) + df = _load_data_from_url(data_url) # Build encoding dict fields = [] diff --git a/py-src/tests/test_mcp_server.py b/py-src/tests/test_mcp_server.py index 27071c98..b212da8d 100644 --- a/py-src/tests/test_mcp_server.py +++ b/py-src/tests/test_mcp_server.py @@ -18,12 +18,28 @@ # Install dependencies with uv (from project root): uv pip install -e ".[mcp]" # or: - uv pip install mcp pandas vl-convert-python + uv pip install mcp pandas vl-convert-python requests - # Set your LLM API key: - export OPENAI_API_KEY="sk-..." # or ANTHROPIC_API_KEY, etc. - export DF_MCP_MODEL_ENDPOINT="openai" # openai | anthropic | azure | gemini | ollama - export DF_MCP_MODEL_NAME="gpt-4o" # model name + # Azure OpenAI with Azure AD auth (recommended for Microsoft users): + # No API key needed — uses DefaultAzureCredential (az login). + export DF_MCP_MODEL_ENDPOINT="azure" + export DF_MCP_MODEL_NAME="gpt-4o" + export DF_MCP_API_BASE="https://YOUR_RESOURCE.openai.azure.com/" + export DF_MCP_API_VERSION="2025-04-01-preview" # optional, has default + + # Alternative: OpenAI + # export DF_MCP_MODEL_ENDPOINT="openai" + # export DF_MCP_MODEL_NAME="gpt-4o" + # export OPENAI_API_KEY="sk-..." 
+ + # Alternative: Anthropic + # export DF_MCP_MODEL_ENDPOINT="anthropic" + # export DF_MCP_MODEL_NAME="claude-sonnet-4-20250514" + # export ANTHROPIC_API_KEY="sk-ant-..." + + # Alternative: Ollama (local, no key) + # export DF_MCP_MODEL_ENDPOINT="ollama" + # export DF_MCP_MODEL_NAME="llama3" Usage: # Run all demos with uv (recommended): @@ -58,53 +74,17 @@ # ╔═════════════════════════════════════════════════════════════════════════╗ -# ║ SAMPLE DATA ║ +# ║ DEMO DATA URLS ║ # ╚═════════════════════════════════════════════════════════════════════════╝ -SAMPLE_CSV = """Country,Year,GDP_Billion,Population_Million,CO2_Emission_MT -United States,2018,20580,327,5280 -United States,2019,21430,329,5130 -United States,2020,20940,331,4570 -United States,2021,23320,332,5010 -United States,2022,25460,333,5060 -China,2018,13890,1393,10060 -China,2019,14280,1398,10170 -China,2020,14720,1402,10670 -China,2021,17730,1405,11470 -China,2022,17960,1406,11400 -Germany,2018,3970,83,759 -Germany,2019,3890,83,702 -Germany,2020,3890,83,644 -Germany,2021,4220,83,675 -Germany,2022,4070,84,666 -India,2018,2710,1353,2480 -India,2019,2870,1366,2600 -India,2020,2660,1380,2440 -India,2021,3180,1393,2710 -India,2022,3390,1407,2830 -Japan,2018,4970,126,1160 -Japan,2019,5080,126,1140 -Japan,2020,5040,126,1060 -Japan,2021,4940,125,1070 -Japan,2022,4230,125,1050 -Brazil,2018,1870,210,460 -Brazil,2019,1870,211,470 -Brazil,2020,1440,212,440 -Brazil,2021,1650,213,490 -Brazil,2022,1920,214,490""" - -SAMPLE_JSON = json.dumps([ - {"Student": "Alice", "Math": 92, "Science": 88, "English": 95, "History": 78, "Grade": "A"}, - {"Student": "Bob", "Math": 76, "Science": 82, "English": 71, "History": 89, "Grade": "B"}, - {"Student": "Charlie", "Math": 88, "Science": 91, "English": 84, "History": 92, "Grade": "A"}, - {"Student": "Diana", "Math": 65, "Science": 70, "English": 90, "History": 85, "Grade": "B"}, - {"Student": "Eve", "Math": 95, "Science": 97, "English": 92, "History": 88, "Grade": "A"}, - {"Student": "Frank", "Math": 58, "Science": 62, "English": 68, "History": 72, "Grade": "C"}, - {"Student": "Grace", "Math": 84, "Science": 79, "English": 88, "History": 91, "Grade": "B"}, - {"Student": "Henry", "Math": 91, "Science": 85, "English": 79, "History": 83, "Grade": "A"}, - {"Student": "Iris", "Math": 73, "Science": 68, "English": 82, "History": 76, "Grade": "B"}, - {"Student": "Jack", "Math": 87, "Science": 93, "English": 86, "History": 80, "Grade": "A"}, -]) +# These URLs come from the predefined demo datasets (also available via +# the list_demo_data MCP tool). You can use any publicly accessible URL +# pointing to a csv, tsv, json, jsonl, or xlsx file. 
+ +GAPMINDER_URL = "https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/gapminder.json" +DISASTERS_URL = "https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/disasters.csv" +LIFE_EXPECTANCY_URL = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2023/2023-12-05/life_expectancy.csv" +MOVIES_URL = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2025/2025-07-29/movies.csv" # ╔═════════════════════════════════════════════════════════════════════════╗ @@ -147,23 +127,33 @@ def print_section(title: str) -> None: def demo_1_one_shot_visualization(): """ Demonstrates the `visualize_data` tool: - Input: data (CSV or JSON) + natural language instruction + Input: data URLs + natural language instruction Output: transformed data + chart image (PNG) + reasoning """ print_section("DEMO 1: One-shot Data Visualization") - # Import the MCP tool function directly - from data_formulator.mcp_server import visualize_data + # Import the MCP tool functions directly + from data_formulator.mcp_server import visualize_data, list_demo_data + + # --- Example 1.0: List demo datasets --- + print("\n📋 Example 1.0: List available demo datasets") + demo_data = list_demo_data() + print(f" Found {len(demo_data['datasets'])} demo datasets:") + for ds in demo_data["datasets"]: + urls = [t["url"] for t in ds["tables"]] + print(f" • {ds['name']}: {ds['description'][:60]}...") + for t in ds["tables"]: + print(f" URL: {t['url'][:80]}... ({t['format']})") - # --- Example 1a: CSV data, let AI recommend a visualization --- - print("\n📊 Example 1a: GDP trends (CSV, AI-recommended chart)") - print(" Instruction: 'Show GDP trends over time for each country as a line chart'") + # --- Example 1a: Gapminder (JSON URL), let AI recommend a visualization --- + print("\n📊 Example 1a: Gapminder life expectancy trends (JSON URL)") + print(f" URL: {GAPMINDER_URL}") + print(" Instruction: 'Show life expectancy trends over time for the top 5 most populous countries'") result = visualize_data( - data=SAMPLE_CSV, - instruction="Show GDP trends over time for each country as a line chart", - data_format="csv", - table_name="world_economy", + data_urls=[GAPMINDER_URL], + instruction="Show life expectancy trends over time for the top 5 most populous countries as a line chart", + table_names=["gapminder"], ) print(f" Status: {result['status']}") @@ -173,20 +163,20 @@ def demo_1_one_shot_visualization(): print(f" Encodings: {result['chart_encodings']}") print(f" Data rows: {result['transformed_data_full_count']}") print(f" Code:\n{result['code'][:300]}...") - save_chart(result.get("chart_image_base64"), "demo1a_gdp_trends.png") + save_chart(result.get("chart_image_base64"), "demo1a_gapminder.png") save_json_result(result, "demo1a_result.json") else: print(f" Error: {result.get('message', 'Unknown')}") - # --- Example 1b: JSON data, with encoding hints --- - print("\n📊 Example 1b: Student scores (JSON, with encoding hints)") - print(" Instruction: 'Compare students by their average score across all subjects'") + # --- Example 1b: Disasters (CSV URL) --- + print("\n📊 Example 1b: Natural disasters deaths over time (CSV URL)") + print(f" URL: {DISASTERS_URL}") + print(" Instruction: 'Show total deaths by disaster type over time'") result = visualize_data( - data=SAMPLE_JSON, - instruction="Compare students by their average score across all subjects", - data_format="json", - table_name="student_scores", + data_urls=[DISASTERS_URL], + 
instruction="Show the total deaths by disaster entity for the top 5 deadliest disaster types as a bar chart", + table_names=["disasters"], ) print(f" Status: {result['status']}") @@ -195,20 +185,20 @@ def demo_1_one_shot_visualization(): print(f" Chart type: {result['chart_type']}") print(f" Encodings: {result['chart_encodings']}") print(f" Output fields: {result['reasoning']['output_fields']}") - save_chart(result.get("chart_image_base64"), "demo1b_student_avg.png") + save_chart(result.get("chart_image_base64"), "demo1b_disasters.png") save_json_result(result, "demo1b_result.json") else: print(f" Error: {result.get('message', 'Unknown')}") - # --- Example 1c: CO2 per capita analysis --- - print("\n📊 Example 1c: CO2 per capita (CSV, computed metric)") - print(" Instruction: 'Calculate CO2 emissions per capita and show as a grouped bar chart by country and year'") + # --- Example 1c: Netflix movies (CSV URL), computed metric --- + print("\n📊 Example 1c: Netflix most viewed movies (CSV URL)") + print(f" URL: {MOVIES_URL}") + print(" Instruction: 'Show top 10 most viewed movies'") result = visualize_data( - data=SAMPLE_CSV, - instruction="Calculate CO2 emissions per capita (CO2 / Population) and show as a grouped bar chart by country for the latest year", - data_format="csv", - table_name="world_economy", + data_urls=[MOVIES_URL], + instruction="Show the top 10 most viewed movies as a horizontal bar chart sorted by views", + table_names=["netflix_movies"], ) print(f" Status: {result['status']}") @@ -216,7 +206,7 @@ def demo_1_one_shot_visualization(): print(f" Summary: {result['instruction_summary']}") print(f" Chart type: {result['chart_type']}") print(f" Data preview: {result['transformed_data'][:3]}") - save_chart(result.get("chart_image_base64"), "demo1c_co2_per_capita.png") + save_chart(result.get("chart_image_base64"), "demo1c_netflix.png") save_json_result(result, "demo1c_result.json") else: print(f" Error: {result.get('message', 'Unknown')}") @@ -229,7 +219,7 @@ def demo_1_one_shot_visualization(): def demo_2_iterative_exploration(): """ Demonstrates the `explore_data` tool: - Input: data + high-level question + Input: data URLs + high-level question Output: multiple rounds of analysis, each with data + chart + reasoning The AI agent: @@ -242,17 +232,17 @@ def demo_2_iterative_exploration(): from data_formulator.mcp_server import explore_data - # --- Example 2a: Explore world economy data --- - print("\n🔍 Example 2a: Explore world economy trends") - print(" Question: 'Explore the relationship between GDP growth, population, and CO2 emissions'") + # --- Example 2a: Explore Gapminder data --- + print("\n🔍 Example 2a: Explore Gapminder global development trends") + print(f" URL: {GAPMINDER_URL}") + print(" Question: 'Explore the relationship between population growth, life expectancy, and fertility'") print(" Max iterations: 3") print(" (This may take a minute as the AI performs multiple analysis rounds...)\n") result = explore_data( - data=SAMPLE_CSV, - question="Explore the relationship between GDP growth, population, and CO2 emissions across countries. What patterns emerge?", - data_format="csv", - table_name="world_economy", + data_urls=[GAPMINDER_URL], + question="Explore the relationship between population growth, life expectancy, and fertility rates across countries. 
What patterns emerge?", + table_names=["gapminder"], max_iterations=3, ) @@ -276,16 +266,16 @@ def demo_2_iterative_exploration(): save_json_result(result, "demo2a_exploration.json") - # --- Example 2b: Explore student performance --- - print("\n\n🔍 Example 2b: Explore student performance patterns") - print(" Question: 'Analyze student performance across subjects and identify strengths/weaknesses'") + # --- Example 2b: Explore life expectancy data --- + print("\n\n🔍 Example 2b: Explore life expectancy across countries") + print(f" URL: {LIFE_EXPECTANCY_URL}") + print(" Question: 'Analyze life expectancy trends and identify countries with the fastest improvements'") print(" Max iterations: 3\n") result = explore_data( - data=SAMPLE_JSON, - question="Analyze student performance across subjects. Which subjects are hardest? Do grades correlate with specific subjects?", - data_format="json", - table_name="student_scores", + data_urls=[LIFE_EXPECTANCY_URL], + question="Analyze life expectancy trends over time. Which regions improved the most? Are there any countries that regressed?", + table_names=["life_expectancy"], max_iterations=3, ) @@ -351,16 +341,23 @@ async def demo_3_mcp_client(): tools = await session.list_tools() print(f" 📦 Available tools: {[t.name for t in tools.tools]}") + # --- Call list_demo_data via MCP --- + print("\n 📋 Calling list_demo_data via MCP protocol...") + demo_result = await session.call_tool("list_demo_data", arguments={}) + for content in demo_result.content: + if hasattr(content, "text"): + result = json.loads(content.text) + print(f" Found {len(result.get('datasets', []))} demo datasets") + # --- Call visualize_data via MCP --- print("\n 📊 Calling visualize_data via MCP protocol...") viz_result = await session.call_tool( "visualize_data", arguments={ - "data": SAMPLE_CSV, - "instruction": "Show GDP per capita trends over time for each country", - "data_format": "csv", - "table_name": "world_economy", + "data_urls": [GAPMINDER_URL], + "instruction": "Show life expectancy vs fertility as a scatter plot colored by cluster", + "table_names": ["gapminder"], }, ) @@ -382,10 +379,9 @@ async def demo_3_mcp_client(): explore_result = await session.call_tool( "explore_data", arguments={ - "data": SAMPLE_CSV, - "question": "What are the key economic trends across countries?", - "data_format": "csv", - "table_name": "world_economy", + "data_urls": [DISASTERS_URL], + "question": "What are the most common and deadliest types of natural disasters?", + "table_names": ["disasters"], "max_iterations": 2, }, ) @@ -430,19 +426,26 @@ def main(): ) args = parser.parse_args() + endpoint = os.getenv("DF_MCP_MODEL_ENDPOINT", "azure") + print("🚀 Data Formulator MCP Server Demo") print(f" Output directory: {OUTPUT_DIR}") - print(f" Model endpoint: {os.getenv('DF_MCP_MODEL_ENDPOINT', 'openai')}") + print(f" Model endpoint: {endpoint}") print(f" Model name: {os.getenv('DF_MCP_MODEL_NAME', 'gpt-4o')}") + if endpoint == "azure": + print(f" API base: {os.getenv('DF_MCP_API_BASE', '(not set)')}") + print(f" Auth: Azure AD (DefaultAzureCredential)") - # Check for API key - endpoint = os.getenv("DF_MCP_MODEL_ENDPOINT", "openai") + # Check for API key (not required for Azure AD auth) api_key = os.getenv("DF_MCP_API_KEY", os.getenv(f"{endpoint.upper()}_API_KEY", "")) - if not api_key: - print(f"\n⚠️ No API key found! Set one of:") + if not api_key and endpoint not in ("azure", "ollama"): + print(f"\n⚠️ No API key found for endpoint '{endpoint}'! 
Set one of:") print(f" export DF_MCP_API_KEY='your-key'") print(f" export {endpoint.upper()}_API_KEY='your-key'") print(f" (or set them in api-keys.env)") + print(f"\n For Azure OpenAI with AD auth (no key needed):") + print(f" export DF_MCP_MODEL_ENDPOINT=azure") + print(f" export DF_MCP_API_BASE=https://YOUR_RESOURCE.openai.azure.com/") sys.exit(1) if args.demo == "1": diff --git a/pyproject.toml b/pyproject.toml index e52ffb10..02300e56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "yfinance", "connectorx>=0.4.5", "pyarrow>=23.0.0", + "requests", "mcp>=1.26.0", ] diff --git a/uv.lock b/uv.lock index 473ea937..e5bfc3a1 100644 --- a/uv.lock +++ b/uv.lock @@ -740,6 +740,7 @@ dependencies = [ { name = "pymysql" }, { name = "pyodbc" }, { name = "python-dotenv" }, + { name = "requests" }, { name = "scikit-learn" }, { name = "vega-datasets" }, { name = "vl-convert-python" }, @@ -779,6 +780,7 @@ requires-dist = [ { name = "pymysql" }, { name = "pyodbc" }, { name = "python-dotenv" }, + { name = "requests" }, { name = "scikit-learn" }, { name = "vega-datasets" }, { name = "vl-convert-python" }, From d5fc9db658ebb0ab66789fd8e13c8f6581e970bc Mon Sep 17 00:00:00 2001 From: Andres Date: Fri, 13 Feb 2026 07:58:52 -0800 Subject: [PATCH 5/6] case fix --- src/app/App.tsx | 2 +- src/app/dfSlice.tsx | 4 ++-- src/app/tableThunks.ts | 2 +- src/app/useDataRefresh.tsx | 2 +- src/app/utils.tsx | 4 ++-- src/data/utils.ts | 2 +- src/views/ChartRecBox.tsx | 4 ++-- src/views/ChartRenderService.tsx | 2 +- src/views/ChartifactDialog.tsx | 2 +- src/views/ConceptCard.tsx | 4 ++-- src/views/ConceptShelf.tsx | 2 +- src/views/DBTableManager.tsx | 2 +- src/views/DataLoadingThread.tsx | 2 +- src/views/DataThread.tsx | 2 +- src/views/DataThreadCards.tsx | 2 +- src/views/DataView.tsx | 2 +- src/views/EncodingBox.tsx | 2 +- src/views/EncodingShelfCard.tsx | 4 ++-- src/views/EncodingShelfThread.tsx | 4 ++-- src/views/MultiTablePreview.tsx | 2 +- src/views/RefreshDataDialog.tsx | 2 +- src/views/ReportView.tsx | 2 +- src/views/SelectableDataGrid.tsx | 2 +- src/views/TableSelectionView.tsx | 2 +- src/views/UnifiedDataUploadDialog.tsx | 2 +- src/views/ViewUtils.tsx | 4 ++-- src/views/VisualizationView.tsx | 4 ++-- 27 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/app/App.tsx b/src/app/App.tsx index 09d65890..6ed6b5e5 100644 --- a/src/app/App.tsx +++ b/src/app/App.tsx @@ -66,7 +66,7 @@ import { import { About } from '../views/About'; import { MessageSnackbar } from '../views/MessageSnackbar'; import { ChartRenderService } from '../views/ChartRenderService'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { AppDispatch } from './store'; import dfLogo from '../assets/df-logo.png'; import { ModelSelectionButton } from '../views/ModelSelectionDialog'; diff --git a/src/app/dfSlice.tsx b/src/app/dfSlice.tsx index 46b4f079..5f82d4f2 100644 --- a/src/app/dfSlice.tsx +++ b/src/app/dfSlice.tsx @@ -2,9 +2,9 @@ // Licensed under the MIT License. 
import { createAsyncThunk, createSlice, PayloadAction, createSelector } from '@reduxjs/toolkit' -import { Channel, Chart, ChartTemplate, DataCleanBlock, DataSourceConfig, EncodingItem, EncodingMap, FieldItem, Trigger } from '../components/ComponentType' +import { Channel, Chart, ChartTemplate, DataCleanBlock, DataSourceConfig, EncodingItem, EncodingMap, FieldItem, Trigger } from '../components/componentType' import { enableMapSet } from 'immer'; -import { DictTable } from "../components/ComponentType"; +import { DictTable } from "../components/componentType"; import { Message } from '../views/MessageSnackbar'; import { getChartTemplate, getChartChannels } from "../components/ChartTemplates" import { recommendEncodings } from '../components/chartUtils'; diff --git a/src/app/tableThunks.ts b/src/app/tableThunks.ts index 4d8f3a61..e79b5e62 100644 --- a/src/app/tableThunks.ts +++ b/src/app/tableThunks.ts @@ -13,7 +13,7 @@ */ import { createAsyncThunk } from '@reduxjs/toolkit'; -import { DataSourceConfig, DictTable } from '../components/ComponentType'; +import { DataSourceConfig, DictTable } from '../components/componentType'; import { Type } from '../data/types'; import { inferTypeFromValueArray } from '../data/utils'; import { fetchWithIdentity, getUrls, computeContentHash } from './utils'; diff --git a/src/app/useDataRefresh.tsx b/src/app/useDataRefresh.tsx index c37c8751..b33731d9 100644 --- a/src/app/useDataRefresh.tsx +++ b/src/app/useDataRefresh.tsx @@ -5,7 +5,7 @@ import { useEffect, useRef, useCallback } from 'react'; import { useDispatch, useSelector } from 'react-redux'; import { DataFormulatorState, dfActions, selectRefreshConfigs } from './dfSlice'; import { AppDispatch } from './store'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { createTableFromText } from '../data/utils'; import { fetchWithIdentity, getUrls, computeContentHash } from './utils'; diff --git a/src/app/utils.tsx b/src/app/utils.tsx index a8eb7b5d..5334d0ef 100644 --- a/src/app/utils.tsx +++ b/src/app/utils.tsx @@ -5,8 +5,8 @@ import _, { } from "lodash"; import { useEffect, useRef } from "react"; import ts from "typescript"; import { ChannelGroups, getChartChannels, getChartTemplate } from "../components/ChartTemplates"; -import { Channel, Chart, ChartTemplate, ConceptTransformation, EncodingItem, EncodingMap, FieldItem, Trigger } from "../components/ComponentType"; -import { DictTable } from "../components/ComponentType"; +import { Channel, Chart, ChartTemplate, ConceptTransformation, EncodingItem, EncodingMap, FieldItem, Trigger } from "../components/componentType"; +import { DictTable } from "../components/componentType"; import { getDType, Type } from "../data/types"; import * as d3 from 'd3'; import { diff --git a/src/data/utils.ts b/src/data/utils.ts index 1b98deb8..775f9058 100644 --- a/src/data/utils.ts +++ b/src/data/utils.ts @@ -5,7 +5,7 @@ import * as d3 from 'd3'; import Column from './column'; import * as ExcelJS from 'exceljs'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { CoerceType, TestType, Type } from './types'; import { ColumnTable } from './table'; diff --git a/src/views/ChartRecBox.tsx b/src/views/ChartRecBox.tsx index d4cbb3b4..44002366 100644 --- a/src/views/ChartRecBox.tsx +++ b/src/views/ChartRecBox.tsx @@ -37,12 +37,12 @@ import { import React from 'react'; -import { Chart, FieldItem } from "../components/ComponentType"; +import { 
Chart, FieldItem } from "../components/componentType"; import _ from 'lodash'; import '../scss/EncodingShelf.scss'; -import { createDictTable, DictTable } from "../components/ComponentType"; +import { createDictTable, DictTable } from "../components/componentType"; import { getUrls, getTriggers, resolveRecommendedChart, fetchWithIdentity } from '../app/utils'; diff --git a/src/views/ChartRenderService.tsx b/src/views/ChartRenderService.tsx index f9b7a2af..3a24eada 100644 --- a/src/views/ChartRenderService.tsx +++ b/src/views/ChartRenderService.tsx @@ -20,7 +20,7 @@ import { FC, useEffect, useRef, useCallback } from 'react'; import { useSelector, useDispatch } from 'react-redux'; import { DataFormulatorState, dfActions, dfSelectors } from '../app/dfSlice'; -import { Chart, DictTable, FieldItem } from '../components/ComponentType'; +import { Chart, DictTable, FieldItem } from '../components/componentType'; import { assembleVegaChart, prepVisTable } from '../app/utils'; import { getDataTable, checkChartAvailability } from './VisualizationView'; import { getCachedChart, setCachedChart, computeCacheKey, invalidateChart, ChartCacheEntry } from '../app/chartCache'; diff --git a/src/views/ChartifactDialog.tsx b/src/views/ChartifactDialog.tsx index 4867291b..2293596f 100644 --- a/src/views/ChartifactDialog.tsx +++ b/src/views/ChartifactDialog.tsx @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -import { Chart, DictTable, FieldItem } from '../components/ComponentType'; +import { Chart, DictTable, FieldItem } from '../components/componentType'; import { assembleVegaChart, prepVisTable } from '../app/utils'; import { exportTableToDsv } from '../data/utils'; import { ClientConfig } from '../app/dfSlice'; diff --git a/src/views/ConceptCard.tsx b/src/views/ConceptCard.tsx index 82729bb6..283f4bee 100644 --- a/src/views/ConceptCard.tsx +++ b/src/views/ConceptCard.tsx @@ -30,7 +30,7 @@ import HideSourceIcon from '@mui/icons-material/HideSource'; import ArrowRightIcon from '@mui/icons-material/ArrowRight'; import AnimateHeight from 'react-animate-height'; -import { FieldItem, ConceptTransformation, duplicateField, FieldSource } from '../components/ComponentType'; +import { FieldItem, ConceptTransformation, duplicateField, FieldSource } from '../components/componentType'; import { testType, Type, TypeList } from "../data/types"; import React from 'react'; @@ -41,7 +41,7 @@ import { getIconFromType } from './ViewUtils'; import _ from 'lodash'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { CodeBox } from './VisualizationView'; import { CustomReactTable } from './ReactTable'; import { alpha } from '@mui/material/styles'; diff --git a/src/views/ConceptShelf.tsx b/src/views/ConceptShelf.tsx index 6272d98e..65b661f6 100644 --- a/src/views/ConceptShelf.tsx +++ b/src/views/ConceptShelf.tsx @@ -19,7 +19,7 @@ import { import CleaningServicesIcon from '@mui/icons-material/CleaningServices'; -import { FieldItem, Channel } from '../components/ComponentType'; +import { FieldItem, Channel } from '../components/componentType'; import React from 'react'; import { DataFormulatorState, dfActions, dfSelectors } from '../app/dfSlice'; diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index 757dbc37..52baa243 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -50,7 +50,7 @@ type TableImportConfig = import { getUrls, fetchWithIdentity } from '../app/utils'; 
import { borderColor } from '../app/tokens'; import { CustomReactTable } from './ReactTable'; -import { DataSourceConfig, DictTable } from '../components/ComponentType'; +import { DataSourceConfig, DictTable } from '../components/componentType'; import { Type } from '../data/types'; import { useDispatch, useSelector } from 'react-redux'; import { dfActions, dfSelectors } from '../app/dfSlice'; diff --git a/src/views/DataLoadingThread.tsx b/src/views/DataLoadingThread.tsx index eecb34dd..7d756e70 100644 --- a/src/views/DataLoadingThread.tsx +++ b/src/views/DataLoadingThread.tsx @@ -29,7 +29,7 @@ import { useDispatch, useSelector } from 'react-redux'; import { AppDispatch } from '../app/store'; import { DataFormulatorState, dfActions, dfSelectors, fetchFieldSemanticType } from '../app/dfSlice'; import { borderColor, shadow, transition, radius } from '../app/tokens'; -import { DataCleanBlock, DataCleanTableOutput } from '../components/ComponentType'; +import { DataCleanBlock, DataCleanTableOutput } from '../components/componentType'; import { getUrls, fetchWithIdentity } from '../app/utils'; import { CustomReactTable } from './ReactTable'; import { createTableFromText } from '../data/utils'; diff --git a/src/views/DataThread.tsx b/src/views/DataThread.tsx index 4e164545..82862eed 100644 --- a/src/views/DataThread.tsx +++ b/src/views/DataThread.tsx @@ -32,7 +32,7 @@ import '../scss/VisualizationView.scss'; import { batch, useDispatch, useSelector } from 'react-redux'; import { DataFormulatorState, dfActions, SSEMessage } from '../app/dfSlice'; import { getTriggers } from '../app/utils'; -import { Chart, DictTable, Trigger } from "../components/ComponentType"; +import { Chart, DictTable, Trigger } from "../components/componentType"; import DeleteIcon from '@mui/icons-material/Delete'; import StarIcon from '@mui/icons-material/Star'; diff --git a/src/views/DataThreadCards.tsx b/src/views/DataThreadCards.tsx index d1f7c838..058b4793 100644 --- a/src/views/DataThreadCards.tsx +++ b/src/views/DataThreadCards.tsx @@ -16,7 +16,7 @@ import { } from '@mui/material'; import { dfActions } from '../app/dfSlice'; -import { Chart, DictTable, Trigger } from "../components/ComponentType"; +import { Chart, DictTable, Trigger } from "../components/componentType"; import DeleteIcon from '@mui/icons-material/Delete'; import AddchartIcon from '@mui/icons-material/Addchart'; diff --git a/src/views/DataView.tsx b/src/views/DataView.tsx index 570b77b1..e08487f3 100644 --- a/src/views/DataView.tsx +++ b/src/views/DataView.tsx @@ -10,7 +10,7 @@ import { alpha } from '@mui/material/styles'; import '../scss/DataView.scss'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { DataFormulatorState, dfActions, dfSelectors } from '../app/dfSlice'; import { useDispatch, useSelector } from 'react-redux'; import { Type } from '../data/types'; diff --git a/src/views/EncodingBox.tsx b/src/views/EncodingBox.tsx index 03fd500e..3cf79be8 100644 --- a/src/views/EncodingBox.tsx +++ b/src/views/EncodingBox.tsx @@ -42,7 +42,7 @@ import CalendarMonthIcon from '@mui/icons-material/CalendarMonth'; import QuestionMarkIcon from '@mui/icons-material/QuestionMark'; import { FieldItem, Channel, EncodingItem, AggrOp, AGGR_OP_LIST, - ConceptTransformation, Chart, duplicateField } from "../components/ComponentType"; + ConceptTransformation, Chart, duplicateField } from "../components/componentType"; import { EncodingDropResult } from "../views/ConceptShelf"; import _ from 
'lodash'; diff --git a/src/views/EncodingShelfCard.tsx b/src/views/EncodingShelfCard.tsx index 282cc8b6..a0f2b335 100644 --- a/src/views/EncodingShelfCard.tsx +++ b/src/views/EncodingShelfCard.tsx @@ -39,12 +39,12 @@ import { import React from 'react'; import { ThinkingBufferEffect } from '../components/FunComponents'; -import { Channel, Chart, FieldItem, Trigger, duplicateChart } from "../components/ComponentType"; +import { Channel, Chart, FieldItem, Trigger, duplicateChart } from "../components/componentType"; import _ from 'lodash'; import '../scss/EncodingShelf.scss'; -import { createDictTable, DictTable } from "../components/ComponentType"; +import { createDictTable, DictTable } from "../components/componentType"; import { getUrls, resolveChartFields, getTriggers, assembleVegaChart, resolveRecommendedChart, fetchWithIdentity } from '../app/utils'; import { EncodingBox } from './EncodingBox'; diff --git a/src/views/EncodingShelfThread.tsx b/src/views/EncodingShelfThread.tsx index 76af21ad..49baa9bc 100644 --- a/src/views/EncodingShelfThread.tsx +++ b/src/views/EncodingShelfThread.tsx @@ -16,11 +16,11 @@ import { import React from 'react'; -import { Chart, Trigger } from "../components/ComponentType"; +import { Chart, Trigger } from "../components/componentType"; import '../scss/EncodingShelf.scss'; -import { DictTable } from "../components/ComponentType"; +import { DictTable } from "../components/componentType"; import { Type } from '../data/types'; import { getTriggers } from '../app/utils'; diff --git a/src/views/MultiTablePreview.tsx b/src/views/MultiTablePreview.tsx index 55a17c11..74fc4d66 100644 --- a/src/views/MultiTablePreview.tsx +++ b/src/views/MultiTablePreview.tsx @@ -14,7 +14,7 @@ import { Card, } from '@mui/material'; import DeleteIcon from '@mui/icons-material/Delete'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { CustomReactTable } from './ReactTable'; export interface MultiTablePreviewProps { diff --git a/src/views/RefreshDataDialog.tsx b/src/views/RefreshDataDialog.tsx index 063f7323..0462c306 100644 --- a/src/views/RefreshDataDialog.tsx +++ b/src/views/RefreshDataDialog.tsx @@ -25,7 +25,7 @@ import CloseIcon from '@mui/icons-material/Close'; import UploadFileIcon from '@mui/icons-material/UploadFile'; import { useSelector } from 'react-redux'; import { DataFormulatorState } from '../app/dfSlice'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { createTableFromText, loadTextDataWrapper, loadBinaryDataWrapper } from '../data/utils'; interface TabPanelProps { diff --git a/src/views/ReportView.tsx b/src/views/ReportView.tsx index 9477dd5c..5e7b5b20 100644 --- a/src/views/ReportView.tsx +++ b/src/views/ReportView.tsx @@ -45,7 +45,7 @@ import { getUrls, assembleVegaChart, getTriggers, prepVisTable, fetchWithIdentit import { MuiMarkdown, getOverrides } from 'mui-markdown'; import embed from 'vega-embed'; import { getDataTable } from './VisualizationView'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; import { AppDispatch } from '../app/store'; import { Collapse } from '@mui/material'; import ExpandMoreIcon from '@mui/icons-material/ExpandMore'; diff --git a/src/views/SelectableDataGrid.tsx b/src/views/SelectableDataGrid.tsx index 24b86667..8e8e4cf6 100644 --- a/src/views/SelectableDataGrid.tsx +++ b/src/views/SelectableDataGrid.tsx @@ -21,7 +21,7 @@ 
import { getIconFromType } from './ViewUtils'; import { IconButton, TableSortLabel, Typography } from '@mui/material'; import _ from 'lodash'; -import { FieldSource, FieldItem } from '../components/ComponentType'; +import { FieldSource, FieldItem } from '../components/componentType'; import FileDownloadIcon from '@mui/icons-material/FileDownload'; import { TableIcon } from '../icons'; diff --git a/src/views/TableSelectionView.tsx b/src/views/TableSelectionView.tsx index c25fadbc..8045cf22 100644 --- a/src/views/TableSelectionView.tsx +++ b/src/views/TableSelectionView.tsx @@ -11,7 +11,7 @@ import { borderColor } from '../app/tokens'; import { StreamIcon } from '../icons'; import { createTableFromFromObjectArray } from '../data/utils'; import { MultiTablePreview } from './MultiTablePreview'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; // Update the interface to support multiple tables per dataset export interface DatasetMetadata { diff --git a/src/views/UnifiedDataUploadDialog.tsx b/src/views/UnifiedDataUploadDialog.tsx index 8023eb18..d92f61e0 100644 --- a/src/views/UnifiedDataUploadDialog.tsx +++ b/src/views/UnifiedDataUploadDialog.tsx @@ -37,7 +37,7 @@ import { useDispatch, useSelector } from 'react-redux'; import { DataFormulatorState, dfActions, fetchFieldSemanticType } from '../app/dfSlice'; import { AppDispatch } from '../app/store'; import { loadTable } from '../app/tableThunks'; -import { DataSourceConfig, DictTable } from '../components/ComponentType'; +import { DataSourceConfig, DictTable } from '../components/componentType'; import { createTableFromFromObjectArray, createTableFromText, loadTextDataWrapper, loadBinaryDataWrapper } from '../data/utils'; import { DataLoadingChat } from './DataLoadingChat'; import { DatasetSelectionView, DatasetMetadata } from './TableSelectionView'; diff --git a/src/views/ViewUtils.tsx b/src/views/ViewUtils.tsx index 3d654d32..aada1c98 100644 --- a/src/views/ViewUtils.tsx +++ b/src/views/ViewUtils.tsx @@ -4,7 +4,7 @@ import React from "react"; import ts from "typescript"; import { runCodeOnInputListsInVM } from "../app/utils"; -import { ConceptTransformation, FieldItem } from "../components/ComponentType"; +import { ConceptTransformation, FieldItem } from "../components/componentType"; import { Type } from "../data/types"; import { BooleanIcon, NumericalIcon, StringIcon, DateIcon, UnknownIcon } from '../icons'; @@ -12,7 +12,7 @@ import AutoFixHighIcon from '@mui/icons-material/AutoFixHigh'; import BarChartIcon from '@mui/icons-material/BarChart'; import CommitIcon from '@mui/icons-material/Commit'; -import { DictTable } from '../components/ComponentType'; +import { DictTable } from '../components/componentType'; export const groupConceptItems = (conceptShelfItems: FieldItem[], tables: DictTable[]) => { // group concepts based on which source table they belongs to diff --git a/src/views/VisualizationView.tsx b/src/views/VisualizationView.tsx index da0a67f0..acaa261b 100644 --- a/src/views/VisualizationView.tsx +++ b/src/views/VisualizationView.tsx @@ -44,8 +44,8 @@ import '../scss/VisualizationView.scss'; import { useDispatch, useSelector } from 'react-redux'; import { DataFormulatorState, dfActions } from '../app/dfSlice'; import { assembleVegaChart, extractFieldsFromEncodingMap, getUrls, prepVisTable, fetchWithIdentity } from '../app/utils'; -import { Chart, EncodingItem, EncodingMap, FieldItem } from '../components/ComponentType'; -import { DictTable } from 
"../components/ComponentType"; +import { Chart, EncodingItem, EncodingMap, FieldItem } from '../components/componentType'; +import { DictTable } from "../components/componentType"; import AddchartIcon from '@mui/icons-material/Addchart'; import DeleteIcon from '@mui/icons-material/Delete'; From 982b1145649c8dc8e5a489b398b7f1ad175cb645 Mon Sep 17 00:00:00 2001 From: Andres Date: Fri, 13 Feb 2026 08:03:32 -0800 Subject: [PATCH 6/6] StreamableHTTP server support --- py-src/data_formulator/mcp_server.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/py-src/data_formulator/mcp_server.py b/py-src/data_formulator/mcp_server.py index 72c26ae8..4d0d6a1c 100644 --- a/py-src/data_formulator/mcp_server.py +++ b/py-src/data_formulator/mcp_server.py @@ -154,6 +154,7 @@ load_dotenv(os.path.join(Path(__file__).parent, 'api-keys.env')) from mcp.server.fastmcp import FastMCP +from mcp.server.transport_security import TransportSecuritySettings from data_formulator.agents.client_utils import Client from data_formulator.agents.agent_data_rec import DataRecAgent @@ -326,6 +327,11 @@ def _make_chart_image( "Use list_demo_data to browse available demo datasets, then pass their " "URLs to visualize_data, explore_data, or create_chart." ), + stateless_http=True, # each HTTP request is independent; no session affinity needed + json_response=True, # all responses are JSON-serializable dicts + transport_security=TransportSecuritySettings( + enable_dns_rebinding_protection=False, # https://github.com/modelcontextprotocol/python-sdk/issues/1798 + ) ) @@ -797,3 +803,25 @@ def main(): if __name__ == "__main__": main() +else: + + # See https://github.com/modelcontextprotocol/python-sdk?tab=readme-ov-file#streamablehttp-servers + + from starlette.applications import Starlette + from starlette.routing import Mount + import contextlib + + # Create a lifespan context manager to run the session manager + @contextlib.asynccontextmanager + async def lifespan(app: Starlette): + async with mcp.session_manager.run(): + yield + + + # Mount the StreamableHTTP server to the existing ASGI server + app = Starlette( + routes=[ + Mount("/", app=mcp.streamable_http_app()), + ], + lifespan=lifespan, + ) \ No newline at end of file