diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index 8d796c4..89a5e7a 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -2,9 +2,7 @@ name: Pre-commit checks on: push: - branches: [main] pull_request: - branches: [main] jobs: pre-commit: diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 55c541d..73134f1 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -2,7 +2,6 @@ name: Tests on: push: - branches: [main] pull_request: jobs: diff --git a/README.md b/README.md index f11204d..a6bfdc6 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# M3: MIMIC-IV + MCP + Models šŸ„šŸ¤– +# M3: Medical Datasets ↔ MCP ↔ Models šŸ„šŸ¤–
M3 Logo
-> **Query MIMIC-IV medical data using natural language through MCP clients** +> **Query tabular PhysioNet medical data using natural language through MCP clients** Python MCP @@ -12,23 +12,42 @@ Code Quality PRs Welcome -Transform medical data analysis with AI! Ask questions about MIMIC-IV data in plain English and get instant insights. Choose between local demo data (free) or full cloud dataset (BigQuery). +Transform medical data analysis with AI! Ask questions about MIMIC-IV and other PhysioNet datasets in plain English and get instant insights. Choose between local data (free) or full cloud dataset (BigQuery). + +## šŸ’” How It Works + +M3 acts as a bridge between your **AI Client** (like Claude Desktop, Cursor, or LibreChat) and your medical data. + +1. **You** ask a question in your chat interface: *"How many patients in the ICU have high blood pressure?"* +2. **M3** securely translates this into a database query. +3. **M3** runs the query on your local or cloud data. +4. **The LLM** explains the results to you in plain English. + +*No SQL knowledge required.* ## Features -- šŸ” **Natural Language Queries**: Ask questions about MIMIC-IV data in plain English -- šŸ  **Local DuckDB + Parquet**: Fast local queries for demo and full dataset using Parquet files with DuckDB views +- šŸ” **Natural Language Queries**: Ask questions about your medical data in plain English +- šŸ  **Modular Datasets**: Support for any tabular PhysioNet dataset (MIMIC-IV, etc.) 
+- šŸ“‚ **Local DuckDB + Parquet**: Fast local queries using Parquet files with DuckDB views - ā˜ļø **BigQuery Support**: Access full MIMIC-IV dataset on Google Cloud - šŸ”’ **Enterprise Security**: OAuth2 authentication with JWT tokens and rate limiting - šŸ›”ļø **SQL Injection Protection**: Read-only queries with comprehensive validation +- 🧩 **Extensible Architecture**: Easily add new custom datasets via configuration or CLI ## šŸš€ Quick Start -> šŸ“ŗ **Prefer video tutorials?** Check out [step-by-step video guides](https://rafiattrach.github.io/m3/) covering setup, PhysioNet configuration, and more. +> **New to this?** šŸ“ŗ [Watch our 5-minute setup video](https://rafiattrach.github.io/m3/) to see it in action. + +### Prerequisites +You need an **MCP-compatible Client** to use M3. Popular options include: +- [Claude for Desktop](https://claude.ai/download) +- [Cursor](https://cursor.com) +- [LibreChat](https://www.librechat.ai/) -### Install uv (required for `uvx`) +### 1. Install `uv` (Required) -We use `uvx` to run the MCP server. Install `uv` from the official installer, then verify with `uv --version`. +We use `uvx` to run the MCP server efficiently. **macOS and Linux:** ```bash @@ -40,86 +59,87 @@ curl -LsSf https://astral.sh/uv/install.sh | sh powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" ``` -Verify installation: -```bash -uv --version -``` +### 2. Choose Your Data Source -### BigQuery Setup (Optional - Full Dataset) +Select **Option A** (Local) or **Option B** (Cloud). -**Skip this if using DuckDB demo database.** +#### Option A: Local Dataset (Free & Fast) +*Best for development, testing, and offline use.* -1. **Install Google Cloud SDK:** - - macOS: `brew install google-cloud-sdk` - - Windows/Linux: https://cloud.google.com/sdk/docs/install +1. **Create project directory:** + ```bash + mkdir m3 && cd m3 + ``` -2. 
**Authenticate:** - ```bash - gcloud auth application-default login - ``` - *Opens your browser - choose the Google account with BigQuery access to MIMIC-IV.* +2. **Initialize Dataset:** -### M3 Initialization + We will use MIMIC-IV as an example. -**Supported clients:** [Claude Desktop](https://www.claude.com/download), [Cursor](https://cursor.com/download), [Goose](https://block.github.io/goose/), and [more](https://github.com/punkpeye/awesome-mcp-clients). + **For Demo (Auto-download ~16MB):** + ```bash + uv init && uv add m3-mcp + uv run m3 init mimic-iv-demo + ``` - - - - - -
+ **For Full Data (Requires Manual Download):** + *Download CSVs from [PhysioNet](https://physionet.org/content/mimiciv/3.1/) first and place them in `m3_data/raw_files/mimic-iv-full/`.* + ```bash + uv init && uv add m3-mcp + uv run m3 init mimic-iv-full + ``` + *This can take 5-15 minutes depending on your machine* -**DuckDB (Demo or Full Dataset)** +3. **Configure Your Client:** + **For Claude Desktop (Shortcut):** + ```bash + uv run m3 config claude --quick + ``` -To create a m3 directory and navigate into it run: -```shell -mkdir m3 && cd m3 -``` -If you want to use the full dataset, download it manually from [PhysioNet](https://physionet.org/content/mimiciv/3.1/) and place it into `m3/m3_data/raw`. For using the demo set you can continue and run: + **For Other Clients (Cursor, LibreChat, etc.):** + ```bash + uv run m3 config --quick + ``` + *This generates the configuration JSON you need to paste into your client's settings.* -```shell -uv init && uv add m3-mcp && \ -uv run m3 init DATASET_NAME && uv run m3 config --quick -``` -Replace `DATASET_NAME` with `mimic-iv-demo` or `mimic-iv-full` and copy & paste the output of this command into your client config JSON file. +#### Option B: BigQuery (Full Cloud Dataset) +*Best for researchers with Google Cloud access.* -*Demo dataset (16MB raw download size) downloads automatically on first query.* +1. **Authenticate with Google:** + ```bash + gcloud auth application-default login + ``` -*Full dataset (10.6GB raw download size) needs to be downloaded manually.* +2. **Configure Client:** + ```bash + uv run m3 config --backend bigquery --project-id BIGQUERY_PROJECT_ID + ``` + *This also generates the configuration JSON you need to paste into your client's settings.* - -**BigQuery (Full Dataset)** -Requires GCP credentials and PhysioNet access. +### 3. Start Asking Questions! +Restart your MCP client and try: +- "What tools do you have for MIMIC-IV data?" 
+- "Show me patient demographics from the ICU" +- "What is the race distribution in admissions?" -Paste this into your client config JSON file: +--- -```json -{ - "mcpServers": { - "m3": { - "command": "uvx", - "args": ["m3-mcp"], - "env": { - "M3_BACKEND": "bigquery", - "M3_PROJECT_ID": "your-project-id" - } - } - } -} -``` +## šŸ”„ Managing Datasets -*Replace `your-project-id` with your Google Cloud project ID.* +Switch between available datasets instantly: -
+```bash +# Switch to full dataset +m3 use mimic-iv-full -**That's it!** Restart your MCP client and ask: -- "What tools do you have for MIMIC-IV data?" -- "Show me patient demographics from the ICU" -- "What is the race distribution in admissions?" +# Switch back to demo +m3 use mimic-iv-demo + +# Check status +m3 status +``` --- @@ -129,17 +149,47 @@ Paste this into your client config JSON file: |---------|---------------|---------------|-----------------| | **Cost** | Free | Free | BigQuery usage fees | | **Setup** | Zero config | Manual Download | GCP credentials required | -| **Data Size** | 100 patients, 275 admissions | 365k patients, 546k admissions | 365k patients, 546k admissions | +| **Credentials** | Not required | PhysioNet | PhysioNet | +| **Data Size** | 100 patients | 365k patients | 365k patients | | **Speed** | Fast (local) | Fast (local) | Network latency | -| **Use Case** | Learning, development | Research (local) | Research, production | +| **Use Case** | Learning | Research (local) | Research, production | + +--- + +## āž• Adding Custom Datasets + +M3 is designed to be modular. You can add support for any tabular dataset on PhysioNet easily. Let's take eICU as an example: + +### JSON Definition Method + +1. Create a definition file: `m3_data/datasets/eicu.json` + ```json + { + "name": "eicu", + "description": "eICU Collaborative Research Database", + "file_listing_url": "https://physionet.org/files/eicu-crd/2.0/", + "subdirectories_to_scan": [], + "primary_verification_table": "eicu_crd_patient", + "tags": ["clinical", "eicu"], + "requires_authentication": true, + "bigquery_project_id": "physionet-data", + "bigquery_dataset_ids": ["eicu_crd"] + } + ``` + +2. Initialize it: + ```bash + m3 init eicu --src /path/to/raw/csvs + ``` + *M3 will convert CSVs to Parquet and create DuckDB views automatically.* --- ## Alternative Installation Methods -> Already have Docker or prefer pip? 
Here are other ways to run m3: +> Already have Docker or prefer pip? -### 🐳 Docker (No Python Required) +### 🐳 Docker @@ -182,385 +232,100 @@ docker run -d --name m3-server \ } ``` -Stop: `docker stop m3-server && docker rm m3-server` - -### pip Install + CLI Tools +### pip Install ```bash pip install m3-mcp -``` - -> šŸ’” **CLI commands:** Run `m3 --help` to see all available options. - -**Useful CLI commands:** -- `m3 init mimic-iv-demo` - Download demo database -- `m3 config` - Generate MCP configuration interactively -- `m3 config claude --backend bigquery --project-id YOUR_PROJECT_ID` - Quick BigQuery setup - -**Example MCP config:** -```json -{ - "mcpServers": { - "m3": { - "command": "m3-mcp-server", - "env": { - "M3_BACKEND": "duckdb" - } - } - } -} +m3 config --quick ``` ### Local Development For contributors: -```bash -git clone https://github.com/rafiattrach/m3.git && cd m3 -python -m venv .venv -source .venv/bin/activate # Windows: .venv\Scripts\activate -pip install -e ".[dev]" -pre-commit install -``` - -**MCP config:** -```json -{ - "mcpServers": { - "m3": { - "command": "/path/to/m3/.venv/bin/python", - "args": ["-m", "m3.mcp_server"], - "cwd": "/path/to/m3", - "env": { - "M3_BACKEND": "duckdb" +1. **Clone & Install (using `uv`):** + ```bash + git clone https://github.com/rafiattrach/m3.git + cd m3 + uv venv + uv sync + ``` + +2. **MCP Config:** + ```json + { + "mcpServers": { + "m3": { + "command": "/absolute/path/to/m3/.venv/bin/python", + "args": ["-m", "m3.mcp_server"], + "cwd": "/absolute/path/to/m3", + "env": { "M3_BACKEND": "duckdb" } + } } } - } -} -``` - -#### Using `UV` (Recommended) -Assuming you have [UV](https://docs.astral.sh/uv/getting-started/installation/) installed. 
- -**Step 1: Clone and Navigate** -```bash -# Clone the repository -git clone https://github.com/rafiattrach/m3.git -cd m3 -``` - -**Step 2: Create `UV` Virtual Environment** -```bash -# Create virtual environment -uv venv -``` - -**Step 3: Install M3** -```bash -uv sync -# Do not forget to use `uv run` to any subsequent commands to ensure you're using the `uv` virtual environment -``` - -### šŸ—„ļø Database Configuration - -After installation, choose your data source: - -#### Option A: Local Demo (DuckDB + Parquet) - -**Perfect for learning and development - completely free!** - -1. **Initialize demo dataset**: - ```bash - m3 init mimic-iv-demo - ``` - -2. **Setup MCP Client**: - ```bash - m3 config - ``` - - *Alternative: For Claude Desktop specifically:* - ```bash - m3 config claude --backend duckdb --db-path /Users/you/path/to/m3_data/databases/mimic_iv_demo.duckdb - ``` - -5. **Restart your MCP client** and ask: - - - "What tools do you have for MIMIC-IV data?" - - "Show me patient demographics from the ICU" - -#### Option B: Local Full Dataset (DuckDB + Parquet) - -**Run the entire MIMIC-IV dataset locally with DuckDB views over Parquet.** - -1. **Acquire CSVs** (requires PhysioNet credentials): - - Download the official MIMIC-IV CSVs from PhysioNet and place them under: - - `/Users/you/path/to/m3/m3_data/raw_files/mimic-iv-full/hosp/` - - `/Users/you/path/to/m3/m3_data/raw_files/mimic-iv-full/icu/` - - Note: `m3 init`'s auto-download function currently only supports the demo dataset. Use your browser or `wget` to obtain the full dataset. - -2. **Initialize full dataset**: - ```bash - m3 init mimic-iv-full - ``` - - This may take up to 30 minutes, depending on your system (e.g. 
10 minutes for MacBook Pro M3) - - Performance knobs (optional): - ```bash - export M3_CONVERT_MAX_WORKERS=6 # number of parallel files (default=4) - export M3_DUCKDB_MEM=4GB # DuckDB memory limit per worker (default=3GB) - export M3_DUCKDB_THREADS=4 # DuckDB threads per worker (default=2) - ``` - Pay attention to your system specifications, especially if you have enough memory. - -3. **Select dataset and verify**: - ```bash - m3 use full # optional, as this automatically got set to full - m3 status - ``` - - Status prints active dataset, local DB path, Parquet presence, quick row counts and total Parquet size. - -4. **Configure MCP client** (uses the full local DB): - ```bash - m3 config - # or - m3 config claude --backend duckdb --db-path /Users/you/path/to/m3/m3_data/databases/mimic_iv_full.duckdb - ``` - -#### Option C: BigQuery (Full Dataset) - -**For researchers needing complete MIMIC-IV data** - -##### Prerequisites -- Google Cloud account and project with billing enabled -- Access to MIMIC-IV on BigQuery (requires PhysioNet credentialing) - -##### Setup Steps + ``` -1. **Install Google Cloud CLI**: - - **macOS (with Homebrew):** - ```bash - brew install google-cloud-sdk - ``` - - **Windows:** Download from https://cloud.google.com/sdk/docs/install - - **Linux:** - ```bash - curl https://sdk.cloud.google.com | bash - ``` - -2. **Authenticate**: - ```bash - gcloud auth application-default login - ``` - *This will open your browser - choose the Google account that has access to your BigQuery project with MIMIC-IV data.* - -3. **Setup MCP Client for BigQuery**: - ```bash - m3 config - ``` - - *Alternative: For Claude Desktop specifically:* - ```bash - m3 config claude --backend bigquery --project-id YOUR_PROJECT_ID - ``` - -4. **Test BigQuery Access** - Restart your MCP client and ask: - ``` - Use the get_race_distribution function to show me the top 5 races in MIMIC-IV admissions. 
- ``` +--- ## šŸ”§ Advanced Configuration -Need to configure other MCP clients or customize settings? Use these commands: - -### Interactive Configuration (Universal) +**Interactive Config Generator:** ```bash m3 config ``` -Generates configuration for any MCP client with step-by-step guidance. -### Quick Configuration Examples +**OAuth2 Authentication:** +For secure production deployments: ```bash -# Quick universal config with defaults -m3 config --quick - -# Universal config with custom DuckDB database -m3 config --quick --backend duckdb --db-path /path/to/database.duckdb - -# Save config to file for other MCP clients -m3 config --output my_config.json -``` - -### OAuth2 Authentication (Optional) - -For production deployments requiring secure access to medical data: - -```bash -# Enable OAuth2 with Claude Desktop m3 config claude --enable-oauth2 \ --oauth2-issuer https://your-auth-provider.com \ - --oauth2-audience m3-api \ - --oauth2-scopes "read:mimic-data" - -# Or configure interactively -m3 config # Choose OAuth2 option during setup + --oauth2-audience m3-api ``` - -**Supported OAuth2 Providers:** -- Auth0, Google Identity Platform, Microsoft Azure AD, Keycloak -- Any OAuth2/OpenID Connect compliant provider - -**Key Benefits:** -- šŸ”’ **JWT Token Validation**: Industry-standard security -- šŸŽÆ **Scope-based Access**: Fine-grained permissions -- šŸ›”ļø **Rate Limiting**: Abuse protection -- šŸ“Š **Audit Logging**: Security monitoring - -> šŸ“– **Complete OAuth2 Setup Guide**: See [`docs/OAUTH2_AUTHENTICATION.md`](docs/OAUTH2_AUTHENTICATION.md) for detailed configuration, troubleshooting, and production deployment guidelines. +> See [`docs/OAUTH2_AUTHENTICATION.md`](docs/OAUTH2_AUTHENTICATION.md) for details. 
--- ## šŸ› ļø Available MCP Tools -When your MCP client processes questions, it uses these tools automatically: - - **get_database_schema**: List all available tables -- **get_table_info**: Get column info and sample data for a table +- **get_table_info**: Get column info and sample data - **execute_mimic_query**: Execute SQL SELECT queries -- **get_icu_stays**: ICU stay information and length of stay data +- **get_icu_stays**: ICU stay info & length of stay - **get_lab_results**: Laboratory test results -- **get_race_distribution**: Patient race distribution +- **get_race_distribution**: Patient race statistics ## Example Prompts -Try asking your MCP client these questions: - -**Demographics & Statistics:** - -- `Prompt:` *What is the race distribution in MIMIC-IV admissions?* -- `Prompt:` *Show me patient demographics for ICU stays* -- `Prompt:` *How many total admissions are in the database?* +**Demographics:** +- *What is the race distribution in MIMIC-IV admissions?* +- *Show me patient demographics for ICU stays* **Clinical Data:** +- *Find lab results for patient X* +- *What lab tests are most commonly ordered?* -- `Prompt:` *Find lab results for patient X* -- `Prompt:` *What lab tests are most commonly ordered?* -- `Prompt:` *Show me recent ICU admissions* - -**Data Exploration:** - -- `Prompt:` *What tables are available in the database?* -- `Prompt:` *What tools do you have for MIMIC-IV data?* - -## šŸŽ© Pro Tips - -- Do you want to pre-approve the usage of all tools in Claude Desktop? Use the prompt below and then select **Always Allow** - - `Prompt:` *Can you please call all your tools in a logical sequence?* - -## šŸ” Troubleshooting - -### Common Issues - -**Local "Parquet not found" or view errors:** -Rerun the `m3 init` command for your chosen dataset. - -**MCP client server not starting:** -1. Check your MCP client logs (for Claude Desktop: Help → View Logs) -2. Verify configuration file location and format -3. 
Restart your MCP client completely - -### OAuth2 Authentication Issues - -**"Missing OAuth2 access token" errors:** -```bash -# Set your access token -export M3_OAUTH2_TOKEN="Bearer your-access-token-here" -``` - -**"OAuth2 authentication failed" errors:** -- Verify your token hasn't expired -- Check that required scopes are included in your token -- Ensure your OAuth2 provider configuration is correct - -**Rate limit exceeded:** -- Wait for the rate limit window to reset -- Contact your administrator to adjust limits if needed - -> šŸ”§ **OAuth2 Troubleshooting**: See [`OAUTH2_AUTHENTICATION.md`](docs/OAUTH2_AUTHENTICATION.md) for detailed OAuth2 troubleshooting and configuration guides. - -### BigQuery Issues +**Exploration:** +- *What tables are available in the database?* -**"Access Denied" errors:** -- Ensure you have MIMIC-IV access on PhysioNet -- Verify your Google Cloud project has BigQuery API enabled -- Check that you're authenticated: `gcloud auth list` - -**"Dataset not found" errors:** -- Confirm your project ID is correct -- Ensure you have access to `physionet-data` project - -**Authentication issues:** -```bash -# Re-authenticate -gcloud auth application-default login - -# Check current authentication -gcloud auth list -``` - -## For Developers - -> See "Local Development" section above for setup instructions. 
- -### Running Tests - -```bash -pytest # All tests (includes OAuth2 and BigQuery mocks) -pytest tests/test_mcp_server.py -v # MCP server tests -pytest tests/test_oauth2_auth.py -v # OAuth2 authentication tests -``` - -### Test BigQuery Locally - -```bash -# Set environment variables -export M3_BACKEND=bigquery -export M3_PROJECT_ID=your-project-id -export GOOGLE_CLOUD_PROJECT=your-project-id - -# Optional: Test with OAuth2 authentication -export M3_OAUTH2_ENABLED=true -export M3_OAUTH2_ISSUER_URL=https://your-provider.com -export M3_OAUTH2_AUDIENCE=m3-api -export M3_OAUTH2_TOKEN="Bearer your-test-token" - -# Test MCP server -m3-mcp-server -``` - -## Roadmap - -- šŸ  **Complete Local Full Dataset**: Complete the support for `mimic-iv-full` (Download CLI) -- šŸ”§ **Advanced Tools**: More specialized medical data functions -- šŸ“Š **Visualization**: Built-in plotting and charting tools -- šŸ” **Enhanced Security**: Role-based access control, audit logging -- 🌐 **Multi-tenant Support**: Organization-level data isolation +--- -## Contributing +## Troubleshooting -We welcome contributions! Please: +- **"Parquet not found"**: Rerun `m3 init `. +- **MCP client not starting**: Check logs (Claude Desktop: Help → View Logs). +- **BigQuery Access Denied**: Run `gcloud auth application-default login` and verify project ID. -1. Fork the repository -2. Create a feature branch -3. Add tests for new functionality -4. Submit a pull request +--- -## Citation +## Contributing & Citation -If you use M3 in your research, please cite: +### For Developers +We welcome contributions! +1. **Setup:** Follow the "Local Development" steps above. +2. **Test:** Run `uv run pre-commit --all-files` to ensure everything is working and linted. +3. **Submit:** Open a Pull Request with your changes. 
+**Citation:** ```bibtex @article{attrach2025conversational, title={Conversational LLMs Simplify Secure Clinical Data Access, Understanding, and Analysis}, diff --git a/src/m3/cli.py b/src/m3/cli.py index cc7a4dc..43df89a 100644 --- a/src/m3/cli.py +++ b/src/m3/cli.py @@ -8,7 +8,6 @@ from m3 import __version__ from m3.config import ( - SUPPORTED_DATASETS, detect_available_local_datasets, get_active_dataset, get_dataset_config, @@ -24,6 +23,7 @@ init_duckdb_from_parquet, verify_table_rowcount, ) +from m3.datasets import DatasetRegistry app = typer.Typer( name="m3", @@ -81,7 +81,7 @@ def dataset_init_cmd( typer.Argument( help=( "Dataset to initialize (local). Default: 'mimic-iv-demo'. " - f"Supported: {', '.join(SUPPORTED_DATASETS.keys())}" + f"Supported: {', '.join([ds.name for ds in DatasetRegistry.list_all()])}" ), metavar="DATASET_NAME", ), @@ -111,9 +111,8 @@ def dataset_init_cmd( - If neither exists: download (demo only), convert, then initialize Notes: - - Auto-download currently supports only 'mimic-iv-demo'. For 'mimic-iv-full', - place the official raw CSV.gz files under /m3_data/raw_files// - with 'hosp/' and 'icu/' subdirectories, then re-run this command. + - Auto-download is based on the dataset definition URL. + - For datasets without a download URL (e.g. mimic-iv-full), you must provide the --src path or place files in the expected location. 
""" logger.info(f"CLI 'init' called for dataset: '{dataset_name}'") @@ -126,7 +125,7 @@ def dataset_init_cmd( err=True, ) typer.secho( - f"Supported datasets are: {', '.join(SUPPORTED_DATASETS.keys())}", + f"Supported datasets are: {', '.join([ds.name for ds in DatasetRegistry.list_all()])}", fg=typer.colors.YELLOW, err=True, ) @@ -144,7 +143,6 @@ def dataset_init_cmd( csv_root = Path(src).resolve() if src else csv_root_default # Presence detection (check for any parquet or csv.gz files) - # NOTE: Checks need to be more robust as soon as we support the full dataset for download (don't just check for any file, but that no files are missing) parquet_present = any(pq_root.rglob("*.parquet")) raw_present = any(csv_root.rglob("*.csv.gz")) @@ -152,14 +150,40 @@ def dataset_init_cmd( typer.echo(f"Raw root: {csv_root} (present={raw_present})") typer.echo(f"Parquet root: {pq_root} (present={parquet_present})") - # Step 1: Ensure raw dataset exists (download demo if missing; for full, inform and return) + # Step 1: Ensure raw dataset exists (download if missing, for requires_authentication datasets, inform and return) if not raw_present and not parquet_present: - if dataset_key == "mimic-iv-demo": + requires_auth = dataset_config.get("requires_authentication", False) + + if requires_auth: + base_url = dataset_config.get("file_listing_url") + + typer.secho( + f"āŒ Files not found for credentialed dataset '{dataset_key}'.", + fg=typer.colors.RED, + ) + typer.echo("To download this credentialed dataset:") + typer.echo( + f"1. Ensure you have signed the DUA at: {base_url or 'https://physionet.org'}" + ) + typer.echo( + "2. Run this command (you will be asked for your PhysioNet password):" + ) + typer.echo("") + + # Wget command tailored to the user's path + wget_cmd = f"wget -r -N -c -np --user YOUR_USERNAME --ask-password {base_url} -P {csv_root}" + typer.secho(f" {wget_cmd}", fg=typer.colors.CYAN) + typer.echo("") + typer.echo(f"3. 
Re-run 'm3 init {dataset_key}'") + return + + listing_url = dataset_config.get("file_listing_url") + if listing_url: out_dir = csv_root_default out_dir.mkdir(parents=True, exist_ok=True) typer.echo(f"Downloading dataset: '{dataset_key}'") - typer.echo(f"Listing URL: {dataset_config.get('file_listing_url')}") + typer.echo(f"Listing URL: {listing_url}") typer.echo(f"Output directory: {out_dir}") ok = download_dataset(dataset_key, out_dir) @@ -177,16 +201,16 @@ def dataset_init_cmd( raw_present = True else: typer.secho( - "Auto-download is only supported for 'mimic-iv-demo'.", + f"Auto-download is not available for '{dataset_key}'.", fg=typer.colors.YELLOW, ) typer.secho( ( - "To initialize 'mimic-iv-full':\n" - "1) Download the official MIMIC-IV dataset from PhysioNet (this requires a PhysioNet account with dataset access)\n" - "2) Place the raw CSV.gz files under: {csv_root_default}\n" - " Ensure the structure includes 'hosp/' and 'icu/' subdirectories.\n" - "3) Then re-run: m3 init mimic-iv-full" + "To initialize this dataset:\n" + "1) Download the raw data manually.\n" + f"2) Place the raw CSV.gz files under: {csv_root_default}\n" + " (or use --src to point to their location)\n" + f"3) Then re-run: m3 init {dataset_key}" ), fg=typer.colors.WHITE, ) @@ -207,7 +231,7 @@ def dataset_init_cmd( raise typer.Exit(code=1) typer.secho("āœ… Conversion complete.", fg=typer.colors.GREEN) - # Step 2: Initialize DuckDB over Parquet + # Step 3: Initialize DuckDB over Parquet final_db_path = ( Path(db_path_str).resolve() if db_path_str @@ -287,10 +311,7 @@ def dataset_init_cmd( ) # Set active dataset to match init target - if dataset_key == "mimic-iv-demo": - set_active_dataset("demo") - elif dataset_key == "mimic-iv-full": - set_active_dataset("full") + set_active_dataset(dataset_key) @app.command("use") @@ -298,31 +319,60 @@ def use_cmd( target: Annotated[ str, typer.Argument( - help="Select active dataset: demo | full | bigquery", metavar="TARGET" + help="Select active 
dataset: name (e.g., mimic-iv-full)", metavar="TARGET" ), ], ): """Set the active dataset selection for the project.""" target = target.lower() - if target not in ("demo", "full", "bigquery"): + + # 1. Check if dataset is registered + # We use detect_available_local_datasets just to get the list + status, + # but we could also just check DatasetRegistry directly. + availability = detect_available_local_datasets().get(target) + + if not availability: typer.secho( - "Target must be one of: demo, full, bigquery", fg=typer.colors.RED, err=True + f"Dataset '{target}' not found or not registered.", + fg=typer.colors.RED, + err=True, ) + # List available + supported = ", ".join([ds.name for ds in DatasetRegistry.list_all()]) + typer.secho(f"Supported datasets: {supported}", fg=typer.colors.YELLOW) raise typer.Exit(code=1) - if target in ("demo", "full"): - availability = detect_available_local_datasets()[target] - if not availability["parquet_present"]: - typer.secho( - f"Parquet directory missing at {availability['parquet_root']}. Cannot activate '{target}'.", - fg=typer.colors.RED, - err=True, - ) - raise typer.Exit(code=1) - + # 2. Set it active immediately (don't block on files) set_active_dataset(target) typer.secho(f"Active dataset set to '{target}'.", fg=typer.colors.GREEN) + # 3. Warn if local files are missing (helpful info, not a blocker) + if not availability["parquet_present"]: + typer.secho( + f"āš ļø Note: Local Parquet files not found at {availability['parquet_root']}.", + fg=typer.colors.YELLOW, + ) + typer.echo( + " This is fine if you are using the BigQuery backend.\n" + " If you intend to use DuckDB (local), run 'm3 init' first." + ) + else: + typer.secho( + " Local: Available", + ) + + # 4. 
Check BigQuery support + ds_def = DatasetRegistry.get(target) + if ds_def: + if not ds_def.bigquery_dataset_ids: + typer.secho( + "āš ļø Warning: This dataset is not configured for BigQuery.", + fg=typer.colors.YELLOW, + ) + typer.echo(" If you are using the BigQuery backend, queries will fail.") + else: + typer.echo(f" BigQuery: Available (Project: {ds_def.bigquery_project_id})") + @app.command("status") def status_cmd(): @@ -334,9 +384,11 @@ def status_cmd(): ) availability = detect_available_local_datasets() + if not availability: + typer.echo("No datasets detected.") + return - for label in ("demo", "full"): - info = availability[label] + for label, info in availability.items(): typer.secho(f"\n=== {label.upper()} ===", fg=typer.colors.BRIGHT_BLUE) parquet_icon = "āœ…" if info["parquet_present"] else "āŒ" @@ -354,9 +406,14 @@ def status_cmd(): except Exception: typer.echo(" parquet_size_gb: (skipped)") + # Show BigQuery status + ds_def = DatasetRegistry.get(label) + if ds_def: + bq_status = "āœ…" if ds_def.bigquery_dataset_ids else "āŒ" + typer.echo(f" BigQuery Support: {bq_status}") + # Try a quick rowcount on the verification table if db present - ds_name = "mimic-iv-demo" if label == "demo" else "mimic-iv-full" - cfg = get_dataset_config(ds_name) + cfg = get_dataset_config(label) if info["db_present"] and cfg: try: count = verify_table_rowcount( @@ -507,17 +564,10 @@ def config_cmd( if backend != "duckdb": cmd.extend(["--backend", backend]) - # For duckdb, infer db_path from active dataset if not provided - if backend == "duckdb": - if db_path: - inferred_db_path = Path(db_path).resolve() - else: - active_dataset = get_active_dataset() - if not active_dataset: - # default to demo if nothing is set - inferred_db_path = get_default_database_path("mimic-iv-demo") - else: - inferred_db_path = get_default_database_path(active_dataset) + # For duckdb, pass db_path only if explicitly provided. 
+ # If omitted, the server will resolve it dynamically based on the active dataset. + if backend == "duckdb" and db_path: + inferred_db_path = Path(db_path).resolve() cmd.extend(["--db-path", str(inferred_db_path)]) elif backend == "bigquery" and project_id: diff --git a/src/m3/config.py b/src/m3/config.py index fd094e7..b368b3b 100644 --- a/src/m3/config.py +++ b/src/m3/config.py @@ -1,6 +1,11 @@ +import dataclasses import json import logging +import os from pathlib import Path +from typing import Any + +from m3.datasets import DatasetDefinition, DatasetRegistry APP_NAME = "m3" @@ -38,38 +43,37 @@ def _get_project_root() -> Path: _DEFAULT_DATABASES_DIR = _PROJECT_DATA_DIR / "databases" _DEFAULT_PARQUET_DIR = _PROJECT_DATA_DIR / "parquet" _RUNTIME_CONFIG_PATH = _PROJECT_DATA_DIR / "config.json" - -# -------------------------------------------------- -# Dataset configurations (add more entries as needed) -# -------------------------------------------------- -SUPPORTED_DATASETS = { - "mimic-iv-demo": { - "file_listing_url": "https://physionet.org/files/mimic-iv-demo/2.2/", - "subdirectories_to_scan": ["hosp", "icu"], - "default_duckdb_filename": "mimic_iv_demo.duckdb", - "primary_verification_table": "hosp_admissions", - }, - "mimic-iv-full": { - "file_listing_url": None, - "subdirectories_to_scan": ["hosp", "icu"], - "default_duckdb_filename": "mimic_iv_full.duckdb", - "primary_verification_table": "hosp_admissions", - }, -} - -# Dataset name aliases used on the CLI -CLI_DATASET_ALIASES = { - "demo": "mimic-iv-demo", - "full": "mimic-iv-full", -} +_CUSTOM_DATASETS_DIR = _PROJECT_DATA_DIR / "datasets" # -------------------------------------------------- # Helper functions # -------------------------------------------------- +def _load_custom_datasets(): + """Load custom dataset definitions from JSON files in m3_data/datasets/.""" + if not _CUSTOM_DATASETS_DIR.exists(): + logger.warning( + f"Custom datasets directory does not exist: {_CUSTOM_DATASETS_DIR}" + ) + 
return + + for f in _CUSTOM_DATASETS_DIR.glob("*.json"): + try: + data = json.loads(f.read_text()) + # Basic validation/loading + ds = DatasetDefinition(**data) + DatasetRegistry.register(ds) + except Exception as e: + logger.warning(f"Failed to load custom dataset from {f}: {e}") + + def get_dataset_config(dataset_name: str) -> dict | None: """Retrieve the configuration for a given dataset (case-insensitive).""" - return SUPPORTED_DATASETS.get(dataset_name.lower()) + # Ensure custom datasets are loaded + _load_custom_datasets() + + ds = DatasetRegistry.get(dataset_name.lower()) + return dataclasses.asdict(ds) if ds else None def get_default_database_path(dataset_name: str) -> Path | None: @@ -77,7 +81,6 @@ def get_default_database_path(dataset_name: str) -> Path | None: Return the default local DuckDB path for a given dataset, under /m3_data/databases/. """ - cfg = get_dataset_config(dataset_name) if not cfg: logger.warning( @@ -116,19 +119,16 @@ def _ensure_data_dirs(): _DEFAULT_DATABASES_DIR.mkdir(parents=True, exist_ok=True) _DEFAULT_PARQUET_DIR.mkdir(parents=True, exist_ok=True) _PROJECT_DATA_DIR.mkdir(parents=True, exist_ok=True) + _CUSTOM_DATASETS_DIR.mkdir(parents=True, exist_ok=True) def _get_default_runtime_config() -> dict: + # We initialize with empty overrides. + # Paths are derived dynamically from registry unless overridden here. 
return { "active_dataset": None, - "duckdb_paths": { - "demo": str(get_default_database_path("mimic-iv-demo") or ""), - "full": str(get_default_database_path("mimic-iv-full") or ""), - }, - "parquet_roots": { - "demo": str(get_dataset_parquet_root("mimic-iv-demo") or ""), - "full": str(get_dataset_parquet_root("mimic-iv-full") or ""), - }, + "duckdb_paths": {}, # Map dataset_name -> path + "parquet_roots": {}, # Map dataset_name -> path } @@ -153,76 +153,86 @@ def _has_parquet_files(path: Path | None) -> bool: return bool(path and path.exists() and any(path.rglob("*.parquet"))) -def detect_available_local_datasets() -> dict: - """Return presence flags for demo/full based on Parquet roots and DuckDB files.""" +def detect_available_local_datasets() -> dict[str, dict[str, Any]]: + """Return presence flags for all registered datasets.""" + _load_custom_datasets() cfg = load_runtime_config() - demo_parquet_path = ( - Path(cfg["parquet_roots"]["demo"]) - if cfg["parquet_roots"]["demo"] - else get_dataset_parquet_root("mimic-iv-demo") - ) - full_parquet_path = ( - Path(cfg["parquet_roots"]["full"]) - if cfg["parquet_roots"]["full"] - else get_dataset_parquet_root("mimic-iv-full") - ) - demo_db_path = ( - Path(cfg["duckdb_paths"]["demo"]) - if cfg["duckdb_paths"]["demo"] - else get_default_database_path("mimic-iv-demo") - ) - full_db_path = ( - Path(cfg["duckdb_paths"]["full"]) - if cfg["duckdb_paths"]["full"] - else get_default_database_path("mimic-iv-full") - ) - return { - "demo": { - "parquet_present": _has_parquet_files(demo_parquet_path), - "db_present": bool(demo_db_path and demo_db_path.exists()), - "parquet_root": str(demo_parquet_path) if demo_parquet_path else "", - "db_path": str(demo_db_path) if demo_db_path else "", - }, - "full": { - "parquet_present": _has_parquet_files(full_parquet_path), - "db_present": bool(full_db_path and full_db_path.exists()), - "parquet_root": str(full_parquet_path) if full_parquet_path else "", - "db_path": str(full_db_path) if 
full_db_path else "", - }, - } + + results = {} + + # Check all registered datasets + for ds in DatasetRegistry.list_all(): + name = ds.name + + # Determine paths (check config overrides first) + parquet_root_str = cfg.get("parquet_roots", {}).get(name) + parquet_root = ( + Path(parquet_root_str) + if parquet_root_str + else get_dataset_parquet_root(name) + ) + + db_path_str = cfg.get("duckdb_paths", {}).get(name) + db_path = Path(db_path_str) if db_path_str else get_default_database_path(name) + + results[name] = { + "parquet_present": _has_parquet_files(parquet_root), + "db_present": bool(db_path and db_path.exists()), + "parquet_root": str(parquet_root) if parquet_root else "", + "db_path": str(db_path) if db_path else "", + } + + return results def get_active_dataset() -> str | None: + """Get the active dataset name.""" + # Ensure custom datasets are loaded so they can be found in the registry + _load_custom_datasets() + + # Priority 1: Environment variable + env_dataset = os.getenv("M3_DATASET") + if env_dataset: + return env_dataset + + # Priority 2: Config file cfg = load_runtime_config() active = cfg.get("active_dataset") - if active in CLI_DATASET_ALIASES: - return CLI_DATASET_ALIASES[active] - if active == "bigquery": - return "bigquery" - # Auto-detect default: prefer demo, then full - availability = detect_available_local_datasets() - if availability["demo"]["parquet_present"]: - return CLI_DATASET_ALIASES["demo"] - if availability["full"]["parquet_present"]: - return CLI_DATASET_ALIASES["full"] - logger.warning("Unknown active_dataset value in config: %s", active) - return None + # Priority 3: Auto-detect default: prefer demo, then full + if not active: + availability = detect_available_local_datasets() + if availability.get("mimic-iv-demo", {}).get("parquet_present"): + active = "mimic-iv-demo" + elif availability.get("mimic-iv-full", {}).get("parquet_present"): + active = "mimic-iv-full" + else: + active = None + + return active def 
set_active_dataset(choice: str) -> None: - if choice not in ("demo", "full", "bigquery"): - raise ValueError("active_dataset must be one of: demo, full, bigquery") + # Allow registered names + valid_names = {ds.name for ds in DatasetRegistry.list_all()} + + if choice not in valid_names: + # It might be a new custom dataset not yet loaded in this process? + # We'll allow it if it's in the registry now. + _load_custom_datasets() + if not DatasetRegistry.get(choice): + raise ValueError( + f"active_dataset must be a registered dataset. Got: {choice}" + ) + cfg = load_runtime_config() cfg["active_dataset"] = choice save_runtime_config(cfg) def get_duckdb_path_for(choice: str) -> Path | None: - key = "mimic-iv-demo" if choice == "demo" else "mimic-iv-full" - return get_default_database_path(key) if choice in ("demo", "full") else None + return get_default_database_path(choice) def get_parquet_root_for(choice: str) -> Path | None: - key = "mimic-iv-demo" if choice == "demo" else "mimic-iv-full" - return get_dataset_parquet_root(key) if choice in ("demo", "full") else None + return get_dataset_parquet_root(choice) diff --git a/src/m3/data_io.py b/src/m3/data_io.py index f5d7d92..d60adf0 100644 --- a/src/m3/data_io.py +++ b/src/m3/data_io.py @@ -113,15 +113,23 @@ def _download_dataset_files( all_files_to_process = [] # List of (url, local_target_path) - for subdir_name in subdirs_to_scan: - subdir_listing_url = urljoin(base_listing_url, f"{subdir_name}/") - logger.info(f"Scanning subdirectory for CSVs: {subdir_listing_url}") - csv_urls_in_subdir = _scrape_urls_from_html_page(subdir_listing_url, session) + # Prepare list of (subdir_name, listing_url) + # If subdirs_to_scan is empty, we scan the base_listing_url directly (root) + scan_targets = [] + if not subdirs_to_scan: + scan_targets.append(("", base_listing_url)) + else: + for subdir in subdirs_to_scan: + # Ensure slash for directory joining + subdir_url = urljoin(base_listing_url, f"{subdir}/") + 
scan_targets.append((subdir, subdir_url)) + + for subdir_name, listing_url in scan_targets: + logger.info(f"Scanning for CSVs: {listing_url}") + csv_urls_in_subdir = _scrape_urls_from_html_page(listing_url, session) if not csv_urls_in_subdir: - logger.warning( - f"No .csv.gz files found in subdirectory: {subdir_listing_url}" - ) + logger.warning(f"No .csv.gz files found in location: {listing_url}") continue for file_url in csv_urls_in_subdir: @@ -160,10 +168,7 @@ def _download_dataset_files( all_files_to_process.append((file_url, local_target_path)) if not all_files_to_process: - logger.error( - f"No '.csv.gz' download links found after scanning {base_listing_url} " - f"and its subdirectories {subdirs_to_scan} for dataset '{dataset_name}'." - ) + logger.error(f"No '.csv.gz' download links found for dataset '{dataset_name}'.") return False # Deduplicate and sort for consistent processing order @@ -199,6 +204,15 @@ def download_dataset(dataset_name: str, output_root: Path) -> bool: if not cfg: logger.error(f"Unsupported dataset: {dataset_name}") return False + + # Prevent accidental scraping of credentialed datasets + if cfg.get("requires_authentication"): + logger.error( + f"Dataset '{dataset_name}' requires authentication and cannot be auto-downloaded. " + "Please download files manually." + ) + return False + if not cfg.get("file_listing_url"): logger.error( f"Dataset '{dataset_name}' does not have a configured listing URL. " @@ -359,11 +373,12 @@ def init_duckdb_from_parquet(dataset_name: str, db_target_path: Path) -> bool: def _create_duckdb_with_views(db_path: Path, parquet_root: Path) -> bool: """ Create a DuckDB database and define one view per Parquet file, - using the proper table naming structure that matches MIMIC-IV expectations. + using a generic table naming structure: folder_subfolder_filename. 
For example: - hosp/admissions.parquet → view: hosp_admissions - icu/chartevents.parquet → view: icu_chartevents + - data.parquet → view: data """ con = duckdb.connect(str(db_path)) try: @@ -460,7 +475,7 @@ def ensure_duckdb_for_dataset( dataset_key: str, ) -> tuple[bool, Path | None, Path | None]: """ - Ensure DuckDB exists and views are created for the dataset ('mimic-iv-demo'|'mimic-iv-full'). + Ensure DuckDB exists and views are created for the dataset. Returns (ok, db_path, parquet_root). """ db_path = get_default_database_path(dataset_key) diff --git a/src/m3/datasets.py b/src/m3/datasets.py new file mode 100644 index 0000000..cc08735 --- /dev/null +++ b/src/m3/datasets.py @@ -0,0 +1,82 @@ +from dataclasses import dataclass, field +from typing import ClassVar + + +@dataclass +class DatasetDefinition: + name: str + description: str = "" + version: str = "1.0" + file_listing_url: str | None = None + subdirectories_to_scan: list[str] = field(default_factory=list) + default_duckdb_filename: str | None = None + primary_verification_table: str | None = None + tags: list[str] = field(default_factory=list) + + # For backward compatibility or ease of use, we might add a way to access as dict if needed, + # but we'll try to use object access. 
+ + # BigQuery Configuration + bigquery_project_id: str | None = "physionet-data" + bigquery_dataset_ids: list[str] = field(default_factory=list) + + # Authentication & Download Helpers + requires_authentication: bool = False + + def __post_init__(self): + if not self.default_duckdb_filename: + self.default_duckdb_filename = f"{self.name.replace('-', '_')}.duckdb" + + +class DatasetRegistry: + _registry: ClassVar[dict[str, DatasetDefinition]] = {} + + @classmethod + def register(cls, dataset: DatasetDefinition): + cls._registry[dataset.name.lower()] = dataset + + @classmethod + def get(cls, name: str) -> DatasetDefinition | None: + return cls._registry.get(name.lower()) + + @classmethod + def list_all(cls) -> list[DatasetDefinition]: + return list(cls._registry.values()) + + @classmethod + def reset(cls): + cls._registry.clear() + cls._register_builtins() + + @classmethod + def _register_builtins(cls): + # Built-in datasets + mimic_iv_demo = DatasetDefinition( + name="mimic-iv-demo", + description="MIMIC-IV Clinical Database Demo", + file_listing_url="https://physionet.org/files/mimic-iv-demo/2.2/", + subdirectories_to_scan=["hosp", "icu"], + primary_verification_table="hosp_admissions", + tags=["mimic", "clinical", "demo"], + bigquery_project_id=None, + bigquery_dataset_ids=None, + ) + + mimic_iv_full = DatasetDefinition( + name="mimic-iv-full", + description="MIMIC-IV Clinical Database (Full)", + file_listing_url="https://physionet.org/files/mimiciv/3.1/", + subdirectories_to_scan=["hosp", "icu"], + primary_verification_table="hosp_admissions", + tags=["mimic", "clinical", "full"], + bigquery_project_id="physionet-data", + bigquery_dataset_ids=["mimiciv_3_1_hosp", "mimiciv_3_1_icu"], + requires_authentication=True, + ) + + cls.register(mimic_iv_demo) + cls.register(mimic_iv_full) + + +# Initialize registry +DatasetRegistry._register_builtins() diff --git a/src/m3/mcp_client_configs/dynamic_mcp_config.py b/src/m3/mcp_client_configs/dynamic_mcp_config.py index 
a879761..567981f 100644 --- a/src/m3/mcp_client_configs/dynamic_mcp_config.py +++ b/src/m3/mcp_client_configs/dynamic_mcp_config.py @@ -10,7 +10,7 @@ from pathlib import Path from typing import Any -from m3.config import get_active_dataset, get_default_database_path +from m3.config import get_default_database_path # Error messages _DATABASE_PATH_ERROR_MSG = ( @@ -86,17 +86,7 @@ def generate_config( if backend == "duckdb": if db_path: env["M3_DB_PATH"] = db_path - else: - active = get_active_dataset() - if not active: - raise ValueError( - "Could not determine default DuckDB path; run `m3 init ...` first " - "or pass --db-path explicitly." - ) - default_path = get_default_database_path(active) - if not default_path: - raise ValueError(_DATABASE_PATH_ERROR_MSG) - env["M3_DB_PATH"] = str(default_path) + # If no db_path, we rely on dynamic resolution in the server elif backend == "bigquery" and project_id: env["M3_PROJECT_ID"] = project_id @@ -194,9 +184,12 @@ def interactive_config(self) -> dict[str, Any]: raise ValueError(_DATABASE_PATH_ERROR_MSG) print(f"Default database path: {default_db_path}") + print( + "\nLeaving database path empty allows switching datasets dynamically via 'm3 use'." 
+ ) db_path = ( input( - "DuckDB database path (optional, press Enter to use default): " + "DuckDB database path (optional, press Enter for dynamic): " ).strip() or None ) diff --git a/src/m3/mcp_server.py b/src/m3/mcp_server.py index 1a7ad6f..2f3cf63 100644 --- a/src/m3/mcp_server.py +++ b/src/m3/mcp_server.py @@ -11,16 +11,86 @@ from fastmcp import FastMCP from m3.auth import init_oauth2, require_oauth2 -from m3.config import get_default_database_path +from m3.config import get_active_dataset, get_default_database_path +from m3.datasets import DatasetRegistry # Create FastMCP server instance mcp = FastMCP("m3") # Global variables for backend configuration _backend = None -_db_path = None -_bq_client = None -_project_id = None +# Cache for BigQuery client to avoid re-initializing on every request +_bq_client_cache = {"client": None, "project_id": None} + + +def _get_active_dataset_def(): + """Get the currently active dataset definition.""" + # 1. Try currently active dataset from config/env + active_ds_name = get_active_dataset() + if active_ds_name: + return DatasetRegistry.get(active_ds_name) + + # 2. Fallback for BigQuery: try to find a full definition + if _backend == "bigquery": + # Use mimic-iv-full as reference if available, else demo + return DatasetRegistry.get("mimic-iv-full") or DatasetRegistry.get( + "mimic-iv-demo" + ) + + # 3. Fallback for DuckDB: demo + return DatasetRegistry.get("mimic-iv-demo") + + +def _get_db_path(): + """Get the current DuckDB path.""" + # 1. Env var overrides everything (static mode) + env_path = os.getenv("M3_DB_PATH") + if env_path: + return env_path + + # 2. 
Dynamic resolution based on active dataset + ds_def = _get_active_dataset_def() + if ds_def: + path = get_default_database_path(ds_def.name) + return str(path) if path else None + + return None + + +def _get_bq_client(): + """Get or create a BigQuery client for the current project.""" + try: + from google.cloud import bigquery + except ImportError: + raise ImportError( + "BigQuery dependencies not found. Install with: pip install google-cloud-bigquery" + ) + + # Determine target project ID + # Priority: Env Var > Dataset Config > Default + env_project = os.getenv("M3_PROJECT_ID") + ds_def = _get_active_dataset_def() + ds_project = ds_def.bigquery_project_id if ds_def else None + + target_project_id = env_project or ds_project or "physionet-data" + + # Check cache + if ( + _bq_client_cache["client"] + and _bq_client_cache["project_id"] == target_project_id + ): + return _bq_client_cache["client"], target_project_id + + # Create new client + try: + client = bigquery.Client(project=target_project_id) + _bq_client_cache["client"] = client + _bq_client_cache["project_id"] = target_project_id + return client, target_project_id + except Exception as e: + raise RuntimeError( + f"Failed to initialize BigQuery client for project {target_project_id}: {e}" + ) def _validate_limit(limit: int) -> bool: @@ -131,51 +201,34 @@ def _is_safe_query(sql_query: str, internal_tool: bool = False) -> tuple[bool, s def _init_backend(): """Initialize the backend based on environment variables.""" - global _backend, _db_path, _bq_client, _project_id + global _backend # Initialize OAuth2 authentication init_oauth2() _backend = os.getenv("M3_BACKEND", "duckdb") - if _backend == "duckdb": - _db_path = os.getenv("M3_DB_PATH") - if not _db_path: - path = get_default_database_path("mimic-iv-demo") - _db_path = str(path) if path else None - if not _db_path or not Path(_db_path).exists(): - raise FileNotFoundError(f"DuckDB database not found: {_db_path}") - - elif _backend == "bigquery": - try: - 
from google.cloud import bigquery - except ImportError: - raise ImportError( - "BigQuery dependencies not found. Install with: pip install google-cloud-bigquery" - ) - - # User's GCP project ID for authentication and billing - # MIMIC-IV data resides in the public 'physionet-data' project - _project_id = os.getenv("M3_PROJECT_ID", "physionet-data") - try: - _bq_client = bigquery.Client(project=_project_id) - except Exception as e: - raise RuntimeError(f"Failed to initialize BigQuery client: {e}") - - else: - raise ValueError(f"Unsupported backend: {_backend}") + if _backend not in ["duckdb", "bigquery"]: + raise ValueError( + f"Unsupported backend: {_backend}. Supported backends: duckdb, bigquery" + ) -# Initialize backend when module is imported _init_backend() def _get_backend_info() -> str: """Get current backend information for display in responses.""" + ds_def = _get_active_dataset_def() + ds_name = ds_def.name if ds_def else "unknown" + if _backend == "duckdb": - return f"šŸ”§ **Current Backend:** DuckDB (local database)\nšŸ“ **Database Path:** {_db_path}\n" + db_path = _get_db_path() + return f"šŸ”§ **Current Backend:** DuckDB (local database)\nšŸ“¦ **Active Dataset:** {ds_name}\nšŸ“ **Database Path:** {db_path}\n" else: - return f"šŸ”§ **Current Backend:** BigQuery (cloud database)\nā˜ļø **Project ID:** {_project_id}\n" + # Resolve project ID dynamically for display + _, project_id = _get_bq_client() + return f"šŸ”§ **Current Backend:** BigQuery (cloud database)\nšŸ“¦ **Active Dataset:** {ds_name}\nā˜ļø **Project ID:** {project_id}\n" # ========================================== @@ -188,8 +241,12 @@ def _get_backend_info() -> str: def _execute_duckdb_query(sql_query: str) -> str: """Execute DuckDB query - internal function.""" + db_path = _get_db_path() + if not db_path or not Path(db_path).exists(): + return "āŒ Error: Database file not found. Please initialize a dataset using 'm3 init'." 
+ try: - conn = duckdb.connect(_db_path) + conn = duckdb.connect(db_path) try: df = conn.execute(sql_query).df() if df.empty: @@ -214,8 +271,10 @@ def _execute_bigquery_query(sql_query: str) -> str: try: from google.cloud import bigquery + client, _ = _get_bq_client() + job_config = bigquery.QueryJobConfig() - query_job = _bq_client.query(sql_query, job_config=job_config) + query_job = client.query(sql_query, job_config=job_config) df = query_job.to_dataframe() if df.empty: @@ -362,15 +421,27 @@ def get_database_schema() -> str: return f"{_get_backend_info()}\nšŸ“‹ **Available Tables:**\n{result}" elif _backend == "bigquery": - # Show fully qualified table names that are ready to copy-paste into queries - query = """ - SELECT CONCAT('`physionet-data.mimiciv_3_1_hosp.', table_name, '`') as query_ready_table_name - FROM `physionet-data.mimiciv_3_1_hosp.INFORMATION_SCHEMA.TABLES` - UNION ALL - SELECT CONCAT('`physionet-data.mimiciv_3_1_icu.', table_name, '`') as query_ready_table_name - FROM `physionet-data.mimiciv_3_1_icu.INFORMATION_SCHEMA.TABLES` - ORDER BY query_ready_table_name - """ + # Dynamic schema discovery based on active dataset definition + ds_def = _get_active_dataset_def() + if not ds_def or not ds_def.bigquery_dataset_ids: + return f"{_get_backend_info()}āŒ **Error:** No BigQuery datasets configured for the active dataset." + + project_id = ds_def.bigquery_project_id or "physionet-data" + queries = [] + + for dataset_id in ds_def.bigquery_dataset_ids: + queries.append(f""" + SELECT CONCAT('`{project_id}.{dataset_id}.', table_name, '`') as query_ready_table_name + FROM `{project_id}.{dataset_id}.INFORMATION_SCHEMA.TABLES` + """) + + if not queries: + return ( + f"{_get_backend_info()}āŒ **Error:** No BigQuery datasets configured." 
+ ) + + query = " UNION ALL ".join(queries) + " ORDER BY query_ready_table_name" + result = _execute_query_internal(query) return f"{_get_backend_info()}\nšŸ“‹ **Available Tables (query-ready names):**\n{result}\n\nšŸ’” **Copy-paste ready:** These table names can be used directly in your SQL queries!" @@ -421,8 +492,10 @@ def get_table_info(table_name: str, show_sample: bool = True) -> str: else: # bigquery # Handle both simple names (patients) and fully qualified names (`physionet-data.mimiciv_3_1_hosp.patients`) - # Detect qualified names by content: dots + physionet pattern - if "." in table_name and "physionet-data" in table_name: + # Detect qualified names by content: dots + project ID pattern or backticks + is_qualified = "." in table_name + + if is_qualified: # Qualified name (format-agnostic: works with or without backticks) clean_name = table_name.strip("`") full_table_name = f"`{clean_name}`" @@ -433,26 +506,23 @@ def get_table_info(table_name: str, show_sample: bool = True) -> str: error_msg = ( f"{backend_info}āŒ **Invalid qualified table name:** `{table_name}`\n\n" "**Expected format:** `project.dataset.table`\n" - "**Example:** `physionet-data.mimiciv_3_1_hosp.diagnoses_icd`\n\n" - "**Available MIMIC-IV datasets:**\n" - "- `physionet-data.mimiciv_3_1_hosp.*` (hospital module)\n" - "- `physionet-data.mimiciv_3_1_icu.*` (ICU module)" + "**Example:** `physionet-data.mimiciv_3_1_hosp.diagnoses_icd`\n" ) return error_msg simple_table_name = parts[2] # table name - dataset = f"{parts[0]}.{parts[1]}" # project.dataset + dataset_ref = f"{parts[0]}.{parts[1]}" # project.dataset else: - # Simple name - try both datasets to find the table + # Simple name - try to find it in configured datasets simple_table_name = table_name full_table_name = None - dataset = None + dataset_ref = None # If we have a fully qualified name, try that first if full_table_name: try: # Get column information using the dataset from the full name - dataset_parts = dataset.split(".") + 
dataset_parts = dataset_ref.split(".") if len(dataset_parts) >= 2: project_dataset = f"`{dataset_parts[0]}.{dataset_parts[1]}`" info_query = f""" @@ -473,35 +543,36 @@ def get_table_info(table_name: str, show_sample: bool = True) -> str: return result except Exception: - pass # Fall through to try simple name approach + pass # Fall through to try search approach if direct lookup fails (unlikely but safe) - # Try both datasets with simple name (fallback or original approach) - for dataset in ["mimiciv_3_1_hosp", "mimiciv_3_1_icu"]: - try: - full_table_name = f"`physionet-data.{dataset}.{simple_table_name}`" - - # Get column information - info_query = f""" - SELECT column_name, data_type, is_nullable - FROM `physionet-data.{dataset}.INFORMATION_SCHEMA.COLUMNS` - WHERE table_name = '{simple_table_name}' - ORDER BY ordinal_position - """ - - info_result = _execute_bigquery_query(info_query) - if "No results found" not in info_result: - result = f"{backend_info}šŸ“‹ **Table:** {full_table_name}\n\n**Column Information:**\n{info_result}" - - if show_sample: - sample_query = f"SELECT * FROM {full_table_name} LIMIT 3" - sample_result = _execute_bigquery_query(sample_query) - result += ( - f"\n\nšŸ“Š **Sample Data (first 3 rows):**\n{sample_result}" - ) - - return result - except Exception: - continue + # Try configured datasets with simple name + ds_def = _get_active_dataset_def() + if ds_def and ds_def.bigquery_dataset_ids: + project_id = ds_def.bigquery_project_id or "physionet-data" + for dataset_id in ds_def.bigquery_dataset_ids: + try: + full_table_name = f"`{project_id}.{dataset_id}.{simple_table_name}`" + + # Get column information + info_query = f""" + SELECT column_name, data_type, is_nullable + FROM `{project_id}.{dataset_id}.INFORMATION_SCHEMA.COLUMNS` + WHERE table_name = '{simple_table_name}' + ORDER BY ordinal_position + """ + + info_result = _execute_bigquery_query(info_query) + if "No results found" not in info_result: + result = f"{backend_info}šŸ“‹ 
**Table:** {full_table_name}\n\n**Column Information:**\n{info_result}" + + if show_sample: + sample_query = f"SELECT * FROM {full_table_name} LIMIT 3" + sample_result = _execute_bigquery_query(sample_query) + result += f"\n\nšŸ“Š **Sample Data (first 3 rows):**\n{sample_result}" + + return result + except Exception: + continue return f"{backend_info}āŒ Table '{table_name}' not found in any dataset. Use get_database_schema() to see available tables." @@ -549,6 +620,11 @@ def get_icu_stays(patient_id: int | None = None, limit: int = 10) -> str: Returns: ICU stay data as formatted text or guidance if table not found """ + # Check dataset compatibility + ds_def = _get_active_dataset_def() + if ds_def and "mimic" not in ds_def.tags: + return f"āŒ **Error:** This tool is optimized for MIMIC datasets. The current dataset '{ds_def.name}' does not appear to be a MIMIC dataset." + # Security validation if not _validate_limit(limit): return "Error: Invalid limit. Must be a positive integer between 1 and 10000." 
@@ -557,7 +633,22 @@ def get_icu_stays(patient_id: int | None = None, limit: int = 10) -> str: if _backend == "duckdb": icustays_table = "icu_icustays" else: # bigquery - icustays_table = "`physionet-data.mimiciv_3_1_icu.icustays`" + # Try to find icustays in configured datasets + project_id = ( + ds_def.bigquery_project_id or "physionet-data" + if ds_def + else "physionet-data" + ) + found = False + dataset_ids = ds_def.bigquery_dataset_ids if ds_def else [] + for ds in dataset_ids: + if "icu" in ds: + icustays_table = f"`{project_id}.{ds}.icustays`" + found = True + break + if not found: + # Fallback + icustays_table = "`physionet-data.mimiciv_3_1_icu.icustays`" if patient_id: query = f"SELECT * FROM {icustays_table} WHERE subject_id = {patient_id}" @@ -599,6 +690,11 @@ def get_lab_results( Returns: Lab results as formatted text or guidance if table not found """ + # Check dataset compatibility + ds_def = _get_active_dataset_def() + if ds_def and "mimic" not in ds_def.tags: + return f"āŒ **Error:** This tool is optimized for MIMIC datasets. The current dataset '{ds_def.name}' does not appear to be a MIMIC dataset." + # Security validation if not _validate_limit(limit): return "Error: Invalid limit. Must be a positive integer between 1 and 10000." 
@@ -607,7 +703,22 @@ def get_lab_results( if _backend == "duckdb": labevents_table = "hosp_labevents" else: # bigquery - labevents_table = "`physionet-data.mimiciv_3_1_hosp.labevents`" + # Try to find labevents in configured datasets + project_id = ( + ds_def.bigquery_project_id or "physionet-data" + if ds_def + else "physionet-data" + ) + found = False + dataset_ids = ds_def.bigquery_dataset_ids if ds_def else [] + for ds in dataset_ids: + if "hosp" in ds: + labevents_table = f"`{project_id}.{ds}.labevents`" + found = True + break + if not found: + # Fallback + labevents_table = "`physionet-data.mimiciv_3_1_hosp.labevents`" # Build query conditions conditions = [] @@ -654,6 +765,11 @@ def get_race_distribution(limit: int = 10) -> str: Returns: Race distribution as formatted text or guidance if table not found """ + # Check dataset compatibility + ds_def = _get_active_dataset_def() + if ds_def and "mimic" not in ds_def.tags: + return f"āŒ **Error:** This tool is optimized for MIMIC datasets. The current dataset '{ds_def.name}' does not appear to be a MIMIC dataset." + # Security validation if not _validate_limit(limit): return "Error: Invalid limit. Must be a positive integer between 1 and 10000." 
@@ -662,7 +778,22 @@ def get_race_distribution(limit: int = 10) -> str: if _backend == "duckdb": admissions_table = "hosp_admissions" else: # bigquery - admissions_table = "`physionet-data.mimiciv_3_1_hosp.admissions`" + # Try to find admissions in configured datasets + project_id = ( + ds_def.bigquery_project_id or "physionet-data" + if ds_def + else "physionet-data" + ) + found = False + dataset_ids = ds_def.bigquery_dataset_ids if ds_def else [] + for ds in dataset_ids: + if "hosp" in ds: + admissions_table = f"`{project_id}.{ds}.admissions`" + found = True + break + if not found: + # Fallback + admissions_table = "`physionet-data.mimiciv_3_1_hosp.admissions`" query = f"SELECT race, COUNT(*) as count FROM {admissions_table} GROUP BY race ORDER BY count DESC LIMIT {limit}" diff --git a/tests/test_cli.py b/tests/test_cli.py index 8e55966..ff52159 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -162,13 +162,9 @@ def test_config_claude_infers_db_path_demo( result = runner.invoke(app, ["config", "claude"]) assert result.exit_code == 0 - # subprocess run should be called with inferred --db-path + # subprocess run should NOT be called with inferred --db-path (dynamic resolution) call_args = mock_subprocess.call_args[0][0] - assert "--db-path" in call_args - assert "/tmp/inferred-demo.duckdb" in call_args - - # Should have asked for demo duckdb path - mock_get_default.assert_called() + assert "--db-path" not in call_args @patch("subprocess.run") @@ -177,7 +173,7 @@ def test_config_claude_infers_db_path_demo( def test_config_claude_infers_db_path_full( mock_active, mock_get_default, mock_subprocess ): - mock_active.return_value = "full" + mock_active.return_value = "mimic-iv-full" mock_get_default.return_value = Path("/tmp/inferred-full.duckdb") mock_subprocess.return_value = MagicMock(returncode=0) @@ -185,21 +181,20 @@ def test_config_claude_infers_db_path_full( assert result.exit_code == 0 call_args = mock_subprocess.call_args[0][0] - assert "--db-path" in 
call_args - assert "/tmp/inferred-full.duckdb" in call_args + assert "--db-path" not in call_args @patch("m3.cli.set_active_dataset") @patch("m3.cli.detect_available_local_datasets") def test_use_full_happy_path(mock_detect, mock_set_active): mock_detect.return_value = { - "demo": { + "mimic-iv-demo": { "parquet_present": False, "db_present": False, "parquet_root": "/tmp/demo", "db_path": "/tmp/demo.duckdb", }, - "full": { + "mimic-iv-full": { "parquet_present": True, "db_present": False, "parquet_root": "/tmp/full", @@ -207,24 +202,24 @@ def test_use_full_happy_path(mock_detect, mock_set_active): }, } - result = runner.invoke(app, ["use", "full"]) + result = runner.invoke(app, ["use", "mimic-iv-full"]) assert result.exit_code == 0 - assert "Active dataset set to 'full'." in result.stdout - mock_set_active.assert_called_once_with("full") + assert "Active dataset set to 'mimic-iv-full'." in result.stdout + mock_set_active.assert_called_once_with("mimic-iv-full") @patch("m3.cli.compute_parquet_dir_size", return_value=123) -@patch("m3.cli.get_active_dataset", return_value="full") +@patch("m3.cli.get_active_dataset", return_value="mimic-iv-full") @patch("m3.cli.detect_available_local_datasets") def test_status_happy_path(mock_detect, mock_active, mock_size): mock_detect.return_value = { - "demo": { + "mimic-iv-demo": { "parquet_present": True, "db_present": False, "parquet_root": "/tmp/demo", "db_path": "/tmp/demo.duckdb", }, - "full": { + "mimic-iv-full": { "parquet_present": True, "db_present": False, "parquet_root": "/tmp/full", @@ -234,6 +229,6 @@ def test_status_happy_path(mock_detect, mock_active, mock_size): result = runner.invoke(app, ["status"]) assert result.exit_code == 0 - assert "Active dataset: full" in result.stdout + assert "Active dataset: mimic-iv-full" in result.stdout size_gb = 123 / (1024**3) assert f"parquet_size_gb: {size_gb:.4f} GB" in result.stdout diff --git a/tests/test_dynamic_switching.py b/tests/test_dynamic_switching.py new file mode 
100644 index 0000000..65e1a26 --- /dev/null +++ b/tests/test_dynamic_switching.py @@ -0,0 +1,68 @@ +import m3.config as config_mod +import m3.mcp_server as server +from m3.config import set_active_dataset + + +def test_dynamic_dataset_switching(tmp_path, monkeypatch): + # Setup mock data dir + data_dir = tmp_path / "m3_data" + data_dir.mkdir() + + # Patch config module to use our temp data dir + monkeypatch.setattr(config_mod, "_PROJECT_DATA_DIR", data_dir) + monkeypatch.setattr(config_mod, "_DEFAULT_DATABASES_DIR", data_dir / "databases") + monkeypatch.setattr(config_mod, "_DEFAULT_PARQUET_DIR", data_dir / "parquet") + monkeypatch.setattr(config_mod, "_RUNTIME_CONFIG_PATH", data_dir / "config.json") + monkeypatch.setattr(config_mod, "_CUSTOM_DATASETS_DIR", data_dir / "datasets") + + # Ensure dirs exist + (data_dir / "databases").mkdir() + (data_dir / "parquet").mkdir() + (data_dir / "datasets").mkdir() + + # 1. Start with no active dataset + # Verify server defaults to mimic-iv-demo (or falls back) + monkeypatch.setenv("M3_BACKEND", "duckdb") + monkeypatch.delenv("M3_DB_PATH", raising=False) + + # Ensure config is empty/default + if (data_dir / "config.json").exists(): + (data_dir / "config.json").unlink() + + # Check default fallback + ds_def = server._get_active_dataset_def() + assert ds_def.name == "mimic-iv-demo" + + db_path = server._get_db_path() + # Should point to demo db in our temp dir + # Note: get_default_database_path uses the patched _DEFAULT_DATABASES_DIR + assert "mimic_iv_demo.duckdb" in str(db_path) + + # 2. Set active dataset to something else (simulating 'm3 use') + # We can use 'mimic-iv-full' as it is registered + set_active_dataset("mimic-iv-full") + + # Verify config file was written + assert (data_dir / "config.json").exists() + + # Verify server picks it up + ds_def = server._get_active_dataset_def() + assert ds_def.name == "mimic-iv-full" + + db_path = server._get_db_path() + assert "mimic_iv_full.duckdb" in str(db_path) + + # 3. 
Simulate environment variable override (static mode) + monkeypatch.setenv("M3_DB_PATH", "/custom/path/to/db.duckdb") + + db_path = server._get_db_path() + assert db_path == "/custom/path/to/db.duckdb" + + # Active dataset def should still track the config/env + ds_def = server._get_active_dataset_def() + assert ds_def.name == "mimic-iv-full" + + # 4. Unset env var, should go back to dynamic + monkeypatch.delenv("M3_DB_PATH") + db_path = server._get_db_path() + assert "mimic_iv_full.duckdb" in str(db_path) diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index 643a158..ffc79a6 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -9,6 +9,9 @@ import pytest from fastmcp import Client +# Define DatasetDefinition locally if imports fail (shouldn't happen in test env) +from m3.datasets import DatasetDefinition + # Mock the database path check during import to handle CI environments with patch("pathlib.Path.exists", return_value=True): with patch( @@ -31,6 +34,14 @@ def _bigquery_available(): class TestMCPServerSetup: """Test MCP server setup and configuration.""" + @pytest.fixture(autouse=True) + def reset_bq_cache(self): + """Reset the BigQuery client cache before each test.""" + import m3.mcp_server + + if hasattr(m3.mcp_server, "_bq_client_cache"): + m3.mcp_server._bq_client_cache = {"client": None, "project_id": None} + def test_server_instance_exists(self): """Test that the FastMCP server instance exists.""" assert mcp is not None @@ -62,24 +73,53 @@ def test_backend_init_duckdb_missing_db(self): with patch("m3.mcp_server.get_default_database_path") as mock_path: mock_path.return_value = Path("/fake/path.duckdb") with patch("pathlib.Path.exists", return_value=False): - with pytest.raises(FileNotFoundError): - _init_backend() + _init_backend() + # Verify that we didn't crash and that the path is set, + # allowing the runtime check in _execute_duckdb_query to handle it gracefully. 
+ import m3.mcp_server + + # _db_path was removed, check behavior via internal getter or backend info + assert m3.mcp_server._get_db_path() == str( + Path("/fake/path.duckdb") + ) + assert m3.mcp_server._backend == "duckdb" @pytest.mark.skipif( not _bigquery_available(), reason="BigQuery dependencies not available" ) def test_backend_init_bigquery(self): - """Test BigQuery backend initialization.""" + """Test BigQuery backend initialization and client creation.""" + mock_ds = DatasetDefinition( + name="mock-ds", + bigquery_project_id="test-project", + bigquery_dataset_ids=["ds1"], + tags=["mimic"], + ) + with patch.dict( os.environ, {"M3_BACKEND": "bigquery", "M3_PROJECT_ID": "test-project"}, clear=True, ): - with patch("google.cloud.bigquery.Client") as mock_client: - mock_client.return_value = Mock() - _init_backend() - # If no exception raised, initialization succeeded - mock_client.assert_called_once_with(project="test-project") + with patch("m3.mcp_server.DatasetRegistry.get", return_value=mock_ds): + with patch("google.cloud.bigquery.Client") as mock_client: + mock_client.return_value = Mock() + _init_backend() + + # _init_backend no longer creates the client eagerly + mock_client.assert_not_called() + + # Call the internal getter to trigger creation + import m3.mcp_server + + client, project_id = m3.mcp_server._get_bq_client() + + assert project_id == "test-project" + mock_client.assert_called_once_with(project="test-project") + + # Second call should be cached (no new client init) + m3.mcp_server._get_bq_client() + mock_client.assert_called_once_with(project="test-project") def test_backend_init_invalid(self): """Test initialization with invalid backend.""" @@ -155,37 +195,46 @@ async def test_tools_via_client(self, test_db): clear=True, ): # Initialize backend - _init_backend() - - # Test via FastMCP client - async with Client(mcp) as client: - # Test execute_mimic_query tool - result = await client.call_tool( - "execute_mimic_query", - {"sql_query": 
"SELECT COUNT(*) as count FROM icu_icustays"}, - ) - result_text = str(result) - assert "count" in result_text - assert "2" in result_text - - # Test get_icu_stays tool - result = await client.call_tool( - "get_icu_stays", {"patient_id": 10000032, "limit": 10} - ) - result_text = str(result) - assert "10000032" in result_text - - # Test get_lab_results tool - result = await client.call_tool( - "get_lab_results", {"patient_id": 10000032, "limit": 20} - ) - result_text = str(result) - assert "10000032" in result_text + # Mock DatasetRegistry to return a mimic dataset so tools work + mock_ds = DatasetDefinition(name="mimic-demo", tags=["mimic"]) + with patch("m3.mcp_server.DatasetRegistry.get", return_value=mock_ds): + with patch( + "m3.mcp_server.get_active_dataset", return_value="mimic-demo" + ): + _init_backend() - # Test get_database_schema tool - result = await client.call_tool("get_database_schema", {}) - result_text = str(result) - assert "icu_icustays" in result_text or "hosp_labevents" in result_text + # Test via FastMCP client + async with Client(mcp) as client: + # Test execute_mimic_query tool + result = await client.call_tool( + "execute_mimic_query", + {"sql_query": "SELECT COUNT(*) as count FROM icu_icustays"}, + ) + result_text = str(result) + assert "count" in result_text + assert "2" in result_text + + # Test get_icu_stays tool + result = await client.call_tool( + "get_icu_stays", {"patient_id": 10000032, "limit": 10} + ) + result_text = str(result) + assert "10000032" in result_text + + # Test get_lab_results tool + result = await client.call_tool( + "get_lab_results", {"patient_id": 10000032, "limit": 20} + ) + result_text = str(result) + assert "10000032" in result_text + + # Test get_database_schema tool + result = await client.call_tool("get_database_schema", {}) + result_text = str(result) + assert ( + "icu_icustays" in result_text + or "hosp_labevents" in result_text + ) @pytest.mark.asyncio async def test_security_checks(self, test_db): @@ 
-297,53 +346,74 @@ async def test_oauth2_authentication_required(self, test_db): class TestBigQueryIntegration: """Test BigQuery integration with mocks (no real API calls).""" + @pytest.fixture(autouse=True) + def reset_bq_cache(self): + """Reset the BigQuery client cache before each test.""" + import m3.mcp_server + + if hasattr(m3.mcp_server, "_bq_client_cache"): + m3.mcp_server._bq_client_cache = {"client": None, "project_id": None} + @pytest.mark.skipif( not _bigquery_available(), reason="BigQuery dependencies not available" ) @pytest.mark.asyncio async def test_bigquery_tools(self): """Test BigQuery tools functionality with mocks.""" + + # Mock Dataset definition for BigQuery + mock_ds = DatasetDefinition( + name="mimic-test", + bigquery_project_id="test-project", + bigquery_dataset_ids=["mimic_hosp", "mimic_icu"], + tags=["mimic"], + ) + with patch.dict( os.environ, {"M3_BACKEND": "bigquery", "M3_PROJECT_ID": "test-project"}, clear=True, ): - with patch("google.cloud.bigquery.Client") as mock_client: - # Mock BigQuery client and query results - mock_job = Mock() - mock_df = Mock() - mock_df.empty = False - mock_df.to_string.return_value = "Mock BigQuery result" - mock_df.__len__ = Mock(return_value=5) - mock_job.to_dataframe.return_value = mock_df - - mock_client_instance = Mock() - mock_client_instance.query.return_value = mock_job - mock_client.return_value = mock_client_instance - - _init_backend() + with patch("m3.mcp_server.DatasetRegistry.get", return_value=mock_ds): + with patch( + "m3.mcp_server.get_active_dataset", return_value="mimic-test" + ): + with patch("google.cloud.bigquery.Client") as mock_client: + # Mock BigQuery client and query results + mock_job = Mock() + mock_df = Mock() + mock_df.empty = False + mock_df.to_string.return_value = "Mock BigQuery result" + mock_df.__len__ = Mock(return_value=5) + mock_job.to_dataframe.return_value = mock_df + + mock_client_instance = Mock() + mock_client_instance.query.return_value = mock_job + 
mock_client.return_value = mock_client_instance - async with Client(mcp) as client: - # Test execute_mimic_query tool - result = await client.call_tool( - "execute_mimic_query", - { - "sql_query": "SELECT COUNT(*) FROM `physionet-data.mimiciv_3_1_icu.icustays`" - }, - ) - result_text = str(result) - assert "Mock BigQuery result" in result_text - - # Test get_race_distribution tool - result = await client.call_tool( - "get_race_distribution", {"limit": 5} - ) - result_text = str(result) - assert "Mock BigQuery result" in result_text + _init_backend() - # Verify BigQuery client was called - mock_client.assert_called_once_with(project="test-project") - assert mock_client_instance.query.called + async with Client(mcp) as client: + # Test execute_mimic_query tool + result = await client.call_tool( + "execute_mimic_query", + { + "sql_query": "SELECT COUNT(*) FROM `physionet-data.mimiciv_3_1_icu.icustays`" + }, + ) + result_text = str(result) + assert "Mock BigQuery result" in result_text + + # Test get_race_distribution tool + result = await client.call_tool( + "get_race_distribution", {"limit": 5} + ) + result_text = str(result) + assert "Mock BigQuery result" in result_text + + # Verify BigQuery client was called + mock_client.assert_called_once_with(project="test-project") + assert mock_client_instance.query.called class TestServerIntegration: