From f9179e1e79a0f2dd37c842345f5868d9406f9781 Mon Sep 17 00:00:00 2001
From: Sowmya Ingarsal <sowmyaingarsal@verily.health>
Date: Thu, 15 Jan 2026 22:49:15 +0000
Subject: [PATCH 1/6] Add BigQuery Monitoring Dashboard custom app

Features:
- Real-time BigQuery dataset and table monitoring
- Interactive visualizations (time series, histograms, box plots)
- Sample data viewing and CSV export
- Comprehensive data profiling using ydata-profiling
- Auto-refresh capability
- Data freshness indicators

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 .../.devcontainer.json                        |  38 ++
 src/bq-monitoring-dashboard/Dockerfile        |  38 ++
 src/bq-monitoring-dashboard/README.md         |  84 ++++
 src/bq-monitoring-dashboard/app.py            | 454 ++++++++++++++++++
 .../devcontainer-template.json                |  20 +
 .../docker-compose.yaml                       |  33 ++
 src/bq-monitoring-dashboard/requirements.txt  |   6 +
 7 files changed, 673 insertions(+)
 create mode 100644 src/bq-monitoring-dashboard/.devcontainer.json
 create mode 100644 src/bq-monitoring-dashboard/Dockerfile
 create mode 100644 src/bq-monitoring-dashboard/README.md
 create mode 100644 src/bq-monitoring-dashboard/app.py
 create mode 100644 src/bq-monitoring-dashboard/devcontainer-template.json
 create mode 100644 src/bq-monitoring-dashboard/docker-compose.yaml
 create mode 100644 src/bq-monitoring-dashboard/requirements.txt

diff --git a/src/bq-monitoring-dashboard/.devcontainer.json b/src/bq-monitoring-dashboard/.devcontainer.json
new file mode 100644
index 00000000..c26f52eb
--- /dev/null
+++ b/src/bq-monitoring-dashboard/.devcontainer.json
@@ -0,0 +1,38 @@
+{
+  "name": "bq-monitoring-dashboard",
+  "dockerComposeFile": "docker-compose.yaml",
+  "service": "app",
+  "shutdownAction": "none",
+  "workspaceFolder": "/workspace",
+  "postCreateCommand": [
+    "./startupscript/post-startup.sh",
+    "streamlit",
+    "/home/streamlit",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "postStartCommand": [
+    "./startupscript/remount-on-restart.sh",
+    "streamlit",
+    "/home/streamlit",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "features": {
+    "ghcr.io/devcontainers/features/java:1": {
+      "version": "17"
+    },
+    "ghcr.io/devcontainers/features/aws-cli:1": {},
+    "ghcr.io/dhoeric/features/google-cloud-cli:1": {}
+  },
+  "remoteUser": "root",
+  "customizations": {
+    "workbench": {
+      "opens": {
+        "extensions": [".py", ".csv", ".json", ".md"],
+        "fileUrlSuffix": "/",
+        "folderUrlSuffix": "/"
+      }
+    }
+  }
+}
diff --git a/src/bq-monitoring-dashboard/Dockerfile b/src/bq-monitoring-dashboard/Dockerfile
new file mode 100644
index 00000000..3bbadf2b
--- /dev/null
+++ b/src/bq-monitoring-dashboard/Dockerfile
@@ -0,0 +1,38 @@
+FROM python:3.11-slim
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    gcc \
+    g++ \
+    git \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first for better caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY app.py .
+
+# Create non-root user
+RUN useradd -m -u 1000 streamlit && \
+    chown -R streamlit:streamlit /app
+
+# Switch to non-root user
+USER streamlit
+
+# Expose Streamlit port
+EXPOSE 8501
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8501/_stcore/health || exit 1
+
+# Run Streamlit
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true", "--browser.gatherUsageStats=false"]
diff --git a/src/bq-monitoring-dashboard/README.md b/src/bq-monitoring-dashboard/README.md
new file mode 100644
index 00000000..0869aa3c
--- /dev/null
+++ b/src/bq-monitoring-dashboard/README.md
@@ -0,0 +1,84 @@
+# BigQuery Monitoring Dashboard
+
+A comprehensive real-time monitoring dashboard for BigQuery datasets with detailed data profiling capabilities.
+
+## Features
+
+### 📈 Overview Tab
+- **Key Metrics**: Total rows, table size, column count, table type
+- **Timestamps**: Creation and last modification dates
+- **Data Freshness Indicators**: Visual alerts for data age
+  - 🟢 Very fresh (< 1 hour)
+  - 🟡 Fresh (< 1 day)
+  - 🟠 Moderate (< 1 week)
+  - 🔴 Stale (> 1 week)
+
+### 📋 Sample Data Tab
+- View sample data from selected tables
+- Adjustable row count (10-1000 rows)
+- Download sample data as CSV
+
+### 📊 Visualizations Tab
+- **Time Series Analysis**: Automatic detection of timestamp columns
+- **Distribution Analysis**: Histograms for numeric columns
+- **Box Plots**: Statistical distribution visualization
+- **Categorical Analysis**: Top value frequency charts
+
+### 🔍 Detailed Data Characteristics Tab
+- Comprehensive data profiling using **ydata-profiling**
+- Includes:
+  - Dataset overview and statistics
+  - Variable type detection and analysis
+  - Correlation matrices (Pearson, Spearman, Kendall)
+  - Missing value analysis
+  - Interaction plots
+  - Sample data preview
+- Downloadable HTML reports
+
+## Configuration
+
+### Auto-refresh
+- Enable auto-refresh in the sidebar
+- Configurable interval (10-300 seconds)
+
+### Dataset & Table Selection
+- Dropdown selection for all available datasets
+- Dropdown selection for tables within selected dataset
+
+## Technical Details
+
+- **Framework**: Streamlit
+- **Port**: 8501
+- **Base Image**: Python 3.11-slim
+- **Key Dependencies**:
+  - `streamlit`: Web UI framework
+  - `google-cloud-bigquery`: BigQuery client
+  - `plotly`: Interactive visualizations
+  - `ydata-profiling`: Comprehensive data profiling
+  - `pandas`: Data manipulation
+
+## Usage in Workbench
+
+1. Create a custom app in Workbench
+2. Point to this repository
+3. Select "BigQuery Monitoring Dashboard" template
+4. Choose cloud provider (GCP recommended)
+5. Launch the app
+6. Access via Workbench UI
+
+## Authentication
+
+The app uses Google Cloud Application Default Credentials (ADC) automatically configured by Workbench. No manual authentication required.
+
+## Performance Notes
+
+- Dataset and table lists are cached for 5 minutes
+- Table metadata, sample queries, and time-series results are cached for 1 minute
+- Data profiling is limited to 10,000 rows for performance
+- Auto-refresh can be adjusted based on data update frequency
+
+## Requirements
+
+- Workbench workspace with BigQuery access
+- GCP project with enabled BigQuery API
+- Sufficient IAM permissions to query BigQuery datasets
diff --git a/src/bq-monitoring-dashboard/app.py b/src/bq-monitoring-dashboard/app.py
new file mode 100644
index 00000000..3acffac3
--- /dev/null
+++ b/src/bq-monitoring-dashboard/app.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+"""
+BigQuery Monitoring Dashboard
+Real-time monitoring of BigQuery datasets with data profiling
+"""
+
+import streamlit as st
+import pandas as pd
+from google.cloud import bigquery
+from google.auth import default
+import plotly.express as px
+import plotly.graph_objects as go
+from datetime import datetime, timedelta
+import time
+from ydata_profiling import ProfileReport
+import streamlit.components.v1 as components
+import tempfile
+import os
+
+# Page configuration
+st.set_page_config(
+    page_title="BigQuery Monitoring Dashboard",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# BigQuery client factory; st.cache_resource stores whatever this returns
+@st.cache_resource
+def get_bigquery_client():
+    """Return (client, project) built from google.auth.default(), or (None, None) if setup fails."""
+    try:
+        credentials, project = default()
+        client = bigquery.Client(credentials=credentials, project=project)
+        return client, project
+    except Exception as e:
+        st.error(f"Failed to initialize BigQuery client: {e}")
+        return None, None  # NOTE(review): this failure value is cached as well — confirm a retry path exists
+
+# Dataset listing, cached per `project` argument
+@st.cache_data(ttl=300)  # Cache for 5 minutes
+def get_datasets(project):
+    """Return the dataset IDs listed by the shared client, or [] when the client or the call fails."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return []
+        # `project` only keys the cache here; it is not passed to list_datasets()
+        datasets = list(client.list_datasets())
+        return [dataset.dataset_id for dataset in datasets]
+    except Exception as e:
+        st.error(f"Error fetching datasets: {e}")
+        return []
+
+# Table listing for one dataset, cached per (project, dataset_id)
+@st.cache_data(ttl=300)
+def get_tables(project, dataset_id):
+    """Return the table IDs in `dataset_id`, or [] when the client or the call fails."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return []
+        # Only dataset_id reaches list_tables(); `project` just keys the cache
+        tables = list(client.list_tables(dataset_id))
+        return [table.table_id for table in tables]
+    except Exception as e:
+        st.error(f"Error fetching tables: {e}")
+        return []
+
+# Table metadata for the Overview tab
+@st.cache_data(ttl=60)  # Cache for 1 minute
+def get_table_info(project, dataset_id, table_id):
+    """Return a dict of metadata for `project.dataset_id.table_id`, or None when the lookup fails."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return None
+
+        table_ref = f"{project}.{dataset_id}.{table_id}"
+        table = client.get_table(table_ref)
+
+        return {
+            "table_id": table_id,
+            "num_rows": table.num_rows or 0,  # Optional[int] in the client library; main() formats it with ':,'
+            "num_bytes": table.num_bytes or 0,  # Optional[int] as well; main() divides it to get MB
+            "created": table.created,
+            "modified": table.modified,
+            "schema_fields": len(table.schema),
+            "table_type": table.table_type
+        }
+    except Exception as e:
+        st.error(f"Error fetching table info: {e}")
+        return None
+
+# Sample query, cached for 60 seconds per (project, dataset_id, table_id, limit)
+@st.cache_data(ttl=60)
+def query_table_sample(project, dataset_id, table_id, limit=100):
+    """Run SELECT * ... LIMIT `limit` against the table; return a DataFrame, or None on failure."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return None
+        # No ORDER BY is issued, so the query itself does not fix which rows come back
+        query = f"""
+        SELECT *
+        FROM `{project}.{dataset_id}.{table_id}`
+        LIMIT {limit}
+        """
+        # NOTE(review): identifiers and limit are interpolated as-is; main() passes selectbox/slider values
+        df = client.query(query).to_dataframe()
+        return df
+    except Exception as e:
+        st.error(f"Error querying table: {e}")
+        return None
+
+# Per-day row counts over a date/time column (skipped when the table has none)
+@st.cache_data(ttl=60)
+def get_time_series_data(project, dataset_id, table_id, time_column=None):
+    """Return (df, time_column) with row counts for the 365 most recent dates, or None if unavailable."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return None
+
+        # Try to find a timestamp column if not provided
+        if not time_column:
+            table_ref = f"{project}.{dataset_id}.{table_id}"
+            table = client.get_table(table_ref)
+
+            # Candidates are chosen by field type, in schema order (names are not inspected)
+            time_columns = [field.name for field in table.schema
+                          if field.field_type in ['TIMESTAMP', 'DATE', 'DATETIME']]
+
+            if not time_columns:
+                return None
+
+            time_column = time_columns[0]
+
+        query = f"""
+        SELECT
+            DATE(`{time_column}`) as date,
+            COUNT(*) as count
+        FROM `{project}.{dataset_id}.{table_id}`
+        WHERE `{time_column}` IS NOT NULL
+        GROUP BY date
+        ORDER BY date DESC
+        LIMIT 365
+        """
+
+        df = client.query(query).to_dataframe()
+        return df, time_column
+    except Exception:
+        # Best effort: any lookup or query error yields None, and main() then omits the chart
+        return None
+
+# Profiling helper used by the "Detailed Data Characteristics" tab
+def generate_profiling_report(df, title="Data Profile"):
+    """Build a ProfileReport over at most the first 10,000 rows of `df`; return it, or None on error."""
+    try:
+        with st.spinner("Generating detailed data profile... This may take a moment."):
+            # Limit rows for profiling to prevent timeout
+            df_sample = df.head(10000) if len(df) > 10000 else df
+
+            profile = ProfileReport(
+                df_sample,
+                title=title,
+                minimal=False,
+                explorative=True,
+                progress_bar=False
+            )
+
+            return profile
+    except Exception as e:
+        st.error(f"Error generating profile: {e}")
+        return None
+
+# Main dashboard
+def main():
+    """Render the dashboard: sidebar controls followed by four content tabs.
+
+    Returns early, after showing a message, when no BigQuery client is
+    available or when the dataset or table list comes back empty.
+    """
+    st.title("📊 BigQuery Monitoring Dashboard")
+    st.markdown("Real-time monitoring of BigQuery datasets with detailed data profiling")
+
+    # Initialize client
+    client, project = get_bigquery_client()
+
+    if not client:
+        # Forget the cached (None, None) so the next rerun retries the credentials
+        get_bigquery_client.clear()
+        st.error("Cannot connect to BigQuery. Please check your credentials.")
+        return
+
+    st.success(f"Connected to project: **{project}**")
+
+    # Sidebar configuration
+    st.sidebar.header("Configuration")
+
+    # Auto-refresh settings
+    auto_refresh = st.sidebar.checkbox("Auto-refresh", value=False)
+    if auto_refresh:
+        refresh_interval = st.sidebar.slider(
+            "Refresh interval (seconds)",
+            min_value=10,
+            max_value=300,
+            value=30
+        )
+
+    # Dataset selection
+    datasets = get_datasets(project)
+
+    if not datasets:
+        st.warning("No datasets found in this project.")
+        return
+
+    selected_dataset = st.sidebar.selectbox(
+        "Select Dataset",
+        datasets,
+        index=0
+    )
+
+    # Table selection
+    tables = get_tables(project, selected_dataset)
+
+    if not tables:
+        st.warning(f"No tables found in dataset '{selected_dataset}'")
+        return
+
+    selected_table = st.sidebar.selectbox(
+        "Select Table",
+        tables,
+        index=0
+    )
+
+    # Display timestamp
+    st.sidebar.markdown("---")
+    st.sidebar.markdown(f"**Last updated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Main content area
+    tab1, tab2, tab3, tab4 = st.tabs([
+        "📈 Overview",
+        "📋 Sample Data",
+        "📊 Visualizations",
+        "🔍 Detailed Data Characteristics"
+    ])
+
+    # Tab 1: Overview
+    with tab1:
+        st.header(f"Table: {selected_dataset}.{selected_table}")
+
+        # Get table info
+        table_info = get_table_info(project, selected_dataset, selected_table)
+
+        if table_info:
+            # Display KPIs
+            col1, col2, col3, col4 = st.columns(4)
+
+            with col1:
+                st.metric(
+                    "Total Rows",
+                    f"{table_info['num_rows']:,}"
+                )
+
+            with col2:
+                size_mb = table_info['num_bytes'] / (1024 * 1024)
+                st.metric(
+                    "Size (MB)",
+                    f"{size_mb:,.2f}"
+                )
+
+            with col3:
+                st.metric(
+                    "Columns",
+                    table_info['schema_fields']
+                )
+
+            with col4:
+                st.metric(
+                    "Table Type",
+                    table_info['table_type']
+                )
+
+            # Display timestamps
+            st.markdown("---")
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.markdown(f"**Created:** {table_info['created'].strftime('%Y-%m-%d %H:%M:%S')}")
+
+            with col2:
+                st.markdown(f"**Last Modified:** {table_info['modified'].strftime('%Y-%m-%d %H:%M:%S')}")
+
+                # Data freshness indicator
+                time_since_modified = datetime.now(table_info['modified'].tzinfo) - table_info['modified']
+
+                if time_since_modified < timedelta(hours=1):
+                    st.success("🟢 Very fresh (< 1 hour)")
+                elif time_since_modified < timedelta(days=1):
+                    st.info("🟡 Fresh (< 1 day)")
+                elif time_since_modified < timedelta(days=7):
+                    st.warning("🟠 Moderate (< 1 week)")
+                else:
+                    st.error("🔴 Stale (> 1 week)")
+
+    # Tab 2: Sample Data
+    with tab2:
+        st.header("Sample Data")
+
+        sample_size = st.slider("Number of rows to display", 10, 1000, 100)
+
+        df = query_table_sample(project, selected_dataset, selected_table, limit=sample_size)
+
+        if df is not None and not df.empty:
+            st.dataframe(df, use_container_width=True, height=400)
+
+            # Download button
+            csv = df.to_csv(index=False)
+            st.download_button(
+                label="Download as CSV",
+                data=csv,
+                file_name=f"{selected_dataset}_{selected_table}_sample.csv",
+                mime="text/csv"
+            )
+        else:
+            st.warning("No data available")
+
+    # Tab 3: Visualizations
+    with tab3:
+        st.header("Data Visualizations")
+
+        df = query_table_sample(project, selected_dataset, selected_table, limit=1000)
+
+        if df is not None and not df.empty:
+            # Time series visualization
+            time_data = get_time_series_data(project, selected_dataset, selected_table)
+
+            if time_data:
+                time_df, time_column = time_data
+                st.subheader(f"Time Series: Records per Day (by {time_column})")
+
+                fig = px.line(
+                    time_df,
+                    x='date',
+                    y='count',
+                    title="Records over time",
+                    labels={'date': 'Date', 'count': 'Record Count'}
+                )
+                fig.update_layout(height=400)
+                st.plotly_chart(fig, use_container_width=True)
+
+            # Column statistics
+            st.subheader("Column Statistics")
+
+            numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
+
+            if numeric_columns:
+                selected_column = st.selectbox("Select numeric column", numeric_columns)
+
+                col1, col2 = st.columns(2)
+
+                with col1:
+                    # Histogram
+                    fig = px.histogram(
+                        df,
+                        x=selected_column,
+                        title=f"Distribution of {selected_column}",
+                        nbins=50
+                    )
+                    st.plotly_chart(fig, use_container_width=True)
+
+                with col2:
+                    # Box plot
+                    fig = px.box(
+                        df,
+                        y=selected_column,
+                        title=f"Box Plot of {selected_column}"
+                    )
+                    st.plotly_chart(fig, use_container_width=True)
+            else:
+                st.info("No numeric columns found for visualization")
+
+            # Categorical columns
+            categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
+
+            if categorical_columns:
+                st.subheader("Categorical Analysis")
+                selected_cat_column = st.selectbox("Select categorical column", categorical_columns)
+
+                # Value counts
+                value_counts = df[selected_cat_column].value_counts().head(10)
+
+                fig = px.bar(
+                    x=value_counts.index,
+                    y=value_counts.values,
+                    title=f"Top 10 values in {selected_cat_column}",
+                    labels={'x': selected_cat_column, 'y': 'Count'}
+                )
+                st.plotly_chart(fig, use_container_width=True)
+        else:
+            st.warning("No data available for visualization")
+
+    # Tab 4: Detailed Data Characteristics (ydata-profiling)
+    with tab4:
+        st.header("🔍 Detailed Data Characteristics")
+        st.markdown("Comprehensive data profiling using ydata-profiling")
+
+        # Option to generate profile
+        if st.button("Generate Detailed Profile Report", type="primary"):
+            df = query_table_sample(project, selected_dataset, selected_table, limit=10000)
+
+            if df is not None and not df.empty:
+                profile = generate_profiling_report(
+                    df,
+                    title=f"{selected_dataset}.{selected_table} - Data Profile"
+                )
+
+                if profile:
+                    # Render in memory: to_html() returns the whole report, so no
+                    # temporary file is written and none can be left behind on error
+                    html_content = profile.to_html()
+                    components.html(html_content, height=1000, scrolling=True)
+
+                    # Download button
+                    st.download_button(
+                        label="Download Profile Report",
+                        data=html_content,
+                        file_name=f"{selected_dataset}_{selected_table}_profile.html",
+                        mime="text/html"
+                    )
+            else:
+                st.warning("No data available for profiling")
+        else:
+            st.info("👆 Click the button above to generate a comprehensive data profile report")
+            st.markdown("""
+            The detailed profile report includes:
+            - **Overview**: Dataset statistics, variable types, warnings
+            - **Variables**: Detailed analysis of each column
+            - **Interactions**: Correlation matrices and scatter plots
+            - **Correlations**: Pearson, Spearman, Kendall correlations
+            - **Missing Values**: Analysis of missing data patterns
+            - **Sample**: First and last rows of the dataset
+
+            *Note: For large tables, only the first 10,000 rows are profiled to ensure performance.*
+            """)
+
+    # Auto-refresh logic
+    if auto_refresh:
+        time.sleep(refresh_interval)
+        st.rerun()
+
+if __name__ == "__main__":
+    main()
diff --git a/src/bq-monitoring-dashboard/devcontainer-template.json b/src/bq-monitoring-dashboard/devcontainer-template.json
new file mode 100644
index 00000000..076843c6
--- /dev/null
+++ b/src/bq-monitoring-dashboard/devcontainer-template.json
@@ -0,0 +1,20 @@
+{
+  "id": "bq-monitoring-dashboard",
+  "version": "1.0.0",
+  "name": "BigQuery Monitoring Dashboard",
+  "description": "Real-time BigQuery monitoring dashboard with data profiling. Displays dataset/table metrics, visualizations, and detailed data characteristics using ydata-profiling.",
+  "options": {
+    "cloud": {
+      "type": "string",
+      "enum": ["gcp", "aws"],
+      "default": "gcp",
+      "description": "Cloud provider (gcp or aws)"
+    },
+    "login": {
+      "type": "string",
+      "description": "Whether to log in to workbench CLI",
+      "proposals": ["true", "false"],
+      "default": "false"
+    }
+  }
+}
diff --git a/src/bq-monitoring-dashboard/docker-compose.yaml b/src/bq-monitoring-dashboard/docker-compose.yaml
new file mode 100644
index 00000000..12fdbab4
--- /dev/null
+++ b/src/bq-monitoring-dashboard/docker-compose.yaml
@@ -0,0 +1,33 @@
+version: "2.4"
+
+services:
+  app:
+    container_name: "application-server"
+    build:
+      context: .
+      dockerfile: Dockerfile
+    restart: always
+    volumes:
+      - .:/workspace:cached
+      - work:/home/streamlit/work
+    ports:
+      - 8501:8501
+    networks:
+      - app-network
+    # Required for gcsfuse (GCS bucket mounting)
+    cap_add:
+      - SYS_ADMIN
+    devices:
+      - /dev/fuse
+    security_opt:
+      - apparmor:unconfined
+    environment:
+      - GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp_credentials.json
+      - DEFAULT_WORKSPACE=/config
+
+volumes:
+  work:
+
+networks:
+  app-network:
+    external: true
diff --git a/src/bq-monitoring-dashboard/requirements.txt b/src/bq-monitoring-dashboard/requirements.txt
new file mode 100644
index 00000000..c3c90ca3
--- /dev/null
+++ b/src/bq-monitoring-dashboard/requirements.txt
@@ -0,0 +1,6 @@
+streamlit==1.31.0
+pandas==2.1.4
+google-cloud-bigquery==3.14.1
+google-auth==2.25.2
+plotly==5.18.0
+ydata-profiling==4.6.4

From 5f504f7487fd5a009d0a5094a8d668d1598a482b Mon Sep 17 00:00:00 2001
From: Sowmya Ingarsal <sowmyaingarsal@verily.health>
Date: Thu, 15 Jan 2026 23:08:53 +0000
Subject: [PATCH 2/6] Fix: Update volume mount to repository root for startup
 scripts

The volume mount needs to point to the repository root (../..) so that
the startup scripts at ./startupscript/ are accessible to the container.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/bq-monitoring-dashboard/docker-compose.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bq-monitoring-dashboard/docker-compose.yaml b/src/bq-monitoring-dashboard/docker-compose.yaml
index 12fdbab4..0b090d4d 100644
--- a/src/bq-monitoring-dashboard/docker-compose.yaml
+++ b/src/bq-monitoring-dashboard/docker-compose.yaml
@@ -8,7 +8,7 @@ services:
       dockerfile: Dockerfile
     restart: always
     volumes:
-      - .:/workspace:cached
+      - ../..:/workspace:cached
       - work:/home/streamlit/work
     ports:
       - 8501:8501

From 7b561750992464157fa2493091654031f36a3940 Mon Sep 17 00:00:00 2001
From: Sowmya Ingarsal <sowmyaingarsal@verily.health>
Date: Thu, 15 Jan 2026 23:23:27 +0000
Subject: [PATCH 3/6] Fix: Simplify Dockerfile and correct build context

Changes:
- Set build context to repository root for proper volume mounting
- Remove user switching to match remoteUser: root in devcontainer.json
- Add sudo for streamlit user
- Update CMD to reference app from /workspace path
- Fix requirements.txt copy path for new build context

This should resolve permission issues and path resolution problems.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/bq-monitoring-dashboard/Dockerfile        | 31 ++++++-------------
 .../docker-compose.yaml                       |  6 ++--
 2 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/src/bq-monitoring-dashboard/Dockerfile b/src/bq-monitoring-dashboard/Dockerfile
index 3bbadf2b..2d5adefe 100644
--- a/src/bq-monitoring-dashboard/Dockerfile
+++ b/src/bq-monitoring-dashboard/Dockerfile
@@ -1,38 +1,27 @@
 FROM python:3.11-slim
 
-# Set working directory
-WORKDIR /app
-
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     gcc \
     g++ \
     git \
     curl \
+    sudo \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements first for better caching
-COPY requirements.txt .
-
 # Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
+COPY src/bq-monitoring-dashboard/requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
 
-# Copy application code
-COPY app.py .
+# Create streamlit user with home directory
+RUN useradd -m -u 1000 -s /bin/bash streamlit && \
+    echo "streamlit ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
 
-# Create non-root user
-RUN useradd -m -u 1000 streamlit && \
-    chown -R streamlit:streamlit /app
-
-# Switch to non-root user
-USER streamlit
+# Set working directory
+WORKDIR /workspace
 
 # Expose Streamlit port
 EXPOSE 8501
 
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
-    CMD curl -f http://localhost:8501/_stcore/health || exit 1
-
-# Run Streamlit
-CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true", "--browser.gatherUsageStats=false"]
+# Run Streamlit from workspace
+CMD ["streamlit", "run", "/workspace/src/bq-monitoring-dashboard/app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true", "--browser.gatherUsageStats=false"]
diff --git a/src/bq-monitoring-dashboard/docker-compose.yaml b/src/bq-monitoring-dashboard/docker-compose.yaml
index 0b090d4d..d76adb57 100644
--- a/src/bq-monitoring-dashboard/docker-compose.yaml
+++ b/src/bq-monitoring-dashboard/docker-compose.yaml
@@ -4,11 +4,11 @@ services:
   app:
     container_name: "application-server"
     build:
-      context: .
-      dockerfile: Dockerfile
+      context: ../..
+      dockerfile: src/bq-monitoring-dashboard/Dockerfile
     restart: always
     volumes:
-      - ../..:/workspace:cached
+      - .:/workspace:cached
       - work:/home/streamlit/work
     ports:
       - 8501:8501

From 7fb353c4321c74b3f47166a8e7dbafbdfd39be27 Mon Sep 17 00:00:00 2001
From: Sowmya Ingarsal <sowmyaingarsal@verily.health>
Date: Thu, 15 Jan 2026 23:41:20 +0000
Subject: [PATCH 4/6] Add simplified test version without custom Dockerfile

This version uses python:3.11-slim pre-built image and installs
dependencies at runtime to help debug build issues.

Use this version if bq-monitoring-dashboard fails to build.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/bq-dashboard-simple/.devcontainer.json    |  29 ++
 src/bq-dashboard-simple/app.py                | 454 ++++++++++++++++++
 .../devcontainer-template.json                |  20 +
 src/bq-dashboard-simple/docker-compose.yaml   |  36 ++
 4 files changed, 539 insertions(+)
 create mode 100644 src/bq-dashboard-simple/.devcontainer.json
 create mode 100644 src/bq-dashboard-simple/app.py
 create mode 100644 src/bq-dashboard-simple/devcontainer-template.json
 create mode 100644 src/bq-dashboard-simple/docker-compose.yaml

diff --git a/src/bq-dashboard-simple/.devcontainer.json b/src/bq-dashboard-simple/.devcontainer.json
new file mode 100644
index 00000000..736e8b77
--- /dev/null
+++ b/src/bq-dashboard-simple/.devcontainer.json
@@ -0,0 +1,29 @@
+{
+  "name": "bq-dashboard-simple",
+  "dockerComposeFile": "docker-compose.yaml",
+  "service": "app",
+  "shutdownAction": "none",
+  "workspaceFolder": "/workspace",
+  "postCreateCommand": [
+    "./startupscript/post-startup.sh",
+    "root",
+    "/root",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "postStartCommand": [
+    "./startupscript/remount-on-restart.sh",
+    "root",
+    "/root",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "features": {
+    "ghcr.io/devcontainers/features/java:1": {
+      "version": "17"
+    },
+    "ghcr.io/devcontainers/features/aws-cli:1": {},
+    "ghcr.io/dhoeric/features/google-cloud-cli:1": {}
+  },
+  "remoteUser": "root"
+}
diff --git a/src/bq-dashboard-simple/app.py b/src/bq-dashboard-simple/app.py
new file mode 100644
index 00000000..3acffac3
--- /dev/null
+++ b/src/bq-dashboard-simple/app.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+"""
+BigQuery Monitoring Dashboard
+Real-time monitoring of BigQuery datasets with data profiling
+"""
+
+import streamlit as st
+import pandas as pd
+from google.cloud import bigquery
+from google.auth import default
+import plotly.express as px
+import plotly.graph_objects as go
+from datetime import datetime, timedelta
+import time
+from ydata_profiling import ProfileReport
+import streamlit.components.v1 as components
+import tempfile
+import os
+
+# Page configuration
+st.set_page_config(
+    page_title="BigQuery Monitoring Dashboard",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# BigQuery client factory (the result is memoized by st.cache_resource)
+@st.cache_resource
+def get_bigquery_client():
+    """Return (client, project) built from google.auth.default(); (None, None) after st.error on failure."""
+    try:
+        credentials, project = default()
+        client = bigquery.Client(credentials=credentials, project=project)
+        return client, project
+    except Exception as e:
+        st.error(f"Failed to initialize BigQuery client: {e}")
+        return None, None
+
+# NOTE(review): `project` is never passed to list_datasets(); it appears to serve only as the cache key - confirm
+@st.cache_data(ttl=300)  # Cache for 5 minutes
+def get_datasets(project):
+    """Return the dataset IDs listed by the shared client; [] if there is no client or the call fails."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return []
+
+        datasets = list(client.list_datasets())
+        return [dataset.dataset_id for dataset in datasets]
+    except Exception as e:
+        st.error(f"Error fetching datasets: {e}")
+        return []
+
+# List the tables of one dataset (`project` is not used in the lookup itself)
+@st.cache_data(ttl=300)
+def get_tables(project, dataset_id):
+    """Return the table IDs found in dataset_id; [] if there is no client or the call fails."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return []
+
+        tables = list(client.list_tables(dataset_id))
+        return [table.table_id for table in tables]
+    except Exception as e:
+        st.error(f"Error fetching tables: {e}")
+        return []
+
+# Get table metadata
+@st.cache_data(ttl=60)  # Cache for 1 minute
+def get_table_info(project, dataset_id, table_id):
+    """Return a dict of table metadata (counts, timestamps, type), or None after st.error on failure."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return None
+
+        table = client.get_table(f"{project}.{dataset_id}.{table_id}")
+
+        return {
+            "table_id": table_id,
+            # Optional in the client library; callers format with ',' and divide, so None must become 0
+            "num_rows": table.num_rows or 0,
+            "num_bytes": table.num_bytes or 0,
+            "created": table.created,
+            "modified": table.modified,
+            "schema_fields": len(table.schema),
+            "table_type": table.table_type
+        }
+    except Exception as e:
+        st.error(f"Error fetching table info: {e}")
+        return None
+
+# Query table data (results are cached per argument set for 60 seconds)
+@st.cache_data(ttl=60)
+def query_table_sample(project, dataset_id, table_id, limit=100):
+    """Return up to `limit` rows (SELECT * ... LIMIT) as a DataFrame, or None after st.error on failure."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return None
+
+        query = f"""
+        SELECT *
+        FROM `{project}.{dataset_id}.{table_id}`
+        LIMIT {limit}
+        """
+
+        df = client.query(query).to_dataframe()
+        return df
+    except Exception as e:
+        st.error(f"Error querying table: {e}")
+        return None
+
+# Daily row counts keyed on a TIMESTAMP/DATE/DATETIME column (if one exists)
+@st.cache_data(ttl=60)
+def get_time_series_data(project, dataset_id, table_id, time_column=None):
+    """Return (DataFrame with date/count columns, time_column) for the 365 latest dates, or None."""
+    try:
+        client, _ = get_bigquery_client()
+        if not client:
+            return None
+
+        # Auto-detect a time column from the schema when the caller gives none
+        if not time_column:
+            table_ref = f"{project}.{dataset_id}.{table_id}"
+            table = client.get_table(table_ref)
+
+            # The first schema field with a date-like type wins
+            time_columns = [field.name for field in table.schema
+                          if field.field_type in ['TIMESTAMP', 'DATE', 'DATETIME']]
+
+            if not time_columns:
+                return None
+
+            time_column = time_columns[0]
+        # Backticks keep reserved-word names valid; ordinals avoid an alias/column clash when the column is "date"
+        query = f"""
+        SELECT
+            DATE(`{time_column}`) as date,
+            COUNT(*) as count
+        FROM `{project}.{dataset_id}.{table_id}`
+        WHERE `{time_column}` IS NOT NULL
+        GROUP BY 1
+        ORDER BY 1 DESC
+        LIMIT 365
+        """
+
+        df = client.query(query).to_dataframe()
+        return df, time_column
+    except Exception:
+        # Deliberate best-effort: on any failure the caller simply skips the time-series chart
+        return None
+
+# Generate profiling report (shows a spinner while the report is built)
+def generate_profiling_report(df, title="Data Profile"):
+    """Build a ydata-profiling ProfileReport over at most the first 10,000 rows; None after st.error on failure."""
+    try:
+        with st.spinner("Generating detailed data profile... This may take a moment."):
+            # Limit rows for profiling to prevent timeout
+            df_sample = df.head(10000) if len(df) > 10000 else df
+
+            profile = ProfileReport(
+                df_sample,
+                title=title,
+                minimal=False,
+                explorative=True,
+                progress_bar=False
+            )
+
+            return profile
+    except Exception as e:
+        st.error(f"Error generating profile: {e}")
+        return None
+
+# Main dashboard
+def main():
+    st.title("📊 BigQuery Monitoring Dashboard")
+    st.markdown("Real-time monitoring of BigQuery datasets with detailed data profiling")
+
+    # Initialize client
+    client, project = get_bigquery_client()
+
+    if not client:
+        st.error("Cannot connect to BigQuery. Please check your credentials.")
+        return
+
+    st.success(f"Connected to project: **{project}**")
+
+    # Sidebar configuration
+    st.sidebar.header("Configuration")
+
+    # Auto-refresh settings
+    auto_refresh = st.sidebar.checkbox("Auto-refresh", value=False)
+    if auto_refresh:
+        refresh_interval = st.sidebar.slider(
+            "Refresh interval (seconds)",
+            min_value=10,
+            max_value=300,
+            value=30
+        )
+
+    # Dataset selection
+    datasets = get_datasets(project)
+
+    if not datasets:
+        st.warning("No datasets found in this project.")
+        return
+
+    selected_dataset = st.sidebar.selectbox(
+        "Select Dataset",
+        datasets,
+        index=0
+    )
+
+    # Table selection
+    tables = get_tables(project, selected_dataset)
+
+    if not tables:
+        st.warning(f"No tables found in dataset '{selected_dataset}'")
+        return
+
+    selected_table = st.sidebar.selectbox(
+        "Select Table",
+        tables,
+        index=0
+    )
+
+    # Display timestamp
+    st.sidebar.markdown("---")
+    st.sidebar.markdown(f"**Last updated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Main content area
+    tab1, tab2, tab3, tab4 = st.tabs([
+        "📈 Overview",
+        "📋 Sample Data",
+        "📊 Visualizations",
+        "🔍 Detailed Data Characteristics"
+    ])
+
+    # Tab 1: Overview
+    with tab1:
+        st.header(f"Table: {selected_dataset}.{selected_table}")
+
+        # Get table info
+        table_info = get_table_info(project, selected_dataset, selected_table)
+
+        if table_info:
+            # Display KPIs
+            col1, col2, col3, col4 = st.columns(4)
+
+            with col1:
+                st.metric(
+                    "Total Rows",
+                    f"{table_info['num_rows']:,}"
+                )
+
+            with col2:
+                size_mb = table_info['num_bytes'] / (1024 * 1024)
+                st.metric(
+                    "Size (MB)",
+                    f"{size_mb:,.2f}"
+                )
+
+            with col3:
+                st.metric(
+                    "Columns",
+                    table_info['schema_fields']
+                )
+
+            with col4:
+                st.metric(
+                    "Table Type",
+                    table_info['table_type']
+                )
+
+            # Display timestamps
+            st.markdown("---")
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.markdown(f"**Created:** {table_info['created'].strftime('%Y-%m-%d %H:%M:%S')}")
+
+            with col2:
+                st.markdown(f"**Last Modified:** {table_info['modified'].strftime('%Y-%m-%d %H:%M:%S')}")
+
+                # Data freshness indicator
+                time_since_modified = datetime.now(table_info['modified'].tzinfo) - table_info['modified']
+
+                if time_since_modified < timedelta(hours=1):
+                    st.success("🟢 Very fresh (< 1 hour)")
+                elif time_since_modified < timedelta(days=1):
+                    st.info("🟡 Fresh (< 1 day)")
+                elif time_since_modified < timedelta(days=7):
+                    st.warning("🟠 Moderate (< 1 week)")
+                else:
+                    st.error("🔴 Stale (> 1 week)")
+
+    # Tab 2: Sample Data
+    with tab2:
+        st.header("Sample Data")
+
+        sample_size = st.slider("Number of rows to display", 10, 1000, 100)
+
+        df = query_table_sample(project, selected_dataset, selected_table, limit=sample_size)
+
+        if df is not None and not df.empty:
+            st.dataframe(df, use_container_width=True, height=400)
+
+            # Download button
+            csv = df.to_csv(index=False)
+            st.download_button(
+                label="Download as CSV",
+                data=csv,
+                file_name=f"{selected_dataset}_{selected_table}_sample.csv",
+                mime="text/csv"
+            )
+        else:
+            st.warning("No data available")
+
+    # Tab 3: Visualizations
+    with tab3:
+        st.header("Data Visualizations")
+
+        df = query_table_sample(project, selected_dataset, selected_table, limit=1000)
+
+        if df is not None and not df.empty:
+            # Time series visualization
+            time_data = get_time_series_data(project, selected_dataset, selected_table)
+
+            if time_data:
+                time_df, time_column = time_data
+                st.subheader(f"Time Series: Records per Day (by {time_column})")
+
+                fig = px.line(
+                    time_df,
+                    x='date',
+                    y='count',
+                    title=f"Records over time",
+                    labels={'date': 'Date', 'count': 'Record Count'}
+                )
+                fig.update_layout(height=400)
+                st.plotly_chart(fig, use_container_width=True)
+
+            # Column statistics
+            st.subheader("Column Statistics")
+
+            numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
+
+            if numeric_columns:
+                selected_column = st.selectbox("Select numeric column", numeric_columns)
+
+                col1, col2 = st.columns(2)
+
+                with col1:
+                    # Histogram
+                    fig = px.histogram(
+                        df,
+                        x=selected_column,
+                        title=f"Distribution of {selected_column}",
+                        nbins=50
+                    )
+                    st.plotly_chart(fig, use_container_width=True)
+
+                with col2:
+                    # Box plot
+                    fig = px.box(
+                        df,
+                        y=selected_column,
+                        title=f"Box Plot of {selected_column}"
+                    )
+                    st.plotly_chart(fig, use_container_width=True)
+            else:
+                st.info("No numeric columns found for visualization")
+
+            # Categorical columns
+            categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
+
+            if categorical_columns:
+                st.subheader("Categorical Analysis")
+                selected_cat_column = st.selectbox("Select categorical column", categorical_columns)
+
+                # Value counts
+                value_counts = df[selected_cat_column].value_counts().head(10)
+
+                fig = px.bar(
+                    x=value_counts.index,
+                    y=value_counts.values,
+                    title=f"Top 10 values in {selected_cat_column}",
+                    labels={'x': selected_cat_column, 'y': 'Count'}
+                )
+                st.plotly_chart(fig, use_container_width=True)
+        else:
+            st.warning("No data available for visualization")
+
+    # Tab 4: Detailed Data Characteristics (ydata-profiling)
+    with tab4:
+        st.header("🔍 Detailed Data Characteristics")
+        st.markdown("Comprehensive data profiling using ydata-profiling")
+
+        # Option to generate profile
+        if st.button("Generate Detailed Profile Report", type="primary"):
+            df = query_table_sample(project, selected_dataset, selected_table, limit=10000)
+
+            if df is not None and not df.empty:
+                profile = generate_profiling_report(
+                    df,
+                    title=f"{selected_dataset}.{selected_table} - Data Profile"
+                )
+
+                if profile:
+                    # Save to temp file and display
+                    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html') as f:
+                        profile.to_file(f.name)
+
+                        # Read and display
+                        with open(f.name, 'r', encoding='utf-8') as html_file:
+                            html_content = html_file.read()
+                            components.html(html_content, height=1000, scrolling=True)
+
+                        # Download button
+                        st.download_button(
+                            label="Download Profile Report",
+                            data=html_content,
+                            file_name=f"{selected_dataset}_{selected_table}_profile.html",
+                            mime="text/html"
+                        )
+
+                        # Clean up
+                        os.unlink(f.name)
+            else:
+                st.warning("No data available for profiling")
+        else:
+            st.info("👆 Click the button above to generate a comprehensive data profile report")
+            st.markdown("""
+            The detailed profile report includes:
+            - **Overview**: Dataset statistics, variable types, warnings
+            - **Variables**: Detailed analysis of each column
+            - **Interactions**: Correlation matrices and scatter plots
+            - **Correlations**: Pearson, Spearman, Kendall correlations
+            - **Missing Values**: Analysis of missing data patterns
+            - **Sample**: First and last rows of the dataset
+
+            *Note: For large tables, only the first 10,000 rows are profiled to ensure performance.*
+            """)
+
+    # Auto-refresh logic
+    if auto_refresh:
+        time.sleep(refresh_interval)
+        st.rerun()
+
+if __name__ == "__main__":
+    main()
diff --git a/src/bq-dashboard-simple/devcontainer-template.json b/src/bq-dashboard-simple/devcontainer-template.json
new file mode 100644
index 00000000..f82621b9
--- /dev/null
+++ b/src/bq-dashboard-simple/devcontainer-template.json
@@ -0,0 +1,20 @@
+{
+  "id": "bq-dashboard-simple",
+  "version": "1.0.0",
+  "name": "BigQuery Dashboard (Simple Test)",
+  "description": "Simplified version for testing - installs dependencies at runtime",
+  "options": {
+    "cloud": {
+      "type": "string",
+      "enum": ["gcp", "aws"],
+      "default": "gcp",
+      "description": "Cloud provider (gcp or aws)"
+    },
+    "login": {
+      "type": "string",
+      "description": "Whether to log in to workbench CLI",
+      "proposals": ["true", "false"],
+      "default": "false"
+    }
+  }
+}
diff --git a/src/bq-dashboard-simple/docker-compose.yaml b/src/bq-dashboard-simple/docker-compose.yaml
new file mode 100644
index 00000000..7eb3e86e
--- /dev/null
+++ b/src/bq-dashboard-simple/docker-compose.yaml
@@ -0,0 +1,36 @@
+version: "2.4"
+
+services:
+  app:
+    container_name: "application-server"
+    image: "python:3.11-slim"  # stock image; dependencies are installed by `command` at start-up
+    restart: always
+    working_dir: /workspace
+    command: >
+      bash -c "
+      apt-get update && apt-get install -y curl gcc g++ git sudo &&
+      pip install --no-cache-dir streamlit pandas google-cloud-bigquery google-auth plotly ydata-profiling &&
+      streamlit run /workspace/src/bq-dashboard-simple/app.py --server.port=8501 --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false
+      "
+    volumes:
+      - .:/workspace:cached
+      - work:/root/work  # container user is root with home /root (see .devcontainer.json)
+    ports:
+      - "8501:8501"  # quoted so the mapping is always read as a string
+    networks:
+      - app-network
+    cap_add:
+      - SYS_ADMIN
+    devices:
+      - /dev/fuse
+    security_opt:
+      - apparmor:unconfined
+    environment:
+      - GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp_credentials.json
+
+volumes:
+  work:
+
+networks:
+  app-network:
+    external: true

From d78e3bdc1bfbf095888a6a2c713f01df80f87312 Mon Sep 17 00:00:00 2001
From: Sowmya Ingarsal <sowmyaingarsal@verily.health>
Date: Fri, 16 Jan 2026 02:22:39 +0000
Subject: [PATCH 5/6] Add SQL Query Executor app

Simple BigQuery SQL interface for executing queries in Workbench.

Features:
- Text editor for SQL queries
- Execute SELECT statements
- View results in interactive table
- Query statistics (bytes processed, slot time)
- CSV download
- Schema viewer
- Helper queries to list datasets and tables
- Support for cross-project queries (shared datasets)

Uses runtime dependency installation to avoid build issues.
Simple configuration based on working example app.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/sql-query-executor/.devcontainer.json     |  29 +++
 src/sql-query-executor/README.md              |  77 ++++++
 src/sql-query-executor/app.py                 | 238 ++++++++++++++++++
 .../devcontainer-template.json                |  20 ++
 src/sql-query-executor/docker-compose.yaml    |  35 +++
 5 files changed, 399 insertions(+)
 create mode 100644 src/sql-query-executor/.devcontainer.json
 create mode 100644 src/sql-query-executor/README.md
 create mode 100644 src/sql-query-executor/app.py
 create mode 100644 src/sql-query-executor/devcontainer-template.json
 create mode 100644 src/sql-query-executor/docker-compose.yaml

diff --git a/src/sql-query-executor/.devcontainer.json b/src/sql-query-executor/.devcontainer.json
new file mode 100644
index 00000000..ab5895b7
--- /dev/null
+++ b/src/sql-query-executor/.devcontainer.json
@@ -0,0 +1,29 @@
+{
+  "name": "sql-query-executor",
+  "dockerComposeFile": "docker-compose.yaml",
+  "service": "app",
+  "shutdownAction": "none",
+  "workspaceFolder": "/workspace",
+  "postCreateCommand": [
+    "./startupscript/post-startup.sh",
+    "root",
+    "/root",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "postStartCommand": [
+    "./startupscript/remount-on-restart.sh",
+    "root",
+    "/root",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "features": {
+    "ghcr.io/devcontainers/features/java:1": {
+      "version": "17"
+    },
+    "ghcr.io/devcontainers/features/aws-cli:1": {},
+    "ghcr.io/dhoeric/features/google-cloud-cli:1": {}
+  },
+  "remoteUser": "root"
+}
diff --git a/src/sql-query-executor/README.md b/src/sql-query-executor/README.md
new file mode 100644
index 00000000..899b775a
--- /dev/null
+++ b/src/sql-query-executor/README.md
@@ -0,0 +1,77 @@
+# SQL Query Executor
+
+A simple web interface to execute SQL queries against BigQuery datasets in your Workbench workspace.
+
+## Features
+
+- **SQL Text Editor**: Write and execute SELECT queries
+- **Query Results**: View results in an interactive table
+- **Query Statistics**: See bytes processed, slot time, and row count
+- **CSV Export**: Download query results
+- **Schema Viewer**: Inspect column types and sample values
+- **Helper Queries**: Quick buttons to list datasets and tables
+
+## Usage
+
+### Basic Query
+```sql
+SELECT *
+FROM `project.dataset.table`
+LIMIT 100
+```
+
+### Fully Qualified Table Names
+When querying datasets from other projects (shared datasets), use the full path:
+```sql
+SELECT *
+FROM `source-project.dataset.table`
+LIMIT 100
+```
+
+### List Your Datasets
+```sql
+SELECT schema_name as dataset
+FROM `your-project.INFORMATION_SCHEMA.SCHEMATA`
+ORDER BY schema_name
+```
+
+### Show Tables in a Dataset
+```sql
+SELECT
+  dataset_id as dataset,
+  table_id as table_name,
+  type as table_type,
+  row_count
+FROM `your-project.dataset.__TABLES__`
+ORDER BY table_id
+```
+
+## Query Features Supported
+
+- ✅ SELECT statements
+- ✅ JOINs across tables
+- ✅ Aggregations (COUNT, SUM, AVG, etc.)
+- ✅ GROUP BY and ORDER BY
+- ✅ WHERE clauses
+- ✅ CTEs (WITH clauses)
+- ✅ Subqueries
+- ✅ Cross-project queries
+
+## Technical Details
+
+- **Framework**: Streamlit
+- **Port**: 8501
+- **Base Image**: python:3.11-slim
+- **Dependencies**: streamlit, pandas, google-cloud-bigquery, google-auth
+
+## Authentication
+
+Uses Google Cloud Application Default Credentials (ADC) automatically configured by Workbench.
+
+## Tips
+
+1. **Always use LIMIT** to avoid processing large datasets
+2. **Use backticks** around table names: `` `project.dataset.table` ``
+3. **Check bytes processed** before running expensive queries
+4. **Download results** as CSV for further analysis
+5. **Use the sidebar helpers** to explore your datasets
diff --git a/src/sql-query-executor/app.py b/src/sql-query-executor/app.py
new file mode 100644
index 00000000..fae12428
--- /dev/null
+++ b/src/sql-query-executor/app.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+"""
+Simple BigQuery SQL Query Executor
+Execute SQL queries against your Workbench datasets
+"""
+
+import streamlit as st
+import pandas as pd
+from google.cloud import bigquery
+from google.auth import default
+import re
+
+# Page configuration
+st.set_page_config(
+    page_title="SQL Query Executor",
+    page_icon="🔍",
+    layout="wide"
+)
+
+# BigQuery client factory (the result is memoized by st.cache_resource)
+@st.cache_resource
+def get_bigquery_client():
+    """Return (client, project) built from google.auth.default(); (None, None) after st.error on failure."""
+    try:
+        credentials, project = default()
+        client = bigquery.Client(credentials=credentials, project=project)
+        return client, project
+    except Exception as e:
+        st.error(f"Failed to initialize BigQuery client: {e}")
+        return None, None
+
+def execute_query(client, query):
+    """Run `query`; return (DataFrame, stats dict, None) on success or (None, None, error text) on failure."""
+    try:
+        # Run the query
+        query_job = client.query(query)
+
+        # Get results
+        results = query_job.result()
+        df = results.to_dataframe()
+
+        # Job counters are Optional and can be None; report 0 so callers can do arithmetic on them
+        stats = {
+            "bytes_processed": query_job.total_bytes_processed or 0,
+            "bytes_billed": query_job.total_bytes_billed or 0,
+            "slot_time": query_job.slot_millis or 0,
+            "rows_returned": len(df)
+        }
+
+        return df, stats, None
+    except Exception as e:
+        return None, None, str(e)
+
+def format_bytes(bytes_val):
+    """Format a byte count as a two-decimal string in B..TB (1024-based); None or 0 gives '0 B'."""
+    if not bytes_val:
+        return "0 B"
+
+    units = ["B", "KB", "MB", "GB", "TB"]
+    i = 0
+    while bytes_val >= 1024 and i < len(units) - 1:
+        bytes_val /= 1024.0
+        i += 1
+
+    return f"{bytes_val:.2f} {units[i]}"
+
+# Main app
+def main():
+    """Render the page: sidebar help and example queries, the SQL editor, and stats/results for an executed query."""
+    st.title("🔍 SQL Query Executor")
+    st.markdown("Execute SQL queries against BigQuery datasets in your Workbench workspace")
+
+    # Initialize client
+    client, project = get_bigquery_client()
+
+    if not client:
+        st.error("Cannot connect to BigQuery. Please check your credentials.")
+        return
+
+    st.success(f"Connected to project: **{project}**")
+
+    # Sidebar with help
+    with st.sidebar:
+        st.header("Quick Help")
+
+        st.markdown("### Query Format")
+        st.code("""
+SELECT *
+FROM `project.dataset.table`
+LIMIT 100
+        """, language="sql")
+
+        st.markdown("### Your Project")
+        st.info(f"**{project}**")
+
+        st.markdown("### Tips")
+        st.markdown("""
+- Use backticks `` for table names
+- Include LIMIT to avoid large results
+- Use fully qualified names:
+  `project.dataset.table`
+- Press Ctrl+Enter to apply edits, then click Execute Query
+        """)
+
+        st.markdown("### Examples")
+
+        if st.button("📊 List All Datasets"):
+            st.session_state['query'] = f"""
+SELECT schema_name as dataset
+FROM `{project}.INFORMATION_SCHEMA.SCHEMATA`
+ORDER BY schema_name
+            """.strip()
+
+        if st.button("📋 Show Dataset Tables"):
+            st.session_state['query'] = f"""
+SELECT
+  dataset_id as dataset,
+  table_id as table_name,
+  type as table_type,
+  TIMESTAMP_MILLIS(creation_time) as created,
+  row_count,
+  size_bytes
+FROM `{project}.DATASET.__TABLES__`
+ORDER BY dataset_id, table_id
+            """.strip()
+
+    # Main query area
+    st.header("SQL Query")
+
+    # Get query from session state or default
+    default_query = st.session_state.get('query', f"""
+SELECT *
+FROM `{project}.DATASET.TABLE`
+LIMIT 100
+    """.strip())
+
+    query = st.text_area(
+        "Enter your SQL query:",
+        value=default_query,
+        height=200,
+        key="sql_input"
+    )
+
+    # Store query in session state
+    st.session_state['query'] = query
+
+    col1, col2, col3 = st.columns([1, 1, 4])
+
+    with col1:
+        execute_button = st.button("▶️ Execute Query", type="primary", use_container_width=True)
+
+    with col2:
+        if st.button("🗑️ Clear", use_container_width=True):
+            st.session_state['query'] = ""
+            st.rerun()
+
+    # Execute query
+    if execute_button and query.strip():
+        with st.spinner("Executing query..."):
+            df, stats, error = execute_query(client, query)
+
+            if error:
+                st.error(f"Query failed: {error}")
+            else:
+                # Show stats
+                st.subheader("Query Statistics")
+                col1, col2, col3, col4 = st.columns(4)
+
+                with col1:
+                    st.metric("Rows Returned", f"{stats['rows_returned']:,}")
+
+                with col2:
+                    st.metric("Bytes Processed", format_bytes(stats['bytes_processed']))
+
+                with col3:
+                    st.metric("Bytes Billed", format_bytes(stats['bytes_billed']))
+
+                with col4:
+                    slot_seconds = (stats['slot_time'] or 0) / 1000.0  # slot time may be missing
+                    st.metric("Slot Time", f"{slot_seconds:.2f}s")
+
+                # Show results
+                st.subheader("Query Results")
+
+                if len(df) > 0:
+                    # Display dataframe
+                    st.dataframe(df, use_container_width=True, height=400)
+
+                    # Download button
+                    csv = df.to_csv(index=False)
+                    st.download_button(
+                        label="📥 Download CSV",
+                        data=csv,
+                        file_name="query_results.csv",
+                        mime="text/csv"
+                    )
+
+                    # Show schema
+                    with st.expander("📋 Show Schema"):
+                        schema_df = pd.DataFrame({
+                            "Column": df.columns,
+                            "Type": [str(dtype) for dtype in df.dtypes],
+                            "Sample": [df[col].iloc[0] if len(df) > 0 else None for col in df.columns]
+                        })
+                        st.dataframe(schema_df, use_container_width=True)
+                else:
+                    st.info("Query executed successfully but returned no rows.")
+
+    elif execute_button:
+        st.warning("Please enter a SQL query.")
+
+    st.markdown("---")
+
+    with st.expander("ℹ️ About"):
+        st.markdown("""
+        ### BigQuery SQL Query Executor
+
+        This app allows you to execute SQL queries against BigQuery datasets in your Workbench workspace.
+
+        **Features:**
+        - Execute any SELECT query
+        - View query statistics (bytes processed, slot time)
+        - Download results as CSV
+        - View schema information
+
+        **Supported:**
+        - All standard SQL queries
+        - Queries across datasets and projects
+        - JOIN operations
+        - Aggregations and GROUP BY
+        - CTEs (WITH clauses)
+
+        **Authentication:**
+        Uses Google Cloud Application Default Credentials configured by Workbench.
+        """)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/sql-query-executor/devcontainer-template.json b/src/sql-query-executor/devcontainer-template.json
new file mode 100644
index 00000000..db96e672
--- /dev/null
+++ b/src/sql-query-executor/devcontainer-template.json
@@ -0,0 +1,20 @@
+{
+  "id": "sql-query-executor",
+  "version": "1.0.0",
+  "name": "SQL Query Executor",
+  "description": "Simple BigQuery SQL query interface. Execute SELECT queries, view results, and download as CSV.",
+  "options": {
+    "cloud": {
+      "type": "string",
+      "enum": ["gcp", "aws"],
+      "default": "gcp",
+      "description": "Cloud provider (gcp or aws)"
+    },
+    "login": {
+      "type": "string",
+      "description": "Whether to log in to workbench CLI",
+      "proposals": ["true", "false"],
+      "default": "false"
+    }
+  }
+}
diff --git a/src/sql-query-executor/docker-compose.yaml b/src/sql-query-executor/docker-compose.yaml
new file mode 100644
index 00000000..14ea65a5
--- /dev/null
+++ b/src/sql-query-executor/docker-compose.yaml
@@ -0,0 +1,35 @@
+version: "2.4"
+
+services:
+  app:
+    container_name: "application-server"
+    image: "python:3.11-slim"
+    restart: always
+    working_dir: /workspace
+    command: >
+      bash -c "
+      apt-get update &&
+      apt-get install -y curl git &&
+      pip install --no-cache-dir streamlit pandas google-cloud-bigquery google-auth &&
+      streamlit run /workspace/src/sql-query-executor/app.py --server.port=8501 --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false
+      "
+    volumes:
+      - .:/workspace:cached
+      - work:/root/work
+    ports:
+      - 8501:8501
+    networks:
+      - app-network
+    cap_add:
+      - SYS_ADMIN
+    devices:
+      - /dev/fuse
+    security_opt:
+      - apparmor:unconfined
+
+volumes:
+  work:
+
+networks:
+  app-network:
+    external: true

From 4fd22d16527ff710721451bf1729dfd9b3bbded6 Mon Sep 17 00:00:00 2001
From: Sowmya Ingarsal <sowmyaingarsal@verily.health>
Date: Fri, 16 Jan 2026 02:37:26 +0000
Subject: [PATCH 6/6] Add SQL Query Tool based on working example app

This app uses the EXACT structure of the working 'example' app but
runs a SQL query interface instead of Jupyter.

Key points:
- Copied from src/example/ (proven to work in Workbench)
- Same port (8888), same network config, same startup scripts
- Only change: runs Streamlit SQL app instead of Jupyter
- Runtime dependency installation (no Dockerfile build)

Since the example app deploys successfully, this should too.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/sql-jupyter/.devcontainer.json         |  29 +++
 src/sql-jupyter/README.md                  |  37 ++++
 src/sql-jupyter/app.py                     | 238 +++++++++++++++++++++
 src/sql-jupyter/devcontainer-template.json |  20 ++
 src/sql-jupyter/docker-compose.yaml        |  33 +++
 5 files changed, 357 insertions(+)
 create mode 100644 src/sql-jupyter/.devcontainer.json
 create mode 100644 src/sql-jupyter/README.md
 create mode 100644 src/sql-jupyter/app.py
 create mode 100644 src/sql-jupyter/devcontainer-template.json
 create mode 100644 src/sql-jupyter/docker-compose.yaml

diff --git a/src/sql-jupyter/.devcontainer.json b/src/sql-jupyter/.devcontainer.json
new file mode 100644
index 00000000..42e886c5
--- /dev/null
+++ b/src/sql-jupyter/.devcontainer.json
@@ -0,0 +1,29 @@
+{
+  "name": "sql-jupyter",
+  "dockerComposeFile": "docker-compose.yaml",
+  "service": "app",
+  "shutdownAction": "none",
+  "workspaceFolder": "/workspace",
+  "postCreateCommand": [
+    "./startupscript/post-startup.sh",
+    "root",
+    "/root",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "postStartCommand": [
+    "./startupscript/remount-on-restart.sh",
+    "root",
+    "/root",
+    "${templateOption:cloud}",
+    "${templateOption:login}"
+  ],
+  "features": {
+    "ghcr.io/devcontainers/features/java:1": {
+      "version": "17"
+    },
+    "ghcr.io/devcontainers/features/aws-cli:1": {},
+    "ghcr.io/dhoeric/features/google-cloud-cli:1": {}
+  },
+  "remoteUser": "root"
+}
diff --git a/src/sql-jupyter/README.md b/src/sql-jupyter/README.md
new file mode 100644
index 00000000..c5dd6cc8
--- /dev/null
+++ b/src/sql-jupyter/README.md
@@ -0,0 +1,37 @@
+# SQL Query Tool (Based on Working Example)
+
+This app is based on the **proven working `example` app structure** but modified to run a SQL query interface.
+
+## Why This Should Work
+
+- ✅ Uses the **exact same devcontainer structure** as the working example app
+- ✅ Same port (8888)
+- ✅ Same network configuration
+- ✅ Same startup scripts
+- ✅ Only difference: runs SQL query interface instead of Jupyter
+
+## Features
+
+- Simple SQL text editor
+- Execute SELECT queries against BigQuery
+- View results in interactive table
+- Download results as CSV
+- Query statistics (bytes processed, rows returned)
+
+## Usage
+
+1. Open the app after deployment
+2. Write your SQL query:
+   ```sql
+   SELECT * FROM `project.dataset.table` LIMIT 100
+   ```
+3. Click "Execute Query"
+4. View results and download if needed
+
+## First-Time Startup
+
+The first time you start this app, it will take **2-3 minutes** to install dependencies (streamlit, pandas, BigQuery client). Subsequent restarts will be faster.
+
+## Authentication
+
+Uses the same Google Cloud credentials as the example app - no configuration needed!
diff --git a/src/sql-jupyter/app.py b/src/sql-jupyter/app.py
new file mode 100644
index 00000000..fae12428
--- /dev/null
+++ b/src/sql-jupyter/app.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+"""
+Simple BigQuery SQL Query Executor
+Execute SQL queries against your Workbench datasets
+"""
+
+import streamlit as st
+import pandas as pd
+from google.cloud import bigquery
+from google.auth import default
+import re
+
+# Page configuration
+st.set_page_config(
+    page_title="SQL Query Executor",
+    page_icon="🔍",
+    layout="wide"
+)
+
+# Initialize BigQuery client
+@st.cache_resource
+def get_bigquery_client():
+    """Return ``(client, project)`` built from default credentials, or ``(None, None)`` on failure."""
+    try:
+        adc_credentials, adc_project = default()
+        bq_client = bigquery.Client(credentials=adc_credentials, project=adc_project)
+    except Exception as exc:
+        st.error(f"Failed to initialize BigQuery client: {exc}")
+        return None, None  # NOTE(review): this failure tuple is cached as well — confirm that is intended
+    return bq_client, adc_project
+
+def execute_query(client, query):
+    """Run ``query`` and return ``(df, stats, None)``, or ``(None, None, message)`` on any error."""
+    try:
+        # The SQL is passed through unchanged; no statement-type filtering happens here.
+        query_job = client.query(query)
+
+        # Wait for the job to finish, then load all rows into a DataFrame.
+        results = query_job.result()
+        df = results.to_dataframe()
+
+        # Job statistics may be None (e.g. a cache hit); report 0 so callers can format and divide them.
+        stats = {
+            "bytes_processed": query_job.total_bytes_processed or 0,
+            "bytes_billed": query_job.total_bytes_billed or 0,
+            "slot_time": query_job.slot_millis or 0,
+            "rows_returned": len(df)
+        }
+
+        return df, stats, None
+    except Exception as e:
+        return None, None, str(e)
+
+def format_bytes(bytes_val):
+    """Format a byte count as a human-readable string; ``None`` and 0 both give "0 B"."""
+    if not bytes_val:
+        return "0 B"
+
+    units = ["B", "KB", "MB", "GB", "TB"]
+    i = 0
+    while bytes_val >= 1024 and i < len(units) - 1:
+        bytes_val /= 1024.0
+        i += 1
+
+    return f"{bytes_val:.2f} {units[i]}"
+
+# Main app
+def main():
+    """Render the page: connection status, sidebar help, SQL editor, and query results."""
+    st.title("🔍 SQL Query Executor")
+    st.markdown("Execute SQL queries against BigQuery datasets in your Workbench workspace")
+
+    # Initialize client
+    client, project = get_bigquery_client()
+
+    if not client:
+        st.error("Cannot connect to BigQuery. Please check your credentials.")
+        return
+
+    st.success(f"Connected to project: **{project}**")
+
+    # Sidebar with help; the example buttons write straight into the editor's widget key
+    with st.sidebar:
+        st.header("Quick Help")
+
+        st.markdown("### Query Format")
+        st.code("""
+SELECT *
+FROM `project.dataset.table`
+LIMIT 100
+        """, language="sql")
+
+        st.markdown("### Your Project")
+        st.info(f"**{project}**")
+
+        st.markdown("### Tips")
+        st.markdown("""
+- Use backticks `` for table names
+- Include LIMIT to avoid large results
+- Use fully qualified names:
+  `project.dataset.table`
+- Press Ctrl+Enter to apply edits, then click Execute Query
+        """)
+
+        st.markdown("### Examples")
+
+        if st.button("📊 List All Datasets"):
+            st.session_state['sql_input'] = f"""
+SELECT schema_name as dataset
+FROM `{project}.INFORMATION_SCHEMA.SCHEMATA`
+ORDER BY schema_name
+            """.strip()
+
+        if st.button("📋 Show Dataset Tables"):
+            st.session_state['sql_input'] = f"""
+SELECT
+  dataset_id as dataset,
+  table_id as table_name,
+  type as table_type,
+  TIMESTAMP_MILLIS(creation_time) as created,
+  row_count,
+  size_bytes
+FROM `{project}.DATASET.__TABLES__`
+ORDER BY dataset_id, table_id
+            """.strip()
+
+    # Main query area
+    st.header("SQL Query")
+
+    # Seed the editor once; afterwards the widget key "sql_input" is the single
+    # source of truth, so the sidebar examples and Clear can overwrite its text.
+    if 'sql_input' not in st.session_state:
+        st.session_state['sql_input'] = f"""
+SELECT *
+FROM `{project}.DATASET.TABLE`
+LIMIT 100
+        """.strip()
+
+    query = st.text_area(
+        "Enter your SQL query:",
+        height=200,
+        key="sql_input"
+    )
+
+    col1, col2, col3 = st.columns([1, 1, 4])
+
+    with col1:
+        execute_button = st.button("▶️ Execute Query", type="primary", use_container_width=True)
+
+    with col2:
+        # A widget's value can only be reset from a callback once it has been rendered.
+        st.button("🗑️ Clear", use_container_width=True,
+                  on_click=lambda: st.session_state.update(sql_input=""))
+
+    # Execute query
+    if execute_button and query.strip():
+        with st.spinner("Executing query..."):
+            df, stats, error = execute_query(client, query)
+
+            if error:
+                st.error(f"Query failed: {error}")
+            else:
+                # Show stats
+                st.subheader("Query Statistics")
+                col1, col2, col3, col4 = st.columns(4)
+
+                with col1:
+                    st.metric("Rows Returned", f"{stats['rows_returned']:,}")
+
+                with col2:
+                    st.metric("Bytes Processed", format_bytes(stats['bytes_processed'] or 0))
+
+                with col3:
+                    st.metric("Bytes Billed", format_bytes(stats['bytes_billed'] or 0))
+
+                with col4:
+                    # Slot time may be missing (e.g. a cache hit); treat that as zero.
+                    slot_seconds = (stats['slot_time'] or 0) / 1000.0
+                    st.metric("Slot Time", f"{slot_seconds:.2f}s")
+
+                # Show results
+                st.subheader("Query Results")
+
+                if len(df) > 0:
+                    # Display dataframe
+                    st.dataframe(df, use_container_width=True, height=400)
+
+                    # Download button
+                    csv = df.to_csv(index=False)
+                    st.download_button(
+                        label="📥 Download CSV",
+                        data=csv,
+                        file_name="query_results.csv",
+                        mime="text/csv"
+                    )
+
+                    # Show schema
+                    with st.expander("📋 Show Schema"):
+                        schema_df = pd.DataFrame({
+                            "Column": df.columns,
+                            "Type": [str(dtype) for dtype in df.dtypes],
+                            "Sample": [df[col].iloc[0] if len(df) > 0 else None for col in df.columns]
+                        })
+                        st.dataframe(schema_df, use_container_width=True)
+                else:
+                    st.info("Query executed successfully but returned no rows.")
+
+    elif execute_button:
+        st.warning("Please enter a SQL query.")
+
+    # Footer: divider followed by a static "About" panel
+    st.markdown("---")
+
+    with st.expander("ℹ️ About"):
+        st.markdown("""
+        ### BigQuery SQL Query Executor
+
+        This app allows you to execute SQL queries against BigQuery datasets in your Workbench workspace.
+
+        **Features:**
+        - Execute any SELECT query
+        - View query statistics (bytes processed, slot time)
+        - Download results as CSV
+        - View schema information
+
+        **Supported:**
+        - All standard SQL queries
+        - Queries across datasets and projects
+        - JOIN operations
+        - Aggregations and GROUP BY
+        - CTEs (WITH clauses)
+
+        **Authentication:**
+        Uses Google Cloud Application Default Credentials configured by Workbench.
+        """)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/sql-jupyter/devcontainer-template.json b/src/sql-jupyter/devcontainer-template.json
new file mode 100644
index 00000000..f4e74d4f
--- /dev/null
+++ b/src/sql-jupyter/devcontainer-template.json
@@ -0,0 +1,20 @@
+{
+  "id": "sql-jupyter",
+  "version": "1.0.0",
+  "name": "SQL Query Tool (Based on Working Example)",
+  "description": "Simple BigQuery SQL interface using proven example app structure (Port: 8888)",
+  "options": {
+    "cloud": {
+      "type": "string",
+      "enum": ["gcp", "aws"],
+      "default": "gcp",
+      "description": "Cloud provider (gcp or aws)"
+    },
+    "login": {
+      "type": "string",
+      "description": "Whether to log in to workbench CLI",
+      "proposals": ["true", "false"],
+      "default": "false"
+    }
+  }
+}
diff --git a/src/sql-jupyter/docker-compose.yaml b/src/sql-jupyter/docker-compose.yaml
new file mode 100644
index 00000000..acbc7a9e
--- /dev/null
+++ b/src/sql-jupyter/docker-compose.yaml
@@ -0,0 +1,33 @@
+services:
+  app:
+    container_name: "application-server"
+    image: "python:3.11-slim"  # stock image; dependencies are installed by the command below
+    restart: always
+    working_dir: /workspace
+    command: >
+      bash -c "
+      apt-get update &&
+      apt-get install -y curl git &&
+      pip install --no-cache-dir streamlit pandas google-cloud-bigquery google-auth db-dtypes &&
+      streamlit run /workspace/src/sql-jupyter/app.py --server.port=8888 --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false
+      "
+    volumes:
+      - .:/workspace:cached
+      - work:/root/work
+    ports:
+      - "8888:8888"  # quoted so YAML always reads HOST:CONTAINER as a string
+    networks:
+      - app-network
+    cap_add:  # NOTE(review): SYS_ADMIN, /dev/fuse and the AppArmor opt-out look like FUSE-mount prerequisites — confirm
+      - SYS_ADMIN
+    devices:
+      - /dev/fuse
+    security_opt:
+      - apparmor:unconfined
+
+volumes:
+  work:
+
+networks:
+  app-network:
+    external: true  # the network must already exist; Compose will not create it