From f9179e1e79a0f2dd37c842345f5868d9406f9781 Mon Sep 17 00:00:00 2001 From: Sowmya Ingarsal Date: Thu, 15 Jan 2026 22:49:15 +0000 Subject: [PATCH 1/6] Add BigQuery Monitoring Dashboard custom app Features: - Real-time BigQuery dataset and table monitoring - Interactive visualizations (time series, histograms, box plots) - Sample data viewing and CSV export - Comprehensive data profiling using ydata-profiling - Auto-refresh capability - Data freshness indicators Co-Authored-By: Claude Sonnet 4.5 --- .../.devcontainer.json | 38 ++ src/bq-monitoring-dashboard/Dockerfile | 38 ++ src/bq-monitoring-dashboard/README.md | 84 ++++ src/bq-monitoring-dashboard/app.py | 454 ++++++++++++++++++ .../devcontainer-template.json | 20 + .../docker-compose.yaml | 33 ++ src/bq-monitoring-dashboard/requirements.txt | 6 + 7 files changed, 673 insertions(+) create mode 100644 src/bq-monitoring-dashboard/.devcontainer.json create mode 100644 src/bq-monitoring-dashboard/Dockerfile create mode 100644 src/bq-monitoring-dashboard/README.md create mode 100644 src/bq-monitoring-dashboard/app.py create mode 100644 src/bq-monitoring-dashboard/devcontainer-template.json create mode 100644 src/bq-monitoring-dashboard/docker-compose.yaml create mode 100644 src/bq-monitoring-dashboard/requirements.txt diff --git a/src/bq-monitoring-dashboard/.devcontainer.json b/src/bq-monitoring-dashboard/.devcontainer.json new file mode 100644 index 00000000..c26f52eb --- /dev/null +++ b/src/bq-monitoring-dashboard/.devcontainer.json @@ -0,0 +1,38 @@ +{ + "name": "bq-monitoring-dashboard", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "./startupscript/post-startup.sh", + "streamlit", + "/home/streamlit", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "postStartCommand": [ + "./startupscript/remount-on-restart.sh", + "streamlit", + "/home/streamlit", + "${templateOption:cloud}", + 
"${templateOption:login}" + ], + "features": { + "ghcr.io/devcontainers/features/java:1": { + "version": "17" + }, + "ghcr.io/devcontainers/features/aws-cli:1": {}, + "ghcr.io/dhoeric/features/google-cloud-cli:1": {} + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "opens": { + "extensions": [".py", ".csv", ".json", ".md"], + "fileUrlSuffix": "/", + "folderUrlSuffix": "/" + } + } + } +} diff --git a/src/bq-monitoring-dashboard/Dockerfile b/src/bq-monitoring-dashboard/Dockerfile new file mode 100644 index 00000000..3bbadf2b --- /dev/null +++ b/src/bq-monitoring-dashboard/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + git \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app.py . + +# Create non-root user +RUN useradd -m -u 1000 streamlit && \ + chown -R streamlit:streamlit /app + +# Switch to non-root user +USER streamlit + +# Expose Streamlit port +EXPOSE 8501 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8501/_stcore/health || exit 1 + +# Run Streamlit +CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true", "--browser.gatherUsageStats=false"] diff --git a/src/bq-monitoring-dashboard/README.md b/src/bq-monitoring-dashboard/README.md new file mode 100644 index 00000000..0869aa3c --- /dev/null +++ b/src/bq-monitoring-dashboard/README.md @@ -0,0 +1,84 @@ +# BigQuery Monitoring Dashboard + +A comprehensive real-time monitoring dashboard for BigQuery datasets with detailed data profiling capabilities. 
+ +## Features + +### 📈 Overview Tab +- **Key Metrics**: Total rows, table size, column count, table type +- **Timestamps**: Creation and last modification dates +- **Data Freshness Indicators**: Visual alerts for data age + - 🟢 Very fresh (< 1 hour) + - 🟡 Fresh (< 1 day) + - 🟠 Moderate (< 1 week) + - 🔴 Stale (> 1 week) + +### 📋 Sample Data Tab +- View sample data from selected tables +- Adjustable row count (10-1000 rows) +- Download sample data as CSV + +### 📊 Visualizations Tab +- **Time Series Analysis**: Automatic detection of timestamp columns +- **Distribution Analysis**: Histograms for numeric columns +- **Box Plots**: Statistical distribution visualization +- **Categorical Analysis**: Top value frequency charts + +### 🔍 Detailed Data Characteristics Tab +- Comprehensive data profiling using **ydata-profiling** +- Includes: + - Dataset overview and statistics + - Variable type detection and analysis + - Correlation matrices (Pearson, Spearman, Kendall) + - Missing value analysis + - Interaction plots + - Sample data preview +- Downloadable HTML reports + +## Configuration + +### Auto-refresh +- Enable auto-refresh in the sidebar +- Configurable interval (10-300 seconds) + +### Dataset & Table Selection +- Dropdown selection for all available datasets +- Dropdown selection for tables within selected dataset + +## Technical Details + +- **Framework**: Streamlit +- **Port**: 8501 +- **Base Image**: Python 3.11-slim +- **Key Dependencies**: + - `streamlit`: Web UI framework + - `google-cloud-bigquery`: BigQuery client + - `plotly`: Interactive visualizations + - `ydata-profiling`: Comprehensive data profiling + - `pandas`: Data manipulation + +## Usage in Workbench + +1. Create a custom app in Workbench +2. Point to this repository +3. Select "BigQuery Monitoring Dashboard" template +4. Choose cloud provider (GCP recommended) +5. Launch the app +6.
Access via Workbench UI + +## Authentication + +The app uses Google Cloud Application Default Credentials (ADC) automatically configured by Workbench. No manual authentication required. + +## Performance Notes + +- Dataset and table lists are cached for 5 minutes +- Table metadata and query results are cached for 1 minute +- Data profiling is limited to 10,000 rows for performance +- Auto-refresh can be adjusted based on data update frequency + +## Requirements + +- Workbench workspace with BigQuery access +- GCP project with the BigQuery API enabled +- Sufficient IAM permissions to query BigQuery datasets diff --git a/src/bq-monitoring-dashboard/app.py b/src/bq-monitoring-dashboard/app.py new file mode 100644 index 00000000..3acffac3 --- /dev/null +++ b/src/bq-monitoring-dashboard/app.py @@ -0,0 +1,454 @@ +#!/usr/bin/env python3 +""" +BigQuery Monitoring Dashboard +Real-time monitoring of BigQuery datasets with data profiling +""" + +import streamlit as st +import pandas as pd +from google.cloud import bigquery +from google.auth import default +import plotly.express as px +import plotly.graph_objects as go +from datetime import datetime, timedelta +import time +from ydata_profiling import ProfileReport +import streamlit.components.v1 as components +import tempfile +import os + +# Page configuration +st.set_page_config( + page_title="BigQuery Monitoring Dashboard", + page_icon="📊", + layout="wide", + initial_sidebar_state="expanded" +) + +# Initialize BigQuery client +@st.cache_resource +def get_bigquery_client(): + """Initialize and cache BigQuery client""" + try: + credentials, project = default() + client = bigquery.Client(credentials=credentials, project=project) + return client, project + except Exception as e: + st.error(f"Failed to initialize BigQuery client: {e}") + return None, None + +# Get all datasets in the project +@st.cache_data(ttl=300) # Cache for 5 minutes +def get_datasets(project): + """Fetch all datasets in the project""" + try: + client, _ = get_bigquery_client() + if not
client: + return [] + + datasets = list(client.list_datasets()) + return [dataset.dataset_id for dataset in datasets] + except Exception as e: + st.error(f"Error fetching datasets: {e}") + return [] + +# Get all tables in a dataset +@st.cache_data(ttl=300) +def get_tables(project, dataset_id): + """Fetch all tables in a dataset""" + try: + client, _ = get_bigquery_client() + if not client: + return [] + + tables = list(client.list_tables(dataset_id)) + return [table.table_id for table in tables] + except Exception as e: + st.error(f"Error fetching tables: {e}") + return [] + +# Get table metadata +@st.cache_data(ttl=60) # Cache for 1 minute +def get_table_info(project, dataset_id, table_id): + """Get table metadata including row count and last modified""" + try: + client, _ = get_bigquery_client() + if not client: + return None + + table_ref = f"{project}.{dataset_id}.{table_id}" + table = client.get_table(table_ref) + + return { + "table_id": table_id, + "num_rows": table.num_rows, + "num_bytes": table.num_bytes, + "created": table.created, + "modified": table.modified, + "schema_fields": len(table.schema), + "table_type": table.table_type + } + except Exception as e: + st.error(f"Error fetching table info: {e}") + return None + +# Query table data +@st.cache_data(ttl=60) +def query_table_sample(project, dataset_id, table_id, limit=100): + """Query sample data from table""" + try: + client, _ = get_bigquery_client() + if not client: + return None + + query = f""" + SELECT * + FROM `{project}.{dataset_id}.{table_id}` + LIMIT {limit} + """ + + df = client.query(query).to_dataframe() + return df + except Exception as e: + st.error(f"Error querying table: {e}") + return None + +# Query for time-based metrics (if timestamp column exists) +@st.cache_data(ttl=60) +def get_time_series_data(project, dataset_id, table_id, time_column=None): + """Get time series data for visualization""" + try: + client, _ = get_bigquery_client() + if not client: + return None + + # Try to 
find a timestamp column if not provided + if not time_column: + table_ref = f"{project}.{dataset_id}.{table_id}" + table = client.get_table(table_ref) + + # Look for common timestamp column names + time_columns = [field.name for field in table.schema + if field.field_type in ['TIMESTAMP', 'DATE', 'DATETIME']] + + if not time_columns: + return None + + time_column = time_columns[0] + + query = f""" + SELECT + DATE({time_column}) as date, + COUNT(*) as count + FROM `{project}.{dataset_id}.{table_id}` + WHERE {time_column} IS NOT NULL + GROUP BY date + ORDER BY date DESC + LIMIT 365 + """ + + df = client.query(query).to_dataframe() + return df, time_column + except Exception as e: + # Silently fail if no time column exists + return None + +# Generate profiling report +def generate_profiling_report(df, title="Data Profile"): + """Generate ydata-profiling report""" + try: + with st.spinner("Generating detailed data profile... This may take a moment."): + # Limit rows for profiling to prevent timeout + df_sample = df.head(10000) if len(df) > 10000 else df + + profile = ProfileReport( + df_sample, + title=title, + minimal=False, + explorative=True, + progress_bar=False + ) + + return profile + except Exception as e: + st.error(f"Error generating profile: {e}") + return None + +# Main dashboard +def main(): + st.title("📊 BigQuery Monitoring Dashboard") + st.markdown("Real-time monitoring of BigQuery datasets with detailed data profiling") + + # Initialize client + client, project = get_bigquery_client() + + if not client: + st.error("Cannot connect to BigQuery. 
Please check your credentials.") + return + + st.success(f"Connected to project: **{project}**") + + # Sidebar configuration + st.sidebar.header("Configuration") + + # Auto-refresh settings + auto_refresh = st.sidebar.checkbox("Auto-refresh", value=False) + if auto_refresh: + refresh_interval = st.sidebar.slider( + "Refresh interval (seconds)", + min_value=10, + max_value=300, + value=30 + ) + + # Dataset selection + datasets = get_datasets(project) + + if not datasets: + st.warning("No datasets found in this project.") + return + + selected_dataset = st.sidebar.selectbox( + "Select Dataset", + datasets, + index=0 + ) + + # Table selection + tables = get_tables(project, selected_dataset) + + if not tables: + st.warning(f"No tables found in dataset '{selected_dataset}'") + return + + selected_table = st.sidebar.selectbox( + "Select Table", + tables, + index=0 + ) + + # Display timestamp + st.sidebar.markdown("---") + st.sidebar.markdown(f"**Last updated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Main content area + tab1, tab2, tab3, tab4 = st.tabs([ + "📈 Overview", + "📋 Sample Data", + "📊 Visualizations", + "🔍 Detailed Data Characteristics" + ]) + + # Tab 1: Overview + with tab1: + st.header(f"Table: {selected_dataset}.{selected_table}") + + # Get table info + table_info = get_table_info(project, selected_dataset, selected_table) + + if table_info: + # Display KPIs + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.metric( + "Total Rows", + f"{table_info['num_rows']:,}" + ) + + with col2: + size_mb = table_info['num_bytes'] / (1024 * 1024) + st.metric( + "Size (MB)", + f"{size_mb:,.2f}" + ) + + with col3: + st.metric( + "Columns", + table_info['schema_fields'] + ) + + with col4: + st.metric( + "Table Type", + table_info['table_type'] + ) + + # Display timestamps + st.markdown("---") + col1, col2 = st.columns(2) + + with col1: + st.markdown(f"**Created:** {table_info['created'].strftime('%Y-%m-%d %H:%M:%S')}") + + with col2: + 
st.markdown(f"**Last Modified:** {table_info['modified'].strftime('%Y-%m-%d %H:%M:%S')}") + + # Data freshness indicator + time_since_modified = datetime.now(table_info['modified'].tzinfo) - table_info['modified'] + + if time_since_modified < timedelta(hours=1): + st.success("đŸŸĸ Very fresh (< 1 hour)") + elif time_since_modified < timedelta(days=1): + st.info("🟡 Fresh (< 1 day)") + elif time_since_modified < timedelta(days=7): + st.warning("🟠 Moderate (< 1 week)") + else: + st.error("🔴 Stale (> 1 week)") + + # Tab 2: Sample Data + with tab2: + st.header("Sample Data") + + sample_size = st.slider("Number of rows to display", 10, 1000, 100) + + df = query_table_sample(project, selected_dataset, selected_table, limit=sample_size) + + if df is not None and not df.empty: + st.dataframe(df, use_container_width=True, height=400) + + # Download button + csv = df.to_csv(index=False) + st.download_button( + label="Download as CSV", + data=csv, + file_name=f"{selected_dataset}_{selected_table}_sample.csv", + mime="text/csv" + ) + else: + st.warning("No data available") + + # Tab 3: Visualizations + with tab3: + st.header("Data Visualizations") + + df = query_table_sample(project, selected_dataset, selected_table, limit=1000) + + if df is not None and not df.empty: + # Time series visualization + time_data = get_time_series_data(project, selected_dataset, selected_table) + + if time_data: + time_df, time_column = time_data + st.subheader(f"Time Series: Records per Day (by {time_column})") + + fig = px.line( + time_df, + x='date', + y='count', + title=f"Records over time", + labels={'date': 'Date', 'count': 'Record Count'} + ) + fig.update_layout(height=400) + st.plotly_chart(fig, use_container_width=True) + + # Column statistics + st.subheader("Column Statistics") + + numeric_columns = df.select_dtypes(include=['number']).columns.tolist() + + if numeric_columns: + selected_column = st.selectbox("Select numeric column", numeric_columns) + + col1, col2 = st.columns(2) + + with 
col1: + # Histogram + fig = px.histogram( + df, + x=selected_column, + title=f"Distribution of {selected_column}", + nbins=50 + ) + st.plotly_chart(fig, use_container_width=True) + + with col2: + # Box plot + fig = px.box( + df, + y=selected_column, + title=f"Box Plot of {selected_column}" + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("No numeric columns found for visualization") + + # Categorical columns + categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist() + + if categorical_columns: + st.subheader("Categorical Analysis") + selected_cat_column = st.selectbox("Select categorical column", categorical_columns) + + # Value counts + value_counts = df[selected_cat_column].value_counts().head(10) + + fig = px.bar( + x=value_counts.index, + y=value_counts.values, + title=f"Top 10 values in {selected_cat_column}", + labels={'x': selected_cat_column, 'y': 'Count'} + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.warning("No data available for visualization") + + # Tab 4: Detailed Data Characteristics (ydata-profiling) + with tab4: + st.header("🔍 Detailed Data Characteristics") + st.markdown("Comprehensive data profiling using ydata-profiling") + + # Option to generate profile + if st.button("Generate Detailed Profile Report", type="primary"): + df = query_table_sample(project, selected_dataset, selected_table, limit=10000) + + if df is not None and not df.empty: + profile = generate_profiling_report( + df, + title=f"{selected_dataset}.{selected_table} - Data Profile" + ) + + if profile: + # Save to temp file and display + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html') as f: + profile.to_file(f.name) + + # Read and display + with open(f.name, 'r', encoding='utf-8') as html_file: + html_content = html_file.read() + components.html(html_content, height=1000, scrolling=True) + + # Download button + st.download_button( + label="Download Profile Report", + data=html_content, + 
file_name=f"{selected_dataset}_{selected_table}_profile.html", + mime="text/html" + ) + + # Clean up + os.unlink(f.name) + else: + st.warning("No data available for profiling") + else: + st.info("👆 Click the button above to generate a comprehensive data profile report") + st.markdown(""" + The detailed profile report includes: + - **Overview**: Dataset statistics, variable types, warnings + - **Variables**: Detailed analysis of each column + - **Interactions**: Correlation matrices and scatter plots + - **Correlations**: Pearson, Spearman, Kendall correlations + - **Missing Values**: Analysis of missing data patterns + - **Sample**: First and last rows of the dataset + + *Note: For large tables, only the first 10,000 rows are profiled to ensure performance.* + """) + + # Auto-refresh logic + if auto_refresh: + time.sleep(refresh_interval) + st.rerun() + +if __name__ == "__main__": + main() diff --git a/src/bq-monitoring-dashboard/devcontainer-template.json b/src/bq-monitoring-dashboard/devcontainer-template.json new file mode 100644 index 00000000..076843c6 --- /dev/null +++ b/src/bq-monitoring-dashboard/devcontainer-template.json @@ -0,0 +1,20 @@ +{ + "id": "bq-monitoring-dashboard", + "version": "1.0.0", + "name": "BigQuery Monitoring Dashboard", + "description": "Real-time BigQuery monitoring dashboard with data profiling. 
Displays dataset/table metrics, visualizations, and detailed data characteristics using ydata-profiling.", + "options": { + "cloud": { + "type": "string", + "enum": ["gcp", "aws"], + "default": "gcp", + "description": "Cloud provider (gcp or aws)" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + } +} diff --git a/src/bq-monitoring-dashboard/docker-compose.yaml b/src/bq-monitoring-dashboard/docker-compose.yaml new file mode 100644 index 00000000..12fdbab4 --- /dev/null +++ b/src/bq-monitoring-dashboard/docker-compose.yaml @@ -0,0 +1,33 @@ +version: "2.4" + +services: + app: + container_name: "application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - work:/home/streamlit/work + ports: + - 8501:8501 + networks: + - app-network + # Required for gcsfuse (GCS bucket mounting) + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + environment: + - GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp_credentials.json + - DEFAULT_WORKSPACE=/config + +volumes: + work: + +networks: + app-network: + external: true diff --git a/src/bq-monitoring-dashboard/requirements.txt b/src/bq-monitoring-dashboard/requirements.txt new file mode 100644 index 00000000..c3c90ca3 --- /dev/null +++ b/src/bq-monitoring-dashboard/requirements.txt @@ -0,0 +1,6 @@ +streamlit==1.31.0 +pandas==2.1.4 +google-cloud-bigquery==3.14.1 +google-auth==2.25.2 +plotly==5.18.0 +ydata-profiling==4.6.4 From 5f504f7487fd5a009d0a5094a8d668d1598a482b Mon Sep 17 00:00:00 2001 From: Sowmya Ingarsal Date: Thu, 15 Jan 2026 23:08:53 +0000 Subject: [PATCH 2/6] Fix: Update volume mount to repository root for startup scripts The volume mount needs to point to the repository root (../..) so that the startup scripts at ./startupscript/ are accessible to the container. 
Co-Authored-By: Claude Sonnet 4.5 --- src/bq-monitoring-dashboard/docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bq-monitoring-dashboard/docker-compose.yaml b/src/bq-monitoring-dashboard/docker-compose.yaml index 12fdbab4..0b090d4d 100644 --- a/src/bq-monitoring-dashboard/docker-compose.yaml +++ b/src/bq-monitoring-dashboard/docker-compose.yaml @@ -8,7 +8,7 @@ services: dockerfile: Dockerfile restart: always volumes: - - .:/workspace:cached + - ../..:/workspace:cached - work:/home/streamlit/work ports: - 8501:8501 From 7b561750992464157fa2493091654031f36a3940 Mon Sep 17 00:00:00 2001 From: Sowmya Ingarsal Date: Thu, 15 Jan 2026 23:23:27 +0000 Subject: [PATCH 3/6] Fix: Simplify Dockerfile and correct build context Changes: - Set build context to repository root for proper volume mounting - Remove user switching to match remoteUser: root in devcontainer.json - Add sudo for streamlit user - Update CMD to reference app from /workspace path - Fix requirements.txt copy path for new build context This should resolve permission issues and path resolution problems. Co-Authored-By: Claude Sonnet 4.5 --- src/bq-monitoring-dashboard/Dockerfile | 31 ++++++------------- .../docker-compose.yaml | 6 ++-- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/src/bq-monitoring-dashboard/Dockerfile b/src/bq-monitoring-dashboard/Dockerfile index 3bbadf2b..2d5adefe 100644 --- a/src/bq-monitoring-dashboard/Dockerfile +++ b/src/bq-monitoring-dashboard/Dockerfile @@ -1,38 +1,27 @@ FROM python:3.11-slim -# Set working directory -WORKDIR /app - # Install system dependencies RUN apt-get update && apt-get install -y \ gcc \ g++ \ git \ curl \ + sudo \ && rm -rf /var/lib/apt/lists/* -# Copy requirements first for better caching -COPY requirements.txt . 
- # Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt +COPY src/bq-monitoring-dashboard/requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt -# Copy application code -COPY app.py . +# Create streamlit user with home directory +RUN useradd -m -u 1000 -s /bin/bash streamlit && \ + echo "streamlit ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers -# Create non-root user -RUN useradd -m -u 1000 streamlit && \ - chown -R streamlit:streamlit /app - -# Switch to non-root user -USER streamlit +# Set working directory +WORKDIR /workspace # Expose Streamlit port EXPOSE 8501 -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:8501/_stcore/health || exit 1 - -# Run Streamlit -CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true", "--browser.gatherUsageStats=false"] +# Run Streamlit from workspace +CMD ["streamlit", "run", "/workspace/src/bq-monitoring-dashboard/app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true", "--browser.gatherUsageStats=false"] diff --git a/src/bq-monitoring-dashboard/docker-compose.yaml b/src/bq-monitoring-dashboard/docker-compose.yaml index 0b090d4d..d76adb57 100644 --- a/src/bq-monitoring-dashboard/docker-compose.yaml +++ b/src/bq-monitoring-dashboard/docker-compose.yaml @@ -4,11 +4,11 @@ services: app: container_name: "application-server" build: - context: . - dockerfile: Dockerfile + context: ../.. 
+ dockerfile: src/bq-monitoring-dashboard/Dockerfile restart: always volumes: - - ../..:/workspace:cached + - .:/workspace:cached - work:/home/streamlit/work ports: - 8501:8501 From 7fb353c4321c74b3f47166a8e7dbafbdfd39be27 Mon Sep 17 00:00:00 2001 From: Sowmya Ingarsal Date: Thu, 15 Jan 2026 23:41:20 +0000 Subject: [PATCH 4/6] Add simplified test version without custom Dockerfile This version uses python:3.11-slim pre-built image and installs dependencies at runtime to help debug build issues. Use this version if bq-monitoring-dashboard fails to build. Co-Authored-By: Claude Sonnet 4.5 --- src/bq-dashboard-simple/.devcontainer.json | 29 ++ src/bq-dashboard-simple/app.py | 454 ++++++++++++++++++ .../devcontainer-template.json | 20 + src/bq-dashboard-simple/docker-compose.yaml | 36 ++ 4 files changed, 539 insertions(+) create mode 100644 src/bq-dashboard-simple/.devcontainer.json create mode 100644 src/bq-dashboard-simple/app.py create mode 100644 src/bq-dashboard-simple/devcontainer-template.json create mode 100644 src/bq-dashboard-simple/docker-compose.yaml diff --git a/src/bq-dashboard-simple/.devcontainer.json b/src/bq-dashboard-simple/.devcontainer.json new file mode 100644 index 00000000..736e8b77 --- /dev/null +++ b/src/bq-dashboard-simple/.devcontainer.json @@ -0,0 +1,29 @@ +{ + "name": "bq-dashboard-simple", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "./startupscript/post-startup.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "postStartCommand": [ + "./startupscript/remount-on-restart.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "features": { + "ghcr.io/devcontainers/features/java:1": { + "version": "17" + }, + "ghcr.io/devcontainers/features/aws-cli:1": {}, + "ghcr.io/dhoeric/features/google-cloud-cli:1": {} + }, + "remoteUser": "root" +} diff --git 
a/src/bq-dashboard-simple/app.py b/src/bq-dashboard-simple/app.py new file mode 100644 index 00000000..3acffac3 --- /dev/null +++ b/src/bq-dashboard-simple/app.py @@ -0,0 +1,454 @@ +#!/usr/bin/env python3 +""" +BigQuery Monitoring Dashboard +Real-time monitoring of BigQuery datasets with data profiling +""" + +import streamlit as st +import pandas as pd +from google.cloud import bigquery +from google.auth import default +import plotly.express as px +import plotly.graph_objects as go +from datetime import datetime, timedelta +import time +from ydata_profiling import ProfileReport +import streamlit.components.v1 as components +import tempfile +import os + +# Page configuration +st.set_page_config( + page_title="BigQuery Monitoring Dashboard", + page_icon="📊", + layout="wide", + initial_sidebar_state="expanded" +) + +# Initialize BigQuery client +@st.cache_resource +def get_bigquery_client(): + """Initialize and cache BigQuery client""" + try: + credentials, project = default() + client = bigquery.Client(credentials=credentials, project=project) + return client, project + except Exception as e: + st.error(f"Failed to initialize BigQuery client: {e}") + return None, None + +# Get all datasets in the project +@st.cache_data(ttl=300) # Cache for 5 minutes +def get_datasets(project): + """Fetch all datasets in the project""" + try: + client, _ = get_bigquery_client() + if not client: + return [] + + datasets = list(client.list_datasets()) + return [dataset.dataset_id for dataset in datasets] + except Exception as e: + st.error(f"Error fetching datasets: {e}") + return [] + +# Get all tables in a dataset +@st.cache_data(ttl=300) +def get_tables(project, dataset_id): + """Fetch all tables in a dataset""" + try: + client, _ = get_bigquery_client() + if not client: + return [] + + tables = list(client.list_tables(dataset_id)) + return [table.table_id for table in tables] + except Exception as e: + st.error(f"Error fetching tables: {e}") + return [] + +# Get table metadata 
+@st.cache_data(ttl=60) # Cache for 1 minute +def get_table_info(project, dataset_id, table_id): + """Get table metadata including row count and last modified""" + try: + client, _ = get_bigquery_client() + if not client: + return None + + table_ref = f"{project}.{dataset_id}.{table_id}" + table = client.get_table(table_ref) + + return { + "table_id": table_id, + "num_rows": table.num_rows, + "num_bytes": table.num_bytes, + "created": table.created, + "modified": table.modified, + "schema_fields": len(table.schema), + "table_type": table.table_type + } + except Exception as e: + st.error(f"Error fetching table info: {e}") + return None + +# Query table data +@st.cache_data(ttl=60) +def query_table_sample(project, dataset_id, table_id, limit=100): + """Query sample data from table""" + try: + client, _ = get_bigquery_client() + if not client: + return None + + query = f""" + SELECT * + FROM `{project}.{dataset_id}.{table_id}` + LIMIT {limit} + """ + + df = client.query(query).to_dataframe() + return df + except Exception as e: + st.error(f"Error querying table: {e}") + return None + +# Query for time-based metrics (if timestamp column exists) +@st.cache_data(ttl=60) +def get_time_series_data(project, dataset_id, table_id, time_column=None): + """Get time series data for visualization""" + try: + client, _ = get_bigquery_client() + if not client: + return None + + # Try to find a timestamp column if not provided + if not time_column: + table_ref = f"{project}.{dataset_id}.{table_id}" + table = client.get_table(table_ref) + + # Look for common timestamp column names + time_columns = [field.name for field in table.schema + if field.field_type in ['TIMESTAMP', 'DATE', 'DATETIME']] + + if not time_columns: + return None + + time_column = time_columns[0] + + query = f""" + SELECT + DATE({time_column}) as date, + COUNT(*) as count + FROM `{project}.{dataset_id}.{table_id}` + WHERE {time_column} IS NOT NULL + GROUP BY date + ORDER BY date DESC + LIMIT 365 + """ + + df = 
client.query(query).to_dataframe() + return df, time_column + except Exception as e: + # Silently fail if no time column exists + return None + +# Generate profiling report +def generate_profiling_report(df, title="Data Profile"): + """Generate ydata-profiling report""" + try: + with st.spinner("Generating detailed data profile... This may take a moment."): + # Limit rows for profiling to prevent timeout + df_sample = df.head(10000) if len(df) > 10000 else df + + profile = ProfileReport( + df_sample, + title=title, + minimal=False, + explorative=True, + progress_bar=False + ) + + return profile + except Exception as e: + st.error(f"Error generating profile: {e}") + return None + +# Main dashboard +def main(): + st.title("📊 BigQuery Monitoring Dashboard") + st.markdown("Real-time monitoring of BigQuery datasets with detailed data profiling") + + # Initialize client + client, project = get_bigquery_client() + + if not client: + st.error("Cannot connect to BigQuery. Please check your credentials.") + return + + st.success(f"Connected to project: **{project}**") + + # Sidebar configuration + st.sidebar.header("Configuration") + + # Auto-refresh settings + auto_refresh = st.sidebar.checkbox("Auto-refresh", value=False) + if auto_refresh: + refresh_interval = st.sidebar.slider( + "Refresh interval (seconds)", + min_value=10, + max_value=300, + value=30 + ) + + # Dataset selection + datasets = get_datasets(project) + + if not datasets: + st.warning("No datasets found in this project.") + return + + selected_dataset = st.sidebar.selectbox( + "Select Dataset", + datasets, + index=0 + ) + + # Table selection + tables = get_tables(project, selected_dataset) + + if not tables: + st.warning(f"No tables found in dataset '{selected_dataset}'") + return + + selected_table = st.sidebar.selectbox( + "Select Table", + tables, + index=0 + ) + + # Display timestamp + st.sidebar.markdown("---") + st.sidebar.markdown(f"**Last updated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + 
+ # Main content area + tab1, tab2, tab3, tab4 = st.tabs([ + "📈 Overview", + "📋 Sample Data", + "📊 Visualizations", + "🔍 Detailed Data Characteristics" + ]) + + # Tab 1: Overview + with tab1: + st.header(f"Table: {selected_dataset}.{selected_table}") + + # Get table info + table_info = get_table_info(project, selected_dataset, selected_table) + + if table_info: + # Display KPIs + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.metric( + "Total Rows", + f"{table_info['num_rows']:,}" + ) + + with col2: + size_mb = table_info['num_bytes'] / (1024 * 1024) + st.metric( + "Size (MB)", + f"{size_mb:,.2f}" + ) + + with col3: + st.metric( + "Columns", + table_info['schema_fields'] + ) + + with col4: + st.metric( + "Table Type", + table_info['table_type'] + ) + + # Display timestamps + st.markdown("---") + col1, col2 = st.columns(2) + + with col1: + st.markdown(f"**Created:** {table_info['created'].strftime('%Y-%m-%d %H:%M:%S')}") + + with col2: + st.markdown(f"**Last Modified:** {table_info['modified'].strftime('%Y-%m-%d %H:%M:%S')}") + + # Data freshness indicator + time_since_modified = datetime.now(table_info['modified'].tzinfo) - table_info['modified'] + + if time_since_modified < timedelta(hours=1): + st.success("đŸŸĸ Very fresh (< 1 hour)") + elif time_since_modified < timedelta(days=1): + st.info("🟡 Fresh (< 1 day)") + elif time_since_modified < timedelta(days=7): + st.warning("🟠 Moderate (< 1 week)") + else: + st.error("🔴 Stale (> 1 week)") + + # Tab 2: Sample Data + with tab2: + st.header("Sample Data") + + sample_size = st.slider("Number of rows to display", 10, 1000, 100) + + df = query_table_sample(project, selected_dataset, selected_table, limit=sample_size) + + if df is not None and not df.empty: + st.dataframe(df, use_container_width=True, height=400) + + # Download button + csv = df.to_csv(index=False) + st.download_button( + label="Download as CSV", + data=csv, + file_name=f"{selected_dataset}_{selected_table}_sample.csv", + mime="text/csv" + ) + 
else: + st.warning("No data available") + + # Tab 3: Visualizations + with tab3: + st.header("Data Visualizations") + + df = query_table_sample(project, selected_dataset, selected_table, limit=1000) + + if df is not None and not df.empty: + # Time series visualization + time_data = get_time_series_data(project, selected_dataset, selected_table) + + if time_data: + time_df, time_column = time_data + st.subheader(f"Time Series: Records per Day (by {time_column})") + + fig = px.line( + time_df, + x='date', + y='count', + title=f"Records over time", + labels={'date': 'Date', 'count': 'Record Count'} + ) + fig.update_layout(height=400) + st.plotly_chart(fig, use_container_width=True) + + # Column statistics + st.subheader("Column Statistics") + + numeric_columns = df.select_dtypes(include=['number']).columns.tolist() + + if numeric_columns: + selected_column = st.selectbox("Select numeric column", numeric_columns) + + col1, col2 = st.columns(2) + + with col1: + # Histogram + fig = px.histogram( + df, + x=selected_column, + title=f"Distribution of {selected_column}", + nbins=50 + ) + st.plotly_chart(fig, use_container_width=True) + + with col2: + # Box plot + fig = px.box( + df, + y=selected_column, + title=f"Box Plot of {selected_column}" + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("No numeric columns found for visualization") + + # Categorical columns + categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist() + + if categorical_columns: + st.subheader("Categorical Analysis") + selected_cat_column = st.selectbox("Select categorical column", categorical_columns) + + # Value counts + value_counts = df[selected_cat_column].value_counts().head(10) + + fig = px.bar( + x=value_counts.index, + y=value_counts.values, + title=f"Top 10 values in {selected_cat_column}", + labels={'x': selected_cat_column, 'y': 'Count'} + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.warning("No data available for 
visualization") + + # Tab 4: Detailed Data Characteristics (ydata-profiling) + with tab4: + st.header("🔍 Detailed Data Characteristics") + st.markdown("Comprehensive data profiling using ydata-profiling") + + # Option to generate profile + if st.button("Generate Detailed Profile Report", type="primary"): + df = query_table_sample(project, selected_dataset, selected_table, limit=10000) + + if df is not None and not df.empty: + profile = generate_profiling_report( + df, + title=f"{selected_dataset}.{selected_table} - Data Profile" + ) + + if profile: + # Save to temp file and display + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html') as f: + profile.to_file(f.name) + + # Read and display + with open(f.name, 'r', encoding='utf-8') as html_file: + html_content = html_file.read() + components.html(html_content, height=1000, scrolling=True) + + # Download button + st.download_button( + label="Download Profile Report", + data=html_content, + file_name=f"{selected_dataset}_{selected_table}_profile.html", + mime="text/html" + ) + + # Clean up + os.unlink(f.name) + else: + st.warning("No data available for profiling") + else: + st.info("👆 Click the button above to generate a comprehensive data profile report") + st.markdown(""" + The detailed profile report includes: + - **Overview**: Dataset statistics, variable types, warnings + - **Variables**: Detailed analysis of each column + - **Interactions**: Correlation matrices and scatter plots + - **Correlations**: Pearson, Spearman, Kendall correlations + - **Missing Values**: Analysis of missing data patterns + - **Sample**: First and last rows of the dataset + + *Note: For large tables, only the first 10,000 rows are profiled to ensure performance.* + """) + + # Auto-refresh logic + if auto_refresh: + time.sleep(refresh_interval) + st.rerun() + +if __name__ == "__main__": + main() diff --git a/src/bq-dashboard-simple/devcontainer-template.json b/src/bq-dashboard-simple/devcontainer-template.json new 
file mode 100644 index 00000000..f82621b9 --- /dev/null +++ b/src/bq-dashboard-simple/devcontainer-template.json @@ -0,0 +1,20 @@ +{ + "id": "bq-dashboard-simple", + "version": "1.0.0", + "name": "BigQuery Dashboard (Simple Test)", + "description": "Simplified version for testing - installs dependencies at runtime", + "options": { + "cloud": { + "type": "string", + "enum": ["gcp", "aws"], + "default": "gcp", + "description": "Cloud provider (gcp or aws)" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + } +} diff --git a/src/bq-dashboard-simple/docker-compose.yaml b/src/bq-dashboard-simple/docker-compose.yaml new file mode 100644 index 00000000..7eb3e86e --- /dev/null +++ b/src/bq-dashboard-simple/docker-compose.yaml @@ -0,0 +1,36 @@ +version: "2.4" + +services: + app: + container_name: "application-server" + image: "python:3.11-slim" + restart: always + working_dir: /workspace + command: > + bash -c " + apt-get update && apt-get install -y curl gcc g++ git sudo && + pip install streamlit pandas google-cloud-bigquery google-auth plotly ydata-profiling && + streamlit run /workspace/src/bq-dashboard-simple/app.py --server.port=8501 --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false + " + volumes: + - .:/workspace:cached + - work:/home/python/work + ports: + - 8501:8501 + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + environment: + - GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp_credentials.json + +volumes: + work: + +networks: + app-network: + external: true From d78e3bdc1bfbf095888a6a2c713f01df80f87312 Mon Sep 17 00:00:00 2001 From: Sowmya Ingarsal Date: Fri, 16 Jan 2026 02:22:39 +0000 Subject: [PATCH 5/6] Add SQL Query Executor app Simple BigQuery SQL interface for executing queries in Workbench. 
Features: - Text editor for SQL queries - Execute SELECT statements - View results in interactive table - Query statistics (bytes processed, slot time) - CSV download - Schema viewer - Helper queries to list datasets and tables - Support for cross-project queries (shared datasets) Uses runtime dependency installation to avoid build issues. Simple configuration based on working example app. Co-Authored-By: Claude Sonnet 4.5 --- src/sql-query-executor/.devcontainer.json | 29 +++ src/sql-query-executor/README.md | 77 ++++++ src/sql-query-executor/app.py | 238 ++++++++++++++++++ .../devcontainer-template.json | 20 ++ src/sql-query-executor/docker-compose.yaml | 35 +++ 5 files changed, 399 insertions(+) create mode 100644 src/sql-query-executor/.devcontainer.json create mode 100644 src/sql-query-executor/README.md create mode 100644 src/sql-query-executor/app.py create mode 100644 src/sql-query-executor/devcontainer-template.json create mode 100644 src/sql-query-executor/docker-compose.yaml diff --git a/src/sql-query-executor/.devcontainer.json b/src/sql-query-executor/.devcontainer.json new file mode 100644 index 00000000..ab5895b7 --- /dev/null +++ b/src/sql-query-executor/.devcontainer.json @@ -0,0 +1,29 @@ +{ + "name": "sql-query-executor", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "./startupscript/post-startup.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "postStartCommand": [ + "./startupscript/remount-on-restart.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "features": { + "ghcr.io/devcontainers/features/java:1": { + "version": "17" + }, + "ghcr.io/devcontainers/features/aws-cli:1": {}, + "ghcr.io/dhoeric/features/google-cloud-cli:1": {} + }, + "remoteUser": "root" +} diff --git a/src/sql-query-executor/README.md b/src/sql-query-executor/README.md new file mode 
100644 index 00000000..899b775a --- /dev/null +++ b/src/sql-query-executor/README.md @@ -0,0 +1,77 @@ +# SQL Query Executor + +A simple web interface to execute SQL queries against BigQuery datasets in your Workbench workspace. + +## Features + +- **SQL Text Editor**: Write and execute SELECT queries +- **Query Results**: View results in an interactive table +- **Query Statistics**: See bytes processed, slot time, and row count +- **CSV Export**: Download query results +- **Schema Viewer**: Inspect column types and sample values +- **Helper Queries**: Quick buttons to list datasets and tables + +## Usage + +### Basic Query +```sql +SELECT * +FROM `project.dataset.table` +LIMIT 100 +``` + +### Fully Qualified Table Names +When querying datasets from other projects (shared datasets), use the full path: +```sql +SELECT * +FROM `source-project.dataset.table` +LIMIT 100 +``` + +### List Your Datasets +```sql +SELECT schema_name as dataset +FROM `your-project.INFORMATION_SCHEMA.SCHEMATA` +ORDER BY schema_name +``` + +### Show Tables in a Dataset +```sql +SELECT + dataset_id as dataset, + table_id as table_name, + type as table_type, + row_count +FROM `your-project.dataset.__TABLES__` +ORDER BY table_id +``` + +## Query Features Supported + +- ✅ SELECT statements +- ✅ JOINs across tables +- ✅ Aggregations (COUNT, SUM, AVG, etc.) +- ✅ GROUP BY and ORDER BY +- ✅ WHERE clauses +- ✅ CTEs (WITH clauses) +- ✅ Subqueries +- ✅ Cross-project queries + +## Technical Details + +- **Framework**: Streamlit +- **Port**: 8501 +- **Base Image**: python:3.11-slim +- **Dependencies**: streamlit, pandas, google-cloud-bigquery + +## Authentication + +Uses Google Cloud Application Default Credentials (ADC) automatically configured by Workbench. + +## Tips + +1. **Use LIMIT** to keep result sets small; it does not reduce the bytes BigQuery scans, so select only the columns you need +2. **Use backticks** around table names: `` `project.dataset.table` `` +3. **Check bytes processed** before running expensive queries +4. **Download results** as CSV for further analysis +5.
**Use the sidebar helpers** to explore your datasets
diff --git a/src/sql-query-executor/app.py b/src/sql-query-executor/app.py
new file mode 100644
index 00000000..fae12428
--- /dev/null
+++ b/src/sql-query-executor/app.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+"""
+Simple BigQuery SQL Query Executor
+Execute SQL queries against your Workbench datasets
+"""
+
+import streamlit as st
+import pandas as pd
+from google.cloud import bigquery
+from google.auth import default
+import re
+
+# Page configuration
+st.set_page_config(
+    page_title="SQL Query Executor",
+    page_icon="🔍",
+    layout="wide"
+)
+
+# Initialize BigQuery client
+@st.cache_resource
+def get_bigquery_client():
+    """Create a BigQuery client from Application Default Credentials.
+
+    Cached with st.cache_resource so reruns reuse the same client.
+
+    Returns:
+        (client, project) on success, or (None, None) after reporting the
+        failure with st.error.
+    """
+    try:
+        credentials, project = default()
+        client = bigquery.Client(credentials=credentials, project=project)
+        return client, project
+    except Exception as e:
+        st.error(f"Failed to initialize BigQuery client: {e}")
+        return None, None
+
+def execute_query(client, query):
+    """Run a SQL query and collect its rows and job statistics.
+
+    Args:
+        client: BigQuery client used to submit the query.
+        query: SQL text to execute.
+
+    Returns:
+        (df, stats, None) on success. stats holds bytes_processed,
+        bytes_billed, slot_time (milliseconds) and rows_returned; the first
+        three are None when BigQuery does not report them.
+        (None, None, error_message) when the query fails.
+    """
+    try:
+        # Run the query
+        query_job = client.query(query)
+
+        # Get results
+        results = query_job.result()
+        df = results.to_dataframe()
+
+        # Get query stats
+        stats = {
+            "bytes_processed": query_job.total_bytes_processed,
+            "bytes_billed": query_job.total_bytes_billed,
+            "slot_time": query_job.slot_millis,
+            "rows_returned": len(df)
+        }
+
+        return df, stats, None
+    except Exception as e:
+        return None, None, str(e)
+
+def format_bytes(bytes_val):
+    """Format a byte count as a human-readable string (B through TB).
+
+    Args:
+        bytes_val: Number of bytes, or None when no value was reported.
+
+    Returns:
+        A string such as "1.50 KB", "0 B" for zero, or "N/A" for None.
+    """
+    # Job statistics are optional; None >= 1024 would raise TypeError below.
+    if bytes_val is None:
+        return "N/A"
+    if bytes_val == 0:
+        return "0 B"
+
+    units = ["B", "KB", "MB", "GB", "TB"]
+    i = 0
+    while bytes_val >= 1024 and i < len(units) - 1:
+        bytes_val /= 1024.0
+        i += 1
+
+    return f"{bytes_val:.2f} {units[i]}"
+
+# Main app
+def main():
+    """Render the query editor, run the submitted SQL and show its results."""
+    st.title("🔍 SQL Query Executor")
+    st.markdown("Execute SQL queries against BigQuery datasets in your Workbench workspace")
+
+    # Initialize client
+    client, project = get_bigquery_client()
+
+    if not client:
+        st.error("Cannot connect to BigQuery. Please check your credentials.")
+        return
+
+    st.success(f"Connected to project: **{project}**")
+
+    # Sidebar with help
+    with st.sidebar:
+        st.header("Quick Help")
+
+        st.markdown("### Query Format")
+        st.code("""
+SELECT *
+FROM `project.dataset.table`
+LIMIT 100
+        """, language="sql")
+
+        st.markdown("### Your Project")
+        st.info(f"**{project}**")
+
+        st.markdown("### Tips")
+        st.markdown("""
+- Use backticks `` for table names
+- Include LIMIT to avoid large results
+- Use fully qualified names:
+  `project.dataset.table`
+- Press Ctrl+Enter to execute
+        """)
+
+        st.markdown("### Examples")
+
+        if st.button("📊 List All Datasets"):
+            st.session_state['query'] = f"""
+SELECT schema_name as dataset
+FROM `{project}.INFORMATION_SCHEMA.SCHEMATA`
+ORDER BY schema_name
+            """.strip()
+
+        if st.button("📋 Show Dataset Tables"):
+            # __TABLES__ exists per dataset; DATASET is a placeholder to edit.
+            st.session_state['query'] = f"""
+SELECT
+  dataset_id as dataset,
+  table_id as table_name,
+  type as table_type,
+  TIMESTAMP_MILLIS(creation_time) as created,
+  row_count,
+  size_bytes
+FROM `{project}.DATASET.__TABLES__`
+ORDER BY dataset_id, table_id
+            """.strip()
+
+    # Main query area
+    st.header("SQL Query")
+
+    # Get query from session state or default
+    default_query = st.session_state.get('query', f"""
+SELECT *
+FROM `{project}.DATASET.TABLE`
+LIMIT 100
+    """.strip())
+
+    query = st.text_area(
+        "Enter your SQL query:",
+        value=default_query,
+        height=200,
+        key="sql_input"
+    )
+
+    # Store query in session state
+    st.session_state['query'] = query
+
+    col1, col2, col3 = st.columns([1, 1, 4])
+
+    with col1:
+        execute_button = st.button("▶️ Execute Query", type="primary", use_container_width=True)
+
+    with col2:
+        if st.button("🗑️ Clear", use_container_width=True):
+            st.session_state['query'] = ""
+            st.rerun()
+
+    # Execute query
+    if execute_button and query.strip():
+        with st.spinner("Executing query..."):
+            df, stats, error = execute_query(client, query)
+
+        if error:
+            st.error(f"Query failed: {error}")
+        else:
+            # Show stats
+            st.subheader("Query Statistics")
+            col1, col2, col3, col4 = st.columns(4)
+
+            with col1:
+                st.metric("Rows Returned", f"{stats['rows_returned']:,}")
+
+            with col2:
+                st.metric("Bytes Processed", format_bytes(stats['bytes_processed']))
+
+            with col3:
+                st.metric("Bytes Billed", format_bytes(stats['bytes_billed']))
+
+            with col4:
+                # slot_millis is optional in the job statistics; treat None as 0
+                slot_seconds = (stats['slot_time'] or 0) / 1000.0
+                st.metric("Slot Time", f"{slot_seconds:.2f}s")
+
+            # Show results
+            st.subheader("Query Results")
+
+            if len(df) > 0:
+                # Display dataframe
+                st.dataframe(df, use_container_width=True, height=400)
+
+                # Download button
+                csv = df.to_csv(index=False)
+                st.download_button(
+                    label="📥 Download CSV",
+                    data=csv,
+                    file_name="query_results.csv",
+                    mime="text/csv"
+                )
+
+                # Show schema
+                with st.expander("📋 Show Schema"):
+                    schema_df = pd.DataFrame({
+                        "Column": df.columns,
+                        "Type": [str(dtype) for dtype in df.dtypes],
+                        "Sample": [df[col].iloc[0] if len(df) > 0 else None for col in df.columns]
+                    })
+                    st.dataframe(schema_df, use_container_width=True)
+            else:
+                st.info("Query executed successfully but returned no rows.")
+
+    elif execute_button:
+        st.warning("Please enter a SQL query.")
+
+    # About section
+    st.markdown("---")
+
+    with st.expander("ℹ️ About"):
+        st.markdown("""
+        ### BigQuery SQL Query Executor
+
+        This app allows you to execute SQL queries against BigQuery datasets in your Workbench workspace.
+
+        **Features:**
+        - Execute any SELECT query
+        - View query statistics (bytes processed, slot time)
+        - Download results as CSV
+        - View schema information
+
+        **Supported:**
+        - All standard SQL queries
+        - Queries across datasets and projects
+        - JOIN operations
+        - Aggregations and GROUP BY
+        - CTEs (WITH clauses)
+
+        **Authentication:**
+        Uses Google Cloud Application Default Credentials configured by Workbench.
+        """)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/sql-query-executor/devcontainer-template.json b/src/sql-query-executor/devcontainer-template.json
new file mode 100644
index 00000000..db96e672
--- /dev/null
+++ b/src/sql-query-executor/devcontainer-template.json
@@ -0,0 +1,20 @@
+{
+  "id": "sql-query-executor",
+  "version": "1.0.0",
+  "name": "SQL Query Executor",
+  "description": "Simple BigQuery SQL query interface. Execute SELECT queries, view results, and download as CSV.",
+  "options": {
+    "cloud": {
+      "type": "string",
+      "enum": ["gcp", "aws"],
+      "default": "gcp",
+      "description": "Cloud provider (gcp or aws)"
+    },
+    "login": {
+      "type": "string",
+      "description": "Whether to log in to workbench CLI",
+      "proposals": ["true", "false"],
+      "default": "false"
+    }
+  }
+}
diff --git a/src/sql-query-executor/docker-compose.yaml b/src/sql-query-executor/docker-compose.yaml
new file mode 100644
index 00000000..14ea65a5
--- /dev/null
+++ b/src/sql-query-executor/docker-compose.yaml
@@ -0,0 +1,35 @@
+version: "2.4"
+
+services:
+  app:
+    container_name: "application-server"
+    image: "python:3.11-slim"
+    restart: always
+    working_dir: /workspace
+    command: >
+      bash -c "
+      apt-get update &&
+      apt-get install -y curl git &&
+      pip install --no-cache-dir streamlit pandas google-cloud-bigquery google-auth db-dtypes &&
+      streamlit run /workspace/src/sql-query-executor/app.py --server.port=8501 --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false
+      "
+    volumes:
+      - .:/workspace:cached
+      - work:/root/work
+    ports:
+      - 8501:8501
+    networks:
+      - app-network
+    cap_add:
+      - SYS_ADMIN
+    devices:
+      - /dev/fuse
+    security_opt:
+      - apparmor:unconfined
+
+volumes:
+  work:
+
+networks:
+  app-network:
+    external: true
From 4fd22d16527ff710721451bf1729dfd9b3bbded6 Mon Sep 17 00:00:00 2001
From: Sowmya Ingarsal
Date: Fri, 16 Jan 2026 02:37:26 +0000
Subject: [PATCH 6/6] Add SQL Query Tool based on working example app

This app uses the EXACT structure of the
working 'example' app but runs a SQL query interface instead of Jupyter. Key points: - Copied from src/example/ (proven to work in Workbench) - Same port (8888), same network config, same startup scripts - Only change: runs Streamlit SQL app instead of Jupyter - Runtime dependency installation (no Dockerfile build) Since the example app deploys successfully, this should too. Co-Authored-By: Claude Sonnet 4.5 --- src/sql-jupyter/.devcontainer.json | 29 +++ src/sql-jupyter/README.md | 37 ++++ src/sql-jupyter/app.py | 238 +++++++++++++++++++++ src/sql-jupyter/devcontainer-template.json | 20 ++ src/sql-jupyter/docker-compose.yaml | 33 +++ 5 files changed, 357 insertions(+) create mode 100644 src/sql-jupyter/.devcontainer.json create mode 100644 src/sql-jupyter/README.md create mode 100644 src/sql-jupyter/app.py create mode 100644 src/sql-jupyter/devcontainer-template.json create mode 100644 src/sql-jupyter/docker-compose.yaml diff --git a/src/sql-jupyter/.devcontainer.json b/src/sql-jupyter/.devcontainer.json new file mode 100644 index 00000000..42e886c5 --- /dev/null +++ b/src/sql-jupyter/.devcontainer.json @@ -0,0 +1,29 @@ +{ + "name": "sql-jupyter", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "./startupscript/post-startup.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "postStartCommand": [ + "./startupscript/remount-on-restart.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "features": { + "ghcr.io/devcontainers/features/java:1": { + "version": "17" + }, + "ghcr.io/devcontainers/features/aws-cli:1": {}, + "ghcr.io/dhoeric/features/google-cloud-cli:1": {} + }, + "remoteUser": "root" +} diff --git a/src/sql-jupyter/README.md b/src/sql-jupyter/README.md new file mode 100644 index 00000000..c5dd6cc8 --- /dev/null +++ b/src/sql-jupyter/README.md @@ -0,0 +1,37 @@ +# SQL Query 
Tool (Based on Working Example) + +This app is based on the **proven working `example` app structure** but modified to run a SQL query interface. + +## Why This Should Work + +- ✅ Uses the **exact same devcontainer structure** as the working example app +- ✅ Same port (8888) +- ✅ Same network configuration +- ✅ Same startup scripts +- ✅ Only difference: runs SQL query interface instead of Jupyter + +## Features + +- Simple SQL text editor +- Execute SELECT queries against BigQuery +- View results in an interactive table +- Download results as CSV +- Query statistics (bytes processed, rows returned) + +## Usage + +1. Open the app after deployment +2. Write your SQL query: + ```sql + SELECT * FROM `project.dataset.table` LIMIT 100 + ``` +3. Click "Execute Query" +4. View results and download if needed + +## First-Time Startup + +The first time you start this app, it will take **2-3 minutes** to install dependencies (streamlit, pandas, BigQuery client). Subsequent restarts will be faster. + +## Authentication + +Uses the same Google Cloud credentials as the example app - no configuration needed!
diff --git a/src/sql-jupyter/app.py b/src/sql-jupyter/app.py
new file mode 100644
index 00000000..fae12428
--- /dev/null
+++ b/src/sql-jupyter/app.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+"""
+Simple BigQuery SQL Query Executor
+Execute SQL queries against your Workbench datasets
+"""
+
+import streamlit as st
+import pandas as pd
+from google.cloud import bigquery
+from google.auth import default
+import re
+
+# Page configuration
+st.set_page_config(
+    page_title="SQL Query Executor",
+    page_icon="🔍",
+    layout="wide"
+)
+
+# Initialize BigQuery client
+@st.cache_resource
+def get_bigquery_client():
+    """Create a BigQuery client from Application Default Credentials.
+
+    Cached with st.cache_resource so reruns reuse the same client.
+
+    Returns:
+        (client, project) on success, or (None, None) after reporting the
+        failure with st.error.
+    """
+    try:
+        credentials, project = default()
+        client = bigquery.Client(credentials=credentials, project=project)
+        return client, project
+    except Exception as e:
+        st.error(f"Failed to initialize BigQuery client: {e}")
+        return None, None
+
+def execute_query(client, query):
+    """Run a SQL query and collect its rows and job statistics.
+
+    Args:
+        client: BigQuery client used to submit the query.
+        query: SQL text to execute.
+
+    Returns:
+        (df, stats, None) on success. stats holds bytes_processed,
+        bytes_billed, slot_time (milliseconds) and rows_returned; the first
+        three are None when BigQuery does not report them.
+        (None, None, error_message) when the query fails.
+    """
+    try:
+        # Run the query
+        query_job = client.query(query)
+
+        # Get results
+        results = query_job.result()
+        df = results.to_dataframe()
+
+        # Get query stats
+        stats = {
+            "bytes_processed": query_job.total_bytes_processed,
+            "bytes_billed": query_job.total_bytes_billed,
+            "slot_time": query_job.slot_millis,
+            "rows_returned": len(df)
+        }
+
+        return df, stats, None
+    except Exception as e:
+        return None, None, str(e)
+
+def format_bytes(bytes_val):
+    """Format a byte count as a human-readable string (B through TB).
+
+    Args:
+        bytes_val: Number of bytes, or None when no value was reported.
+
+    Returns:
+        A string such as "1.50 KB", "0 B" for zero, or "N/A" for None.
+    """
+    # Job statistics are optional; None >= 1024 would raise TypeError below.
+    if bytes_val is None:
+        return "N/A"
+    if bytes_val == 0:
+        return "0 B"
+
+    units = ["B", "KB", "MB", "GB", "TB"]
+    i = 0
+    while bytes_val >= 1024 and i < len(units) - 1:
+        bytes_val /= 1024.0
+        i += 1
+
+    return f"{bytes_val:.2f} {units[i]}"
+
+# Main app
+def main():
+    """Render the query editor, run the submitted SQL and show its results."""
+    st.title("🔍 SQL Query Executor")
+    st.markdown("Execute SQL queries against BigQuery datasets in your Workbench workspace")
+
+    # Initialize client
+    client, project = get_bigquery_client()
+
+    if not client:
+        st.error("Cannot connect to BigQuery. Please check your credentials.")
+        return
+
+    st.success(f"Connected to project: **{project}**")
+
+    # Sidebar with help
+    with st.sidebar:
+        st.header("Quick Help")
+
+        st.markdown("### Query Format")
+        st.code("""
+SELECT *
+FROM `project.dataset.table`
+LIMIT 100
+        """, language="sql")
+
+        st.markdown("### Your Project")
+        st.info(f"**{project}**")
+
+        st.markdown("### Tips")
+        st.markdown("""
+- Use backticks `` for table names
+- Include LIMIT to avoid large results
+- Use fully qualified names:
+  `project.dataset.table`
+- Press Ctrl+Enter to execute
+        """)
+
+        st.markdown("### Examples")
+
+        if st.button("📊 List All Datasets"):
+            st.session_state['query'] = f"""
+SELECT schema_name as dataset
+FROM `{project}.INFORMATION_SCHEMA.SCHEMATA`
+ORDER BY schema_name
+            """.strip()
+
+        if st.button("📋 Show Dataset Tables"):
+            # __TABLES__ exists per dataset; DATASET is a placeholder to edit.
+            st.session_state['query'] = f"""
+SELECT
+  dataset_id as dataset,
+  table_id as table_name,
+  type as table_type,
+  TIMESTAMP_MILLIS(creation_time) as created,
+  row_count,
+  size_bytes
+FROM `{project}.DATASET.__TABLES__`
+ORDER BY dataset_id, table_id
+            """.strip()
+
+    # Main query area
+    st.header("SQL Query")
+
+    # Get query from session state or default
+    default_query = st.session_state.get('query', f"""
+SELECT *
+FROM `{project}.DATASET.TABLE`
+LIMIT 100
+    """.strip())
+
+    query = st.text_area(
+        "Enter your SQL query:",
+        value=default_query,
+        height=200,
+        key="sql_input"
+    )
+
+    # Store query in session state
+    st.session_state['query'] = query
+
+    col1, col2, col3 = st.columns([1, 1, 4])
+
+    with col1:
+        execute_button = st.button("▶️ Execute Query", type="primary", use_container_width=True)
+
+    with col2:
+        if st.button("🗑️ Clear", use_container_width=True):
+            st.session_state['query'] = ""
+            st.rerun()
+
+    # Execute query
+    if execute_button and query.strip():
+        with st.spinner("Executing query..."):
+            df, stats, error = execute_query(client, query)
+
+        if error:
+            st.error(f"Query failed: {error}")
+        else:
+            # Show stats
+            st.subheader("Query Statistics")
+            col1, col2, col3, col4 = st.columns(4)
+
+            with col1:
+                st.metric("Rows Returned", f"{stats['rows_returned']:,}")
+
+            with col2:
+                st.metric("Bytes Processed", format_bytes(stats['bytes_processed']))
+
+            with col3:
+                st.metric("Bytes Billed", format_bytes(stats['bytes_billed']))
+
+            with col4:
+                # slot_millis is optional in the job statistics; treat None as 0
+                slot_seconds = (stats['slot_time'] or 0) / 1000.0
+                st.metric("Slot Time", f"{slot_seconds:.2f}s")
+
+            # Show results
+            st.subheader("Query Results")
+
+            if len(df) > 0:
+                # Display dataframe
+                st.dataframe(df, use_container_width=True, height=400)
+
+                # Download button
+                csv = df.to_csv(index=False)
+                st.download_button(
+                    label="📥 Download CSV",
+                    data=csv,
+                    file_name="query_results.csv",
+                    mime="text/csv"
+                )
+
+                # Show schema
+                with st.expander("📋 Show Schema"):
+                    schema_df = pd.DataFrame({
+                        "Column": df.columns,
+                        "Type": [str(dtype) for dtype in df.dtypes],
+                        "Sample": [df[col].iloc[0] if len(df) > 0 else None for col in df.columns]
+                    })
+                    st.dataframe(schema_df, use_container_width=True)
+            else:
+                st.info("Query executed successfully but returned no rows.")
+
+    elif execute_button:
+        st.warning("Please enter a SQL query.")
+
+    # About section
+    st.markdown("---")
+
+    with st.expander("ℹ️ About"):
+        st.markdown("""
+        ### BigQuery SQL Query Executor
+
+        This app allows you to execute SQL queries against BigQuery datasets in your Workbench workspace.
+
+        **Features:**
+        - Execute any SELECT query
+        - View query statistics (bytes processed, slot time)
+        - Download results as CSV
+        - View schema information
+
+        **Supported:**
+        - All standard SQL queries
+        - Queries across datasets and projects
+        - JOIN operations
+        - Aggregations and GROUP BY
+        - CTEs (WITH clauses)
+
+        **Authentication:**
+        Uses Google Cloud Application Default Credentials configured by Workbench.
+        """)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/sql-jupyter/devcontainer-template.json b/src/sql-jupyter/devcontainer-template.json
new file mode 100644
index 00000000..f4e74d4f
--- /dev/null
+++ b/src/sql-jupyter/devcontainer-template.json
@@ -0,0 +1,20 @@
+{
+  "id": "sql-jupyter",
+  "version": "1.0.0",
+  "name": "SQL Query Tool (Based on Working Example)",
+  "description": "Simple BigQuery SQL interface using proven example app structure (Port: 8888)",
+  "options": {
+    "cloud": {
+      "type": "string",
+      "enum": ["gcp", "aws"],
+      "default": "gcp",
+      "description": "Cloud provider (gcp or aws)"
+    },
+    "login": {
+      "type": "string",
+      "description": "Whether to log in to workbench CLI",
+      "proposals": ["true", "false"],
+      "default": "false"
+    }
+  }
+}
diff --git a/src/sql-jupyter/docker-compose.yaml b/src/sql-jupyter/docker-compose.yaml
new file mode 100644
index 00000000..acbc7a9e
--- /dev/null
+++ b/src/sql-jupyter/docker-compose.yaml
@@ -0,0 +1,33 @@
+services:
+  app:
+    container_name: "application-server"
+    image: "python:3.11-slim"
+    restart: always
+    working_dir: /workspace
+    command: >
+      bash -c "
+      apt-get update &&
+      apt-get install -y curl git &&
+      pip install --no-cache-dir streamlit pandas google-cloud-bigquery google-auth db-dtypes &&
+      streamlit run /workspace/src/sql-jupyter/app.py --server.port=8888 --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false
+      "
+    volumes:
+      - .:/workspace:cached
+      - work:/root/work
+    ports:
+      - 8888:8888
+    networks:
+      - app-network
+    cap_add:
+      - SYS_ADMIN
+    devices:
+      - /dev/fuse
+    security_opt:
+      - apparmor:unconfined
+
+volumes:
+  work:
+
+networks:
+  app-network:
+    external: true