diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
deleted file mode 100644
index 8c462bbd..00000000
--- a/.github/workflows/lint.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-name: lint
-on:
-  push:
-  pull_request:
-    types: [opened, reopened]
-jobs:
-  run-linters:
-    name: Run linters
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Check out Git repository
-        uses: actions/checkout@v2
-
-      - name: Set up Python
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.9
-
-      - name: Install Python dependencies
-        run: pip install black flake8
-
-      - name: Run linters
-        uses: wearerequired/lint-action@v1
-        with:
-          github_token: ${{ secrets.github_token }}
-          # Enable linters
-          black: true
-          flake8: true
-          # Mark the following line true if you want linters to attempt to
-          # autocorrect your code
-          auto_fix: true
-          git_name: "Greene Lab Linter"
-          git_email: "miltondp@gmail.com"
-          commit_message: "fix code style issues with ${linter}"
-
diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml
deleted file mode 100644
index 5abc2841..00000000
--- a/.github/workflows/pytest.yaml
+++ /dev/null
@@ -1,129 +0,0 @@
-name: tests
-on:
-  push:
-  pull_request:
-    types: [opened, reopened]
-
-env:
-  # Increase this value to reset cache if environment.yml has not changed.
-  PY_CACHE_NUMBER: 2
-  PY_ENV: ccc_gene_expr
-
-jobs:
-  ccc_pytest:
-    name: Python tests for CCC
-    runs-on: ${{ matrix.os }}
-    strategy:
-      max-parallel: 4
-      fail-fast: false
-      matrix:
-        python-version: ["3.10", "3.11"]
-        os: [ubuntu-latest, macOS-latest, windows-latest]
-    steps:
-      - name: Checkout git repo
-        uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install pytest "numpy<2.0" scipy numba pandas scikit-learn
-      - name: Test CCC with pytest
-        env:
-          PYTHONPATH: libs/
-        run: |
-          pytest tests/test_coef.py tests/test_pytorch_core.py tests/test_scipy_stats.py tests/test_sklearn_metrics.py
-
-#  pytest:
-#    name: Python tests for analyses
-#    runs-on: ${{ matrix.os }}
-#    strategy:
-#      max-parallel: 4
-#      fail-fast: false
-#      matrix:
-#        python-version: ["3.9"]
-#        os: [ubuntu-latest, macOS-latest, windows-latest]
-#    steps:
-#      - name: Checkout git repo
-#        uses: actions/checkout@v3
-#      - name: Cache conda
-#        id: cache
-#        uses: actions/cache@v3
-#        with:
-#          path: "${{ env.PY_ENV }}.tar.gz"
-#          key: ${{ runner.os }}-${{ env.PY_CACHE_NUMBER }}-${{ hashFiles('environment/environment.yml', 'environment/scripts/install_r_packages.r', 'environment/scripts/install_other_packages.sh') }}
-#      - name: Setup Miniconda
-#        if: steps.cache.outputs.cache-hit != 'true'
-#        uses: conda-incubator/setup-miniconda@v2
-#        with:
-#          activate-environment: ${{ env.PY_ENV }}
-#          environment-file: environment/environment.yml
-#          auto-activate-base: false
-#          miniforge-variant: Mambaforge
-#          miniforge-version: 'latest'
-#          use-mamba: true
-#      - name: Install other packages and Conda-Pack environment
-#        if: steps.cache.outputs.cache-hit != 'true'
-#        shell: bash -l {0}
-#        run: |
-#          # other packages (R packages mainly)
-#          bash environment/scripts/install_other_packages.sh
-#
-#          # install conda-pack, and pack environment
-#          conda install --yes -c conda-forge conda-pack coverage
-#          conda pack -f -n ${{ env.PY_ENV }} -o "${{ env.PY_ENV }}.tar.gz"
-#      - name: Unpack environment
-#        shell: bash -l {0}
-#        run: |
-#          mkdir -p "${{ env.PY_ENV }}"
-#          tar -xzf "${{ env.PY_ENV }}.tar.gz" -C "${{ env.PY_ENV }}"
-#      - name: Setup data and run pytest (Windows systems)
-#        if: runner.os == 'Windows'
-#        env:
-#          PYTHONPATH: libs/
-#        shell: cmd
-#        run: |
-#          echo on
-#          cd ${{ env.PY_ENV }}
-#          call .\Scripts\activate.bat
-#          .\Scripts\conda-unpack.exe
-#          cd ..
-#          set R_HOME=%CONDA_PREFIX%\Lib\R
-#          python environment\scripts\setup_data.py --mode testing
-#          pytest -v -rs tests
-#      - name: Setup data and run pytest (non-Windows systems)
-#        if: runner.os != 'Windows'
-#        shell: bash
-#        env:
-#          PYTHONPATH: libs/
-#        run: |
-#          source ${{ env.PY_ENV }}/bin/activate
-#          conda-unpack
-#
-#          python environment/scripts/setup_data.py --mode testing
-#
-#          if [ "$RUNNER_OS" == "Linux" ]; then
-#            # for linux/ubuntu, run the tests once: with numba jit activated
-#            # (which is the expected implementation) and with the jit
-#            # deactivated (otherwise coverage does not work).
-#
-#            # numba jit activated
-#            pytest -v -rs tests
-#
-#            # numba jit deactivated + code coverage
-#            export NUMBA_DISABLE_JIT=1
-#            coverage run --source=libs/ -m pytest -v -rs tests
-#            coverage xml -o coverage.xml
-#          else
-#            pytest -v -rs tests
-#          fi
-#      - name: Codecov upload
-#        if: runner.os == 'Linux'
-#        uses: codecov/codecov-action@v2
-#        with:
-#          files: ./coverage.xml
-#          name: codecov-${{ matrix.os }}-python${{ matrix.python-version }}
-#          fail_ci_if_error: true
-#          verbose: true
diff --git a/README.md b/README.md
index 14285186..c5a12bb7 100644
--- a/README.md
+++ b/README.md
@@ -73,17 +73,13 @@ cd ccc-gpu
 
 #### 2. Setup Environment with conda-lock
 
-This process uses a temporary environment to manage the conda-lock installation, keeping your base environment clean:
+This process uses [pipx](https://pipx.pypa.io/stable/) to install conda-lock in an isolated environment, keeping your base environment clean:
 
 > **Why conda-lock?** We use conda-lock to ensure **reproducible installations** across different systems. Unlike regular `environment.yml` files, conda-lock provides exact version pins for all packages and their dependencies, preventing version conflicts and ensuring you get the same environment that was tested during development.
 
 ```bash
-# Create temporary environment for conda-lock
-conda create -n ccc-gpu-setup python=3.10 -y  # or: mamba create -n ccc-gpu-setup python=3.10 -y
-conda activate ccc-gpu-setup
-
-# Install conda-lock in temporary environment
-conda install --channel=conda-forge conda-lock -y  # or: mamba install --channel=conda-forge conda-lock -y
+# Install conda-lock using pipx (installs in isolated environment)
+pipx install conda-lock
 
 # Create the main ccc-gpu environment from lock file
 conda-lock install --name ccc-gpu conda-lock.yml  # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba
@@ -95,32 +91,6 @@ conda activate ccc-gpu
 pip install .
 ```
 
-#### 3. Optional: Clean up temporary environment
-
-Once installation is complete, you can optionally remove the temporary setup environment:
-
-```bash
-# Remove temporary environment (optional)
-conda deactivate  # Make sure you're not in ccc-gpu-setup
-conda remove -n ccc-gpu-setup --all -y  # or: mamba remove -n ccc-gpu-setup --all -y
-```
-
-#### Alternative: Install conda-lock in base environment
-
-If you prefer to install conda-lock directly in your base environment:
-
-```bash
-# Option 1: Using pip
-pip install conda-lock
-
-# Option 2: Using conda
-conda install --channel=conda-forge conda-lock -y  # or: mamba install --channel=conda-forge conda-lock -y
-
-# Then create environment directly
-conda-lock install --name ccc-gpu conda-lock.yml  # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba
-conda activate ccc-gpu
-pip install .
-```
 
 > **Note**: If you prefer to use Mamba for faster package resolution, you can install MiniForge which includes Mamba:
 > ```bash
@@ -139,6 +109,10 @@ bash ./scripts/run_tests.sh python
 ```
 
 ## Usage
+### End-to-End Tutorial
+
+This [notebook](nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb) is a tutorial that walks through a simplified version of the analysis steps used in our paper, using the GTEx v8 data.
+
 
 ### Basic Usage
 
@@ -161,35 +135,16 @@ correlation = ccc(x, y)
 print(f"CCC coefficient: {correlation:.3f}")
 ```
 
-### Controlling Debug Logging
-
-By default, CCC-GPU runs silently without debug output. You can enable detailed logging (including CUDA device information, memory usage, and processing details) using the `CCC_GPU_LOGGING` environment variable:
-
-```bash
-# Run with default behavior (no debug output)
-python your_script.py
-
-# Enable debug logging for troubleshooting
-CCC_GPU_LOGGING=1 python your_script.py
-
-# Or set it for the session
-export CCC_GPU_LOGGING=1
-python your_script.py
-```
-
-This is particularly useful for:
-- Debugging GPU memory issues
-- Understanding CUDA device utilization
-- Monitoring batch processing performance
-- Troubleshooting installation problems
-
 ### Working with Gene Expression Data
 
 CCC-GPU is particularly useful for genomics applications:
 
 ```python
 import pandas as pd
-from ccc.coef import ccc
+# New CCC-GPU implementation import
+from ccc.coef.impl_gpu import ccc
+# Original CCC implementation import
+# from ccc.coef.impl import ccc
 
 # Load gene expression data
 # Assume genes are in columns, samples in rows
@@ -217,6 +172,28 @@ for i, j in zip(top_indices[0], top_indices[1]):
 
 Refer to the original CCC Repository for more usage examples: [https://github.com/greenelab/ccc](https://github.com/greenelab/ccc)
 
+### Controlling Debug Logging
+
+By default, CCC-GPU runs silently without debug output. You can enable detailed logging (including CUDA device information, memory usage, and processing details) using the `CCC_GPU_LOGGING` environment variable:
+
+```bash
+# Run with default behavior (no debug output)
+python your_script.py
+
+# Enable debug logging for troubleshooting
+CCC_GPU_LOGGING=1 python your_script.py
+
+# Or set it for the session
+export CCC_GPU_LOGGING=1
+python your_script.py
+```
+
+This is particularly useful for:
+- Debugging GPU memory issues
+- Understanding CUDA device utilization
+- Monitoring batch processing performance
+- Troubleshooting installation problems
+
 ## Performance Benchmarks
 
 CCC-GPU provides significant performance improvements over CPU-only implementations:
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 68596b54..70b1065c 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -65,19 +65,15 @@ Install from source using the provided conda-lock environment:
 2. Setup Environment with conda-lock
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-This process uses a temporary environment to manage the conda-lock installation, keeping your base environment clean:
+This process uses pipx to install conda-lock in an isolated environment, keeping your base environment clean:
 
 .. note::
    **Why conda-lock?** We use conda-lock to ensure **reproducible installations** across different systems. Unlike regular ``environment.yml`` files, conda-lock provides exact version pins for all packages and their dependencies, preventing version conflicts and ensuring you get the same environment that was tested during development.
 
 .. code-block:: bash
 
-    # Create temporary environment for conda-lock
-    conda create -n ccc-gpu-setup python=3.10 -y  # or: mamba create -n ccc-gpu-setup python=3.10 -y
-    conda activate ccc-gpu-setup
-
-    # Install conda-lock in temporary environment
-    conda install --channel=conda-forge conda-lock -y  # or: mamba install --channel=conda-forge conda-lock -y
+    # Install conda-lock using pipx (installs in isolated environment)
+    pipx install conda-lock
 
     # Create the main ccc-gpu environment from lock file
     conda-lock install --name ccc-gpu conda-lock.yml  # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba
@@ -88,21 +84,23 @@ This process uses a temporary environment to manage the conda-lock installation,
     # Install the package from source
     pip install .
 
-3. Optional: Clean up temporary environment
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. note::
+   If you don't have pipx installed, you can install it with ``pip install pipx`` or follow the `pipx installation guide <https://pipx.pypa.io/stable/installation/>`_.
+
+3. Optional: Remove conda-lock
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Once installation is complete, you can optionally remove the temporary setup environment:
+If you no longer need conda-lock after installation, you can remove it:
 
 .. code-block:: bash
 
-    # Remove temporary environment (optional)
-    conda deactivate  # Make sure you're not in ccc-gpu-setup
-    conda remove -n ccc-gpu-setup --all -y  # or: mamba remove -n ccc-gpu-setup --all -y
+    # Remove conda-lock (optional)
+    pipx uninstall conda-lock
 
 Alternative: Install conda-lock in base environment
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-If you prefer to install conda-lock directly in your base environment:
+If you prefer to install conda-lock directly in your base environment instead of using pipx:
 
 .. code-block:: bash
 
diff --git a/nbs/03-manuscript/40_prepare_supp_data/README.md b/nbs/03-manuscript/40_prepare_supp_data/README.md
index f972dcbe..6c733d8f 100644
--- a/nbs/03-manuscript/40_prepare_supp_data/README.md
+++ b/nbs/03-manuscript/40_prepare_supp_data/README.md
@@ -1,82 +1,188 @@
-# CCC Data Processing Script
+# CCC Data Processing Scripts
 
-This directory contains a script to process GTEx similarity matrices (.pkl files) and extract only the CCC (Clustered Correlation Coefficient) data.
+This directory contains scripts to process GTEx similarity matrices (.pkl files) and extract CCC (Clustermatch Correlation Coefficient) data in optimized formats for efficient storage and fast queries.
 
-## Script: `process_ccc_data.py`
+## Available Scripts
 
-### Description
-Processes all .pkl files in the source directory, extracts only the 'ccc' column with multi-indices, and saves individual .parquet files with snappy compression for each input. This significantly reduces file sizes compared to .pkl format.
+### 1. `process_ccc_to_duckdb.py` (Recommended)
 
-### Usage
+**Description:** Converts pickle files to DuckDB format for ultra-fast queries and efficient storage.
+
+#### Key Features
+- **Sub-millisecond query performance** for individual gene pairs
+- **4-5x better compression** than parquet format
+- **Minimal memory usage** (queries use <1GB RAM vs 13GB+ for parquet)
+- **SQL query capabilities** for complex analyses
+- **Support for both individual and consolidated databases**
+
+#### Usage
 
-#### Dry Run (recommended first step)
 ```bash
 # Activate the conda environment
 conda activate ccc-gpu
 
-# Run dry run to see what files would be processed
-python process_ccc_data.py --dry-run
+# Install DuckDB (if not already installed)
+pip install duckdb
+
+# Process all tissues into individual databases
+python process_ccc_to_duckdb.py \
+    --source-dir /mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all \
+    --output-dir /mnt/data/proj_data/ccc-gpu/manuscript_data/supplementary_data/ccc_duckdb
+
+# Create a single consolidated database (all tissues)
+python process_ccc_to_duckdb.py --single-db
+
+# Process specific tissues only
+python process_ccc_to_duckdb.py --tissues bladder brain_cortex
+
+# Dry run to see what would be processed
+python process_ccc_to_duckdb.py --dry-run
 ```
 
-#### Full Processing
+#### Arguments
+- `--source-dir`: Source directory with .pkl files
+- `--output-dir`: Output directory for DuckDB files
+- `--single-db`: Create one consolidated database instead of individual ones
+- `--tissues`: Process specific tissues only
+- `--dry-run`: Show what would be processed without doing it
+- `--debug`: Enable debug logging
+
+### 2. `ccc_duckdb_query.py` - Query Interface
+
+**Description:** Python wrapper for fast queries on DuckDB databases.
+
+#### Usage as Module
+
+```python
+from ccc_duckdb_query import CCCDatabase
+
+# Open database
+db = CCCDatabase("/path/to/bladder_ccc.duckdb")
+
+# Query single gene pair
+ccc = db.get_correlation("ENSG00000141510.16", "ENSG00000133703.11")
+
+# Get all correlations for a gene
+correlations = db.get_gene_correlations("ENSG00000141510.16", min_ccc=0.5)
+
+# Get top correlations
+top_pairs = db.get_top_correlations(threshold=0.9, limit=100)
+
+# Batch query multiple pairs
+pairs = [("gene1", "gene2"), ("gene3", "gene4")]
+results = db.get_batch_correlations(pairs)
+
+# Custom SQL query
+df = db.query("SELECT * FROM ccc_data WHERE ccc > 0.95 LIMIT 10")
+
+# Get database statistics
+stats = db.get_statistics()
+
+db.close()
+```
+
+#### Usage as CLI
+
+```bash
+# Get database statistics
+python ccc_duckdb_query.py /path/to/database.duckdb --stats
+
+# Query specific gene pair
+python ccc_duckdb_query.py /path/to/database.duckdb \
+    --gene1 ENSG00000141510.16 --gene2 ENSG00000133703.11
+
+# Get correlations for a gene
+python ccc_duckdb_query.py /path/to/database.duckdb \
+    --gene ENSG00000141510.16 --limit 50
+
+# Get top correlations
+python ccc_duckdb_query.py /path/to/database.duckdb \
+    --top 0.9 --limit 100
+```
+
+### 3. `process_ccc_data.py` (Legacy - Parquet Output)
+
+**Description:** Original script that creates parquet files. Kept for compatibility but DuckDB format is recommended.
+
 ```bash
 # Run with default paths
 python process_ccc_data.py
 
-# Run with custom paths
+# Custom paths
 python process_ccc_data.py --source-dir /path/to/source --output-dir /path/to/output
 ```
 
-### Arguments
-- `--source-dir`: Source directory containing .pkl files (default: `/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all`)
-- `--output-dir`: Output directory for processed parquet files (default: `/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet`)
-- `--dry-run`: Show what would be processed without actually doing it
-- `--debug`: Enable debug logging (shows detailed processing information)
-
-### Requirements
-- `pandas`: For reading .pkl files and writing .parquet files
-- `pyarrow`: Required for parquet format support with snappy compression
-- `tqdm`: For progress bars (optional, script will work without it)
-
-### Logging
-The script automatically creates detailed logs in the `logs/` directory with timestamps:
-- **Log location**: `logs/process_ccc_data_YYYYMMDD_HHMMSS.log`
-- **Log levels**: INFO (default) and DEBUG (with `--debug` flag)
-- **Log content**: Processing progress, file details, errors, timing information, and archive sizes
-- **Console output**: Key information is also printed to console for real-time monitoring
-
-Example log entries:
+## Performance Comparison
+
+| Metric | Parquet | DuckDB | Improvement |
+|--------|---------|---------|-------------|
+| Storage Size | 302 GB | ~60-80 GB | 4-5x smaller |
+| Load Time | >60s timeout | 0s (no loading) | Instant access |
+| Single Query | >100ms | <1ms | 100x+ faster |
+| Memory Usage | 13+ GB | <1 GB | 13x+ less |
+| Random Access | Very slow | Sub-millisecond | Orders of magnitude |
+
+## Requirements
+
+```bash
+# Core requirements
+conda activate ccc-gpu
+pip install pandas duckdb numpy tqdm
+
+# Optional for parquet support
+pip install pyarrow
+```
+
+## Output Structure
+
+### DuckDB Format (Recommended)
 ```
-2024-01-15 10:30:15,123 - INFO - Starting CCC data processing
-2024-01-15 10:30:15,124 - INFO - Found 54 .pkl files to process
-2024-01-15 10:30:16,200 - INFO - Processing file: gtex_v8_data_whole_blood-var_pc_log2-all.pkl
-2024-01-15 10:35:22,456 - INFO - Successfully processed gtex_v8_data_whole_blood-var_pc_log2-all.pkl
+/output_directory/
+├── bladder_ccc.duckdb          # Individual tissue databases
+├── brain_cortex_ccc.duckdb
+├── whole_blood_ccc.duckdb
+└── all_tissues_ccc.duckdb      # Optional consolidated database
 ```
 
-### Output
-The script will create individual `.parquet` files for each source file containing only CCC data with multi-indices preserved. Parquet format with snappy compression provides significant space savings compared to .pkl files while maintaining fast read/write performance.
+### Database Schema
+```sql
+-- Individual tissue table
+CREATE TABLE ccc_data (
+    gene1 VARCHAR NOT NULL,
+    gene2 VARCHAR NOT NULL,
+    ccc REAL NOT NULL,
+    PRIMARY KEY (gene1, gene2)
+);
+
+-- Indexes for fast lookups
+CREATE INDEX idx_gene2 ON ccc_data(gene2);
+CREATE INDEX idx_ccc ON ccc_data(ccc);
+```
 
-### File Naming Convention
-Input: `gtex_v8_data_whole_blood-var_pc_log2-all.pkl`
-Output: `gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet`
+## Example Workflow
 
-### Example
 ```bash
-# Activate environment and run
+# 1. Convert all pickle files to DuckDB
 conda activate ccc-gpu
-python process_ccc_data.py
+python process_ccc_to_duckdb.py
+
+# 2. Test query performance
+python ccc_duckdb_query.py /path/to/bladder_ccc.duckdb --stats
+```
+
+```python
+# 3. Use in Python scripts
+from ccc_duckdb_query import CCCDatabase
+
+with CCCDatabase("bladder_ccc.duckdb") as db:
+    # Fast queries for your analysis
+    ccc = db.get_correlation("gene1", "gene2")
+```
+
+## Advantages of DuckDB Format
 
-# Run with debug logging for more detailed information
-python process_ccc_data.py --debug
-
-# Expected output structure:
-# nbs/03-manuscript/40_prepare_supp_data/
-# └── logs/
-#     └── process_ccc_data_YYYYMMDD_HHMMSS.log
-#
-# /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/
-# ├── gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet
-# ├── gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet
-# ├── gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet
-# └── ... (54 individual .parquet files total)
-``` 
\ No newline at end of file
+1. **No Loading Required**: Direct queries without loading entire dataset
+2. **Memory Efficient**: Uses memory-mapped IO, minimal RAM footprint
+3. **Fast Random Access**: Indexed lookups in microseconds
+4. **SQL Support**: Complex queries and aggregations
+5. **Better Compression**: Columnar storage with efficient encoding
+6. **Concurrent Access**: Multiple readers can query simultaneously
+7. **ACID Compliance**: Data integrity guarantees 
\ No newline at end of file
diff --git a/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py b/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py
new file mode 100755
index 00000000..38938650
--- /dev/null
+++ b/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py
@@ -0,0 +1,414 @@
+#!/usr/bin/env python3
+"""
+Python wrapper for fast CCC correlation queries from DuckDB databases.
+
+This module provides a simple interface for querying gene pair correlations
+from the DuckDB databases created by process_ccc_to_duckdb.py.
+
+Example usage:
+    from ccc_duckdb_query import CCCDatabase
+
+    # Single tissue database
+    db = CCCDatabase("/path/to/bladder_ccc.duckdb")
+
+    # Query single gene pair
+    ccc_value = db.get_correlation("ENSG00000141510.16", "ENSG00000133703.11")
+
+    # Get all correlations for a gene
+    correlations = db.get_gene_correlations("ENSG00000141510.16")
+
+    # Get top correlations
+    top_pairs = db.get_top_correlations(threshold=0.9, limit=100)
+
+    # Batch query multiple pairs
+    pairs = [("gene1", "gene2"), ("gene3", "gene4")]
+    results = db.get_batch_correlations(pairs)
+"""
+
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional, Union
+import pandas as pd
+import duckdb
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class CCCDatabase:
+    """Wrapper for querying CCC correlation data from DuckDB databases."""
+
+    def __init__(self, db_path: Union[str, Path], tissue: Optional[str] = None):
+        """
+        Open a read-only connection to a CCC DuckDB database.
+
+        The database is treated as "consolidated" when it contains a table
+        named ``tissues`` and as "single" otherwise.
+
+        Args:
+            db_path: Path to DuckDB database file
+            tissue: Tissue name (for consolidated database)
+
+        Raises:
+            FileNotFoundError: If ``db_path`` does not exist
+            ValueError: If ``tissue`` is given but is not listed in a
+                consolidated database
+        """
+        self.db_path = Path(db_path)
+        if not self.db_path.exists():
+            raise FileNotFoundError(f"Database not found: {db_path}")
+
+        self.con = duckdb.connect(str(self.db_path), read_only=True)
+        self.tissue = tissue
+        # Always defined, so reading the attribute never raises AttributeError;
+        # it stays None for a consolidated database opened without a tissue.
+        self.table_name = None
+
+        try:
+            # Detect database type (single tissue or consolidated)
+            tables = self.con.execute("SHOW TABLES").fetchall()
+            table_names = [t[0] for t in tables]
+
+            if "tissues" in table_names:
+                # Consolidated database
+                self.db_type = "consolidated"
+                self.tissues = self._get_available_tissues()
+
+                if tissue:
+                    if tissue not in self.tissues:
+                        raise ValueError(f"Tissue '{tissue}' not found. Available: {self.tissues}")
+                    self.table_name = f"ccc_{tissue}"
+                else:
+                    logger.info(f"Consolidated database with {len(self.tissues)} tissues")
+                    logger.info(f"Available tissues: {', '.join(self.tissues[:5])}...")
+            else:
+                # Single tissue database
+                self.db_type = "single"
+                self.table_name = "ccc_data"
+                self.tissues = None
+        except Exception:
+            # Do not leak the open connection when initialization fails
+            self.con.close()
+            raise
+
+    def _get_available_tissues(self) -> List[str]:
+        """Return the tissue names stored in the ``tissues`` table, ordered by name."""
+        rows = self.con.execute("SELECT tissue_name FROM tissues ORDER BY tissue_name").fetchall()
+        tissue_names = []
+        for row in rows:
+            # Each row is a one-column record holding the tissue name
+            tissue_names.append(row[0])
+        return tissue_names
+
+    def get_correlation(self, gene1: str, gene2: str) -> Optional[float]:
+        """
+        Look up the CCC value stored for a single gene pair.
+
+        The pair is matched in either orientation, i.e. as (gene1, gene2)
+        or as (gene2, gene1).
+
+        Args:
+            gene1: First gene ID
+            gene2: Second gene ID
+
+        Returns:
+            CCC correlation value or None if not found
+
+        Raises:
+            ValueError: If the database is consolidated and no tissue was given
+        """
+        tissue_missing = self.tissue is None and self.db_type == "consolidated"
+        if tissue_missing:
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        # Try both orientations since correlation is symmetric
+        query = f"""
+            SELECT ccc FROM {self.table_name}
+            WHERE (gene1 = ? AND gene2 = ?)
+               OR (gene1 = ? AND gene2 = ?)
+            LIMIT 1
+        """
+
+        bound_values = [gene1, gene2, gene2, gene1]
+        row = self.con.execute(query, bound_values).fetchone()
+        if not row:
+            return None
+        return row[0]
+
+    def get_gene_correlations(
+        self,
+        gene: str,
+        min_ccc: Optional[float] = None,
+        limit: Optional[int] = None
+    ) -> pd.DataFrame:
+        """
+        Get all correlations for a specific gene.
+
+        Rows are matched whether the gene is stored in the ``gene1`` or the
+        ``gene2`` column; the other gene of each row is reported.
+
+        Args:
+            gene: Gene ID
+            min_ccc: Minimum CCC threshold (optional)
+            limit: Maximum number of results (optional)
+
+        Returns:
+            DataFrame with columns: gene_pair (the partner gene), ccc;
+            sorted by ccc in descending order
+
+        Raises:
+            ValueError: If the database is consolidated and no tissue was given
+        """
+        if self.tissue is None and self.db_type == "consolidated":
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        # Placeholders are bound in the order they appear in the query text:
+        # CASE, the two WHERE comparisons, then the optional filter and limit.
+        params = [gene, gene, gene]
+
+        # Optional values are bound as parameters rather than formatted into
+        # the SQL text, matching get_top_correlations().
+        where_clause = ""
+        if min_ccc is not None:
+            where_clause = "AND ccc >= ?"
+            params.append(min_ccc)
+
+        limit_clause = ""
+        if limit is not None:
+            limit_clause = "LIMIT ?"
+            params.append(limit)
+
+        query = f"""
+            SELECT
+                CASE
+                    WHEN gene1 = ? THEN gene2
+                    ELSE gene1
+                END as gene_pair,
+                ccc
+            FROM {self.table_name}
+            WHERE (gene1 = ? OR gene2 = ?)
+                {where_clause}
+            ORDER BY ccc DESC
+            {limit_clause}
+        """
+
+        return self.con.execute(query, params).df()
+
+    def get_top_correlations(
+        self,
+        threshold: float = 0.9,
+        limit: int = 100
+    ) -> pd.DataFrame:
+        """
+        Fetch the strongest correlations at or above a threshold.
+
+        Args:
+            threshold: Minimum CCC value (inclusive)
+            limit: Maximum number of results
+
+        Returns:
+            DataFrame with columns: gene1, gene2, ccc; sorted by ccc in
+            descending order
+
+        Raises:
+            ValueError: If the database is consolidated and no tissue was given
+        """
+        tissue_missing = self.tissue is None and self.db_type == "consolidated"
+        if tissue_missing:
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        query = f"""
+            SELECT gene1, gene2, ccc
+            FROM {self.table_name}
+            WHERE ccc >= ?
+            ORDER BY ccc DESC
+            LIMIT ?
+        """
+
+        bound_values = [threshold, limit]
+        return self.con.execute(query, bound_values).df()
+
+    def get_batch_correlations(
+        self,
+        pairs: List[Tuple[str, str]]
+    ) -> Dict[Tuple[str, str], Optional[float]]:
+        """
+        Get correlations for multiple gene pairs efficiently.
+
+        Each pair is matched in both orientations, consistent with
+        get_correlation(): a queried ("a", "b") is resolved by a stored row
+        ("a", "b") or, failing that, by a stored row ("b", "a").
+
+        Args:
+            pairs: List of (gene1, gene2) tuples
+
+        Returns:
+            Dictionary mapping every queried (gene1, gene2) tuple to its CCC
+            value, or to None when the pair is stored in neither orientation
+
+        Raises:
+            ValueError: If the database is consolidated and no tissue was given
+        """
+        if self.tissue is None and self.db_type == "consolidated":
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        if not pairs:
+            return {}
+
+        # Stage the requested pairs in a temporary table so that both lookups
+        # below are plain equality joins against the correlation table.
+        self.con.execute("CREATE TEMPORARY TABLE query_pairs (gene1 VARCHAR, gene2 VARCHAR)")
+        try:
+            self.con.executemany(
+                "INSERT INTO query_pairs VALUES (?, ?)",
+                [[g1, g2] for g1, g2 in pairs],
+            )
+
+            # Pairs stored in the same orientation as they were queried
+            forward_query = f"""
+                SELECT qp.gene1, qp.gene2, c.ccc
+                FROM query_pairs qp
+                JOIN {self.table_name} c
+                    ON qp.gene1 = c.gene1 AND qp.gene2 = c.gene2
+            """
+            # Pairs stored in the opposite orientation
+            reverse_query = f"""
+                SELECT qp.gene1, qp.gene2, c.ccc
+                FROM query_pairs qp
+                JOIN {self.table_name} c
+                    ON qp.gene1 = c.gene2 AND qp.gene2 = c.gene1
+            """
+
+            forward_rows = self.con.execute(forward_query).fetchall()
+            reverse_rows = self.con.execute(reverse_query).fetchall()
+        finally:
+            # Always drop the staging table; a leftover table would make the
+            # CREATE statement of the next call fail.
+            self.con.execute("DROP TABLE IF EXISTS query_pairs")
+
+        result_dict = {}
+        for g1, g2, ccc_value in forward_rows:
+            result_dict[(g1, g2)] = ccc_value
+
+        # A reverse-orientation match only fills in pairs that had no
+        # forward-orientation match.
+        for g1, g2, ccc_value in reverse_rows:
+            result_dict.setdefault((g1, g2), ccc_value)
+
+        # Add None for missing pairs
+        for g1, g2 in pairs:
+            result_dict.setdefault((g1, g2), None)
+
+        return result_dict
+
+    def get_cross_tissue_correlation(
+        self,
+        gene1: str,
+        gene2: str
+    ) -> pd.DataFrame:
+        """
+        Fetch one gene pair's CCC value in every tissue (consolidated database only).
+
+        The pair is matched in either orientation against the
+        ``all_correlations`` relation.
+
+        Args:
+            gene1: First gene ID
+            gene2: Second gene ID
+
+        Returns:
+            DataFrame with columns: tissue, ccc; sorted by ccc in descending
+            order
+
+        Raises:
+            ValueError: If the database is not a consolidated database
+        """
+        is_consolidated = self.db_type == "consolidated"
+        if not is_consolidated:
+            raise ValueError("Cross-tissue query requires consolidated database")
+
+        query = """
+            SELECT tissue, ccc
+            FROM all_correlations
+            WHERE (gene1 = ? AND gene2 = ?)
+               OR (gene1 = ? AND gene2 = ?)
+            ORDER BY ccc DESC
+        """
+
+        bound_values = [gene1, gene2, gene2, gene1]
+        return self.con.execute(query, bound_values).df()
+
+    def query(self, sql: str, parameters: Optional[List] = None) -> pd.DataFrame:
+        """
+        Run an arbitrary SQL statement and return its result as a DataFrame.
+
+        Args:
+            sql: SQL query string
+            parameters: Query parameters (optional); None or an empty list
+                runs the statement without bound parameters
+
+        Returns:
+            Query results as DataFrame
+        """
+        if not parameters:
+            # No parameters to bind
+            relation = self.con.execute(sql)
+        else:
+            relation = self.con.execute(sql, parameters)
+        return relation.df()
+
+    def get_statistics(self) -> Dict:
+        """
+        Summarise the contents of the open database.
+
+        A consolidated database reports aggregates over the tissues table,
+        plus the row of self.tissue when that is set and present; any other
+        database type reports aggregates over self.table_name.
+
+        Returns:
+            Dictionary of statistics; 'type' and 'database_size_gb' are
+            always present
+        """
+        if self.db_type == "consolidated":
+            # Roll the per-tissue summary rows up into global figures
+            overall = self.con.execute("""
+                SELECT
+                    COUNT(*) as num_tissues,
+                    SUM(num_pairs) as total_pairs,
+                    MIN(min_ccc) as global_min_ccc,
+                    MAX(max_ccc) as global_max_ccc,
+                    AVG(mean_ccc) as avg_mean_ccc
+                FROM tissues
+            """).fetchone()
+            overall_keys = (
+                'num_tissues', 'total_pairs', 'global_min_ccc',
+                'global_max_ccc', 'avg_mean_ccc',
+            )
+            stats = {'type': 'consolidated'}
+            stats.update(zip(overall_keys, overall))
+
+            if self.tissue:
+                # Row for the selected tissue; skipped below if none came back
+                selected = self.con.execute("""
+                    SELECT num_pairs, min_ccc, max_ccc, mean_ccc
+                    FROM tissues
+                    WHERE tissue_name = ?
+                """, [self.tissue]).fetchone()
+
+                if selected:
+                    tissue_keys = (
+                        'tissue_pairs', 'tissue_min_ccc',
+                        'tissue_max_ccc', 'tissue_mean_ccc',
+                    )
+                    stats['tissue'] = self.tissue
+                    stats.update(zip(tissue_keys, selected))
+
+        else:
+            # Single tissue database: aggregate directly over its table
+            summary = self.con.execute(f"""
+                SELECT
+                    COUNT(*) as num_pairs,
+                    MIN(ccc) as min_ccc,
+                    MAX(ccc) as max_ccc,
+                    AVG(ccc) as mean_ccc
+                FROM {self.table_name}
+            """).fetchone()
+            single_keys = ('num_pairs', 'min_ccc', 'max_ccc', 'mean_ccc')
+            stats = {'type': 'single'}
+            stats.update(zip(single_keys, summary))
+
+        # Size of the database file on disk, in units of 1024**3 bytes
+        file_bytes = self.db_path.stat().st_size
+        stats['database_size_gb'] = file_bytes / (1024**3)
+        return stats
+
+    def close(self):
+        """Close the database connection held in self.con."""
+        self.con.close()
+
+    def __enter__(self):
+        """Enter a with-block, returning this instance as the bound value."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Close the connection on leaving a with-block; exceptions propagate."""
+        self.close()
+
+
+def main():
+    """
+    Simple CLI: run one query, picked in the order --stats, --gene1/--gene2,
+    --gene, --top; print a usage hint when none is given.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Query CCC correlation database")
+    parser.add_argument("database", help="Path to DuckDB database")
+    parser.add_argument("--tissue", help="Tissue name (for consolidated database)")
+    parser.add_argument("--gene1", help="First gene ID")
+    parser.add_argument("--gene2", help="Second gene ID")
+    parser.add_argument("--gene", help="Get all correlations for this gene")
+    parser.add_argument("--top", type=float, help="Get top correlations above threshold")
+    parser.add_argument("--limit", type=int, default=100, help="Limit number of results")
+    parser.add_argument("--stats", action="store_true", help="Show database statistics")
+
+    args = parser.parse_args()
+
+    # Open the database; the context manager closes it on exit
+    with CCCDatabase(args.database, tissue=args.tissue) as db:
+        if args.stats:
+            stats = db.get_statistics()
+            print("\nDatabase Statistics:")
+            for key, value in stats.items():
+                shown = f"{value:.4f}" if isinstance(value, float) else value
+                print(f"  {key}: {shown}")
+
+        elif args.gene1 and args.gene2:
+            # Query specific pair
+            ccc = db.get_correlation(args.gene1, args.gene2)
+            if ccc is not None:
+                print(f"CCC({args.gene1}, {args.gene2}) = {ccc:.6f}")
+            else:
+                print(f"No correlation found for pair ({args.gene1}, {args.gene2})")
+
+        elif args.gene:
+            # Get all correlations for gene
+            results = db.get_gene_correlations(args.gene, limit=args.limit)
+            print(f"\nTop {len(results)} correlations for {args.gene}:")
+            print(results.to_string())
+
+        elif args.top is not None:
+            # Test against None, not truthiness: `--top 0` (falsy 0.0) is valid
+            results = db.get_top_correlations(threshold=args.top, limit=args.limit)
+            print(f"\nTop {len(results)} correlations above {args.top}:")
+            print(results.to_string())
+
+        else:
+            print("Please specify a query option (--gene1/--gene2, --gene, --top, or --stats)")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
\ No newline at end of file
diff --git a/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log b/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
deleted file mode 100644
index 0508e68b..00000000
--- a/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
+++ /dev/null
@@ -1,392 +0,0 @@
-2025-09-11 22:50:04,101 - INFO - Starting CCC data processing
-2025-09-11 22:50:04,101 - INFO - Log file: /home/haoyu/_database/projs/ccc-gpu/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
-2025-09-11 22:50:04,101 - DEBUG - Debug logging enabled
-2025-09-11 22:50:04,101 - INFO - Script arguments: source_dir=/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all, output_dir=/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet, dry_run=False
-2025-09-11 22:50:04,101 - INFO - Scanning directory for .pkl files: /mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all
-2025-09-11 22:50:04,102 - INFO - Found 54 .pkl files to process
-2025-09-11 22:50:04,102 - DEBUG - First few files: ['gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl', 'gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl', 'gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl', 'gtex_v8_data_artery_aorta-var_pc_log2-all.pkl', 'gtex_v8_data_artery_coronary-var_pc_log2-all.pkl']
-2025-09-11 22:50:04,102 - INFO - Output directory created/verified: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet
-2025-09-11 22:50:04,102 - INFO - Starting processing of 54 files
-2025-09-11 22:50:04,104 - DEBUG - Processing file 1/54: gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl
-2025-09-11 22:50:04,104 - INFO - Processing file: gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl
-2025-09-11 22:50:38,091 - DEBUG - Loaded data shape: (1460025703, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 22:50:40,159 - DEBUG - Extracted CCC data shape: (1460025703, 1)
-2025-09-11 22:50:40,159 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:54:34,827 - INFO - Successfully processed gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl -> gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:54:34,827 - INFO - Size reduction: 19.04 GB -> 6.55 GB (65.6% smaller)
-2025-09-11 22:54:36,104 - DEBUG - Processing file 2/54: gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl
-2025-09-11 22:54:36,105 - INFO - Processing file: gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl
-2025-09-11 22:55:12,030 - DEBUG - Loaded data shape: (1440046611, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 22:55:14,077 - DEBUG - Extracted CCC data shape: (1440046611, 1)
-2025-09-11 22:55:14,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:59:03,657 - INFO - Successfully processed gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl -> gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:59:03,657 - INFO - Size reduction: 18.78 GB -> 6.35 GB (66.2% smaller)
-2025-09-11 22:59:04,942 - DEBUG - Processing file 3/54: gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl
-2025-09-11 22:59:04,942 - INFO - Processing file: gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl
-2025-09-11 22:59:38,946 - DEBUG - Loaded data shape: (1358012670, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 22:59:40,880 - DEBUG - Extracted CCC data shape: (1358012670, 1)
-2025-09-11 22:59:40,880 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:03:24,150 - INFO - Successfully processed gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl -> gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:03:24,150 - INFO - Size reduction: 17.71 GB -> 5.59 GB (68.4% smaller)
-2025-09-11 23:03:25,363 - DEBUG - Processing file 4/54: gtex_v8_data_artery_aorta-var_pc_log2-all.pkl
-2025-09-11 23:03:25,363 - INFO - Processing file: gtex_v8_data_artery_aorta-var_pc_log2-all.pkl
-2025-09-11 23:04:00,118 - DEBUG - Loaded data shape: (1419832116, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:04:02,111 - DEBUG - Extracted CCC data shape: (1419832116, 1)
-2025-09-11 23:04:02,112 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_aorta-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:07:50,792 - INFO - Successfully processed gtex_v8_data_artery_aorta-var_pc_log2-all.pkl -> gtex_v8_data_artery_aorta-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:07:50,793 - INFO - Size reduction: 18.51 GB -> 6.18 GB (66.6% smaller)
-2025-09-11 23:07:52,114 - DEBUG - Processing file 5/54: gtex_v8_data_artery_coronary-var_pc_log2-all.pkl
-2025-09-11 23:07:52,114 - INFO - Processing file: gtex_v8_data_artery_coronary-var_pc_log2-all.pkl
-2025-09-11 23:08:31,245 - DEBUG - Loaded data shape: (1373430255, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:08:33,214 - DEBUG - Extracted CCC data shape: (1373430255, 1)
-2025-09-11 23:08:33,214 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_coronary-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:12:11,613 - INFO - Successfully processed gtex_v8_data_artery_coronary-var_pc_log2-all.pkl -> gtex_v8_data_artery_coronary-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:12:11,613 - INFO - Size reduction: 17.91 GB -> 5.74 GB (67.9% smaller)
-2025-09-11 23:12:12,970 - DEBUG - Processing file 6/54: gtex_v8_data_artery_tibial-var_pc_log2-all.pkl
-2025-09-11 23:12:12,970 - INFO - Processing file: gtex_v8_data_artery_tibial-var_pc_log2-all.pkl
-2025-09-11 23:12:48,104 - DEBUG - Loaded data shape: (1454033701, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:12:50,137 - DEBUG - Extracted CCC data shape: (1454033701, 1)
-2025-09-11 23:12:50,137 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:16:47,353 - INFO - Successfully processed gtex_v8_data_artery_tibial-var_pc_log2-all.pkl -> gtex_v8_data_artery_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:16:47,353 - INFO - Size reduction: 18.96 GB -> 6.49 GB (65.8% smaller)
-2025-09-11 23:16:48,739 - DEBUG - Processing file 7/54: gtex_v8_data_bladder-var_pc_log2-all.pkl
-2025-09-11 23:16:48,739 - INFO - Processing file: gtex_v8_data_bladder-var_pc_log2-all.pkl
-2025-09-11 23:17:16,930 - DEBUG - Loaded data shape: (995271420, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:17:18,340 - DEBUG - Extracted CCC data shape: (995271420, 1)
-2025-09-11 23:17:18,340 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_bladder-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:19:53,633 - INFO - Successfully processed gtex_v8_data_bladder-var_pc_log2-all.pkl -> gtex_v8_data_bladder-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:19:53,633 - INFO - Size reduction: 12.98 GB -> 3.37 GB (74.0% smaller)
-2025-09-11 23:19:54,610 - DEBUG - Processing file 8/54: gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl
-2025-09-11 23:19:54,611 - INFO - Processing file: gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl
-2025-09-11 23:20:26,794 - DEBUG - Loaded data shape: (1313153128, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:20:28,666 - DEBUG - Extracted CCC data shape: (1313153128, 1)
-2025-09-11 23:20:28,666 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_amygdala-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:24:01,118 - INFO - Successfully processed gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl -> gtex_v8_data_brain_amygdala-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:24:01,118 - INFO - Size reduction: 17.12 GB -> 5.14 GB (70.0% smaller)
-2025-09-11 23:24:02,286 - DEBUG - Processing file 9/54: gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl
-2025-09-11 23:24:02,286 - INFO - Processing file: gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl
-2025-09-11 23:24:40,076 - DEBUG - Loaded data shape: (1345637503, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:24:41,958 - DEBUG - Extracted CCC data shape: (1345637503, 1)
-2025-09-11 23:24:41,958 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:28:18,530 - INFO - Successfully processed gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl -> gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:28:18,530 - INFO - Size reduction: 17.55 GB -> 5.44 GB (69.0% smaller)
-2025-09-11 23:28:19,802 - DEBUG - Processing file 10/54: gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl
-2025-09-11 23:28:19,802 - INFO - Processing file: gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl
-2025-09-11 23:28:52,599 - DEBUG - Loaded data shape: (1377836265, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:28:54,575 - DEBUG - Extracted CCC data shape: (1377836265, 1)
-2025-09-11 23:28:54,576 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:32:34,520 - INFO - Successfully processed gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:32:34,520 - INFO - Size reduction: 17.97 GB -> 5.74 GB (68.0% smaller)
-2025-09-11 23:32:35,848 - DEBUG - Processing file 11/54: gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl
-2025-09-11 23:32:35,848 - INFO - Processing file: gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl
-2025-09-11 23:33:09,984 - DEBUG - Loaded data shape: (1357283151, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:33:11,931 - DEBUG - Extracted CCC data shape: (1357283151, 1)
-2025-09-11 23:33:11,931 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:36:52,280 - INFO - Successfully processed gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl -> gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:36:52,280 - INFO - Size reduction: 17.70 GB -> 5.58 GB (68.4% smaller)
-2025-09-11 23:36:53,574 - DEBUG - Processing file 12/54: gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl
-2025-09-11 23:36:53,574 - INFO - Processing file: gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl
-2025-09-11 23:37:32,215 - DEBUG - Loaded data shape: (1373692320, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:37:34,080 - DEBUG - Extracted CCC data shape: (1373692320, 1)
-2025-09-11 23:37:34,080 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cerebellum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:41:32,142 - INFO - Successfully processed gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl -> gtex_v8_data_brain_cerebellum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:41:32,142 - INFO - Size reduction: 17.91 GB -> 5.75 GB (67.9% smaller)
-2025-09-11 23:41:33,450 - DEBUG - Processing file 13/54: gtex_v8_data_brain_cortex-var_pc_log2-all.pkl
-2025-09-11 23:41:33,450 - INFO - Processing file: gtex_v8_data_brain_cortex-var_pc_log2-all.pkl
-2025-09-11 23:42:24,751 - DEBUG - Loaded data shape: (1428371076, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:42:26,765 - DEBUG - Extracted CCC data shape: (1428371076, 1)
-2025-09-11 23:42:26,765 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:46:22,020 - INFO - Successfully processed gtex_v8_data_brain_cortex-var_pc_log2-all.pkl -> gtex_v8_data_brain_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:46:22,020 - INFO - Size reduction: 18.63 GB -> 6.26 GB (66.4% smaller)
-2025-09-11 23:46:23,372 - DEBUG - Processing file 14/54: gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl
-2025-09-11 23:46:23,372 - INFO - Processing file: gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl
-2025-09-11 23:47:00,136 - DEBUG - Loaded data shape: (1359576585, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:47:01,992 - DEBUG - Extracted CCC data shape: (1359576585, 1)
-2025-09-11 23:47:01,992 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:50:37,217 - INFO - Successfully processed gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl -> gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:50:37,217 - INFO - Size reduction: 17.73 GB -> 5.59 GB (68.5% smaller)
-2025-09-11 23:50:38,531 - DEBUG - Processing file 15/54: gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl
-2025-09-11 23:50:38,531 - INFO - Processing file: gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl
-2025-09-11 23:52:04,330 - DEBUG - Loaded data shape: (1381565895, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:52:06,311 - DEBUG - Extracted CCC data shape: (1381565895, 1)
-2025-09-11 23:52:06,311 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_hippocampus-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:55:49,825 - INFO - Successfully processed gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl -> gtex_v8_data_brain_hippocampus-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:55:49,825 - INFO - Size reduction: 18.01 GB -> 5.78 GB (67.9% smaller)
-2025-09-11 23:55:51,065 - DEBUG - Processing file 16/54: gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl
-2025-09-11 23:55:51,065 - INFO - Processing file: gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl
-2025-09-11 23:56:27,935 - DEBUG - Loaded data shape: (1371020430, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:56:29,884 - DEBUG - Extracted CCC data shape: (1371020430, 1)
-2025-09-11 23:56:29,884 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_hypothalamus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:00:09,533 - INFO - Successfully processed gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl -> gtex_v8_data_brain_hypothalamus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:00:09,533 - INFO - Size reduction: 17.88 GB -> 5.67 GB (68.3% smaller)
-2025-09-12 00:00:10,797 - DEBUG - Processing file 17/54: gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:00:10,797 - INFO - Processing file: gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:00:44,756 - DEBUG - Loaded data shape: (1389198405, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:00:46,719 - DEBUG - Extracted CCC data shape: (1389198405, 1)
-2025-09-12 00:00:46,719 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:05:05,403 - INFO - Successfully processed gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:05:05,403 - INFO - Size reduction: 18.11 GB -> 5.84 GB (67.7% smaller)
-2025-09-12 00:05:06,693 - DEBUG - Processing file 18/54: gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:05:06,693 - INFO - Processing file: gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:05:39,317 - DEBUG - Loaded data shape: (1336936195, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:05:41,239 - DEBUG - Extracted CCC data shape: (1336936195, 1)
-2025-09-12 00:05:41,239 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:09:17,133 - INFO - Successfully processed gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:09:17,134 - INFO - Size reduction: 17.43 GB -> 5.35 GB (69.3% smaller)
-2025-09-12 00:09:18,419 - DEBUG - Processing file 19/54: gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl
-2025-09-12 00:09:18,419 - INFO - Processing file: gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl
-2025-09-12 00:09:55,385 - DEBUG - Loaded data shape: (1305886065, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:09:57,241 - DEBUG - Extracted CCC data shape: (1305886065, 1)
-2025-09-12 00:09:57,241 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:13:24,327 - INFO - Successfully processed gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl -> gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:13:24,328 - INFO - Size reduction: 17.03 GB -> 5.07 GB (70.2% smaller)
-2025-09-12 00:13:25,591 - DEBUG - Processing file 20/54: gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl
-2025-09-12 00:13:25,591 - INFO - Processing file: gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl
-2025-09-12 00:13:56,027 - DEBUG - Loaded data shape: (1278940600, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:13:57,877 - DEBUG - Extracted CCC data shape: (1278940600, 1)
-2025-09-12 00:13:57,877 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_substantia_nigra-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:17:25,692 - INFO - Successfully processed gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl -> gtex_v8_data_brain_substantia_nigra-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:17:25,692 - INFO - Size reduction: 16.68 GB -> 4.82 GB (71.1% smaller)
-2025-09-12 00:17:26,928 - DEBUG - Processing file 21/54: gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl
-2025-09-12 00:17:26,928 - INFO - Processing file: gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl
-2025-09-12 00:18:07,581 - DEBUG - Loaded data shape: (1452847560, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:18:09,592 - DEBUG - Extracted CCC data shape: (1452847560, 1)
-2025-09-12 00:18:09,593 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_breast_mammary_tissue-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:22:35,756 - INFO - Successfully processed gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl -> gtex_v8_data_breast_mammary_tissue-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:22:35,756 - INFO - Size reduction: 18.94 GB -> 6.50 GB (65.7% smaller)
-2025-09-12 00:22:37,092 - DEBUG - Processing file 22/54: gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl
-2025-09-12 00:22:37,092 - INFO - Processing file: gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl
-2025-09-12 00:23:16,518 - DEBUG - Loaded data shape: (1401877725, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:23:18,482 - DEBUG - Extracted CCC data shape: (1401877725, 1)
-2025-09-12 00:23:18,482 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:27:42,509 - INFO - Successfully processed gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl -> gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:27:42,510 - INFO - Size reduction: 18.28 GB -> 6.01 GB (67.1% smaller)
-2025-09-12 00:27:43,822 - DEBUG - Processing file 23/54: gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl
-2025-09-12 00:27:43,822 - INFO - Processing file: gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl
-2025-09-12 00:28:16,439 - DEBUG - Loaded data shape: (1338539670, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:28:18,322 - DEBUG - Extracted CCC data shape: (1338539670, 1)
-2025-09-12 00:28:18,322 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:31:50,501 - INFO - Successfully processed gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl -> gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:31:50,501 - INFO - Size reduction: 17.45 GB -> 5.43 GB (68.9% smaller)
-2025-09-12 00:31:51,756 - DEBUG - Processing file 24/54: gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl
-2025-09-12 00:31:51,756 - INFO - Processing file: gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl
-2025-09-12 00:32:16,338 - DEBUG - Loaded data shape: (871468626, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:32:17,565 - DEBUG - Extracted CCC data shape: (871468626, 1)
-2025-09-12 00:32:17,565 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cervix_ectocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:34:32,270 - INFO - Successfully processed gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl -> gtex_v8_data_cervix_ectocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:34:32,270 - INFO - Size reduction: 11.36 GB -> 2.56 GB (77.4% smaller)
-2025-09-12 00:34:33,139 - DEBUG - Processing file 25/54: gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl
-2025-09-12 00:34:33,139 - INFO - Processing file: gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl
-2025-09-12 00:34:57,887 - DEBUG - Loaded data shape: (883533666, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:34:59,115 - DEBUG - Extracted CCC data shape: (883533666, 1)
-2025-09-12 00:34:59,116 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cervix_endocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:37:14,667 - INFO - Successfully processed gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl -> gtex_v8_data_cervix_endocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:37:14,667 - INFO - Size reduction: 11.52 GB -> 2.70 GB (76.6% smaller)
-2025-09-12 00:37:15,482 - DEBUG - Processing file 26/54: gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl
-2025-09-12 00:37:15,482 - INFO - Processing file: gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl
-2025-09-12 00:38:40,776 - DEBUG - Loaded data shape: (1414189153, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:38:42,695 - DEBUG - Extracted CCC data shape: (1414189153, 1)
-2025-09-12 00:38:42,695 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_colon_sigmoid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:42:32,231 - INFO - Successfully processed gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl -> gtex_v8_data_colon_sigmoid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:42:32,231 - INFO - Size reduction: 18.44 GB -> 6.12 GB (66.8% smaller)
-2025-09-12 00:42:33,416 - DEBUG - Processing file 27/54: gtex_v8_data_colon_transverse-var_pc_log2-all.pkl
-2025-09-12 00:42:33,416 - INFO - Processing file: gtex_v8_data_colon_transverse-var_pc_log2-all.pkl
-2025-09-12 00:43:13,557 - DEBUG - Loaded data shape: (1425646503, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:43:15,585 - DEBUG - Extracted CCC data shape: (1425646503, 1)
-2025-09-12 00:43:15,585 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_colon_transverse-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:47:04,970 - INFO - Successfully processed gtex_v8_data_colon_transverse-var_pc_log2-all.pkl -> gtex_v8_data_colon_transverse-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:47:04,971 - INFO - Size reduction: 18.59 GB -> 6.22 GB (66.6% smaller)
-2025-09-12 00:47:06,372 - DEBUG - Processing file 28/54: gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl
-2025-09-12 00:47:06,372 - INFO - Processing file: gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl
-2025-09-12 00:47:40,754 - DEBUG - Loaded data shape: (1407708330, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:47:42,712 - DEBUG - Extracted CCC data shape: (1407708330, 1)
-2025-09-12 00:47:42,712 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:51:30,851 - INFO - Successfully processed gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:51:30,852 - INFO - Size reduction: 18.36 GB -> 6.05 GB (67.0% smaller)
-2025-09-12 00:51:32,216 - DEBUG - Processing file 29/54: gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl
-2025-09-12 00:51:32,217 - INFO - Processing file: gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl
-2025-09-12 00:52:12,189 - DEBUG - Loaded data shape: (1429226380, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:52:14,202 - DEBUG - Extracted CCC data shape: (1429226380, 1)
-2025-09-12 00:52:14,202 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_mucosa-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:56:04,136 - INFO - Successfully processed gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_mucosa-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:56:04,136 - INFO - Size reduction: 18.64 GB -> 6.26 GB (66.4% smaller)
-2025-09-12 00:56:05,503 - DEBUG - Processing file 30/54: gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl
-2025-09-12 00:56:05,504 - INFO - Processing file: gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl
-2025-09-12 00:56:40,714 - DEBUG - Loaded data shape: (1438705261, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:56:42,738 - DEBUG - Extracted CCC data shape: (1438705261, 1)
-2025-09-12 00:56:42,738 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_muscularis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:01:04,618 - INFO - Successfully processed gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_muscularis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:01:04,618 - INFO - Size reduction: 18.76 GB -> 6.34 GB (66.2% smaller)
-2025-09-12 01:01:05,944 - DEBUG - Processing file 31/54: gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl
-2025-09-12 01:01:05,944 - INFO - Processing file: gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl
-2025-09-12 01:01:30,707 - DEBUG - Loaded data shape: (869799486, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:01:31,984 - DEBUG - Extracted CCC data shape: (869799486, 1)
-2025-09-12 01:01:31,984 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_fallopian_tube-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:03:48,060 - INFO - Successfully processed gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl -> gtex_v8_data_fallopian_tube-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:03:48,061 - INFO - Size reduction: 11.34 GB -> 2.57 GB (77.4% smaller)
-2025-09-12 01:03:48,926 - DEBUG - Processing file 32/54: gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl
-2025-09-12 01:03:48,926 - INFO - Processing file: gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl
-2025-09-12 01:04:24,279 - DEBUG - Loaded data shape: (1416051153, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:04:26,222 - DEBUG - Extracted CCC data shape: (1416051153, 1)
-2025-09-12 01:04:26,222 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_heart_atrial_appendage-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:08:16,477 - INFO - Successfully processed gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl -> gtex_v8_data_heart_atrial_appendage-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:08:16,477 - INFO - Size reduction: 18.46 GB -> 6.11 GB (66.9% smaller)
-2025-09-12 01:08:17,711 - DEBUG - Processing file 33/54: gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl
-2025-09-12 01:08:17,711 - INFO - Processing file: gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl
-2025-09-12 01:08:56,782 - DEBUG - Loaded data shape: (1389303828, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:08:58,759 - DEBUG - Extracted CCC data shape: (1389303828, 1)
-2025-09-12 01:08:58,759 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_heart_left_ventricle-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:12:40,469 - INFO - Successfully processed gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl -> gtex_v8_data_heart_left_ventricle-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:12:40,469 - INFO - Size reduction: 18.12 GB -> 5.84 GB (67.8% smaller)
-2025-09-12 01:12:41,779 - DEBUG - Processing file 34/54: gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl
-2025-09-12 01:12:41,779 - INFO - Processing file: gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl
-2025-09-12 01:13:12,636 - DEBUG - Loaded data shape: (1231692528, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:13:14,333 - DEBUG - Extracted CCC data shape: (1231692528, 1)
-2025-09-12 01:13:14,333 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_kidney_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:16:29,445 - INFO - Successfully processed gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl -> gtex_v8_data_kidney_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:16:29,445 - INFO - Size reduction: 16.06 GB -> 4.38 GB (72.7% smaller)
-2025-09-12 01:16:30,622 - DEBUG - Processing file 35/54: gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl
-2025-09-12 01:16:30,622 - INFO - Processing file: gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl
-2025-09-12 01:16:48,096 - DEBUG - Loaded data shape: (692459505, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:16:49,091 - DEBUG - Extracted CCC data shape: (692459505, 1)
-2025-09-12 01:16:49,091 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_kidney_medulla-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:18:36,495 - INFO - Successfully processed gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl -> gtex_v8_data_kidney_medulla-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:18:36,495 - INFO - Size reduction: 9.03 GB -> 1.43 GB (84.1% smaller)
-2025-09-12 01:18:37,172 - DEBUG - Processing file 36/54: gtex_v8_data_liver-var_pc_log2-all.pkl
-2025-09-12 01:18:37,172 - INFO - Processing file: gtex_v8_data_liver-var_pc_log2-all.pkl
-2025-09-12 01:19:14,213 - DEBUG - Loaded data shape: (1313153128, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:19:16,077 - DEBUG - Extracted CCC data shape: (1313153128, 1)
-2025-09-12 01:19:16,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_liver-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:22:44,977 - INFO - Successfully processed gtex_v8_data_liver-var_pc_log2-all.pkl -> gtex_v8_data_liver-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:22:44,977 - INFO - Size reduction: 17.12 GB -> 5.16 GB (69.9% smaller)
-2025-09-12 01:22:46,127 - DEBUG - Processing file 37/54: gtex_v8_data_lung-var_pc_log2-all.pkl
-2025-09-12 01:22:46,127 - INFO - Processing file: gtex_v8_data_lung-var_pc_log2-all.pkl
-2025-09-12 01:23:21,938 - DEBUG - Loaded data shape: (1461917628, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:23:24,049 - DEBUG - Extracted CCC data shape: (1461917628, 1)
-2025-09-12 01:23:24,049 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_lung-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:27:20,003 - INFO - Successfully processed gtex_v8_data_lung-var_pc_log2-all.pkl -> gtex_v8_data_lung-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:27:20,003 - INFO - Size reduction: 19.06 GB -> 6.58 GB (65.5% smaller)
-2025-09-12 01:27:21,442 - DEBUG - Processing file 38/54: gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl
-2025-09-12 01:27:21,442 - INFO - Processing file: gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl
-2025-09-12 01:27:54,922 - DEBUG - Loaded data shape: (1331409003, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:27:56,838 - DEBUG - Extracted CCC data shape: (1331409003, 1)
-2025-09-12 01:27:56,838 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_minor_salivary_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:31:28,578 - INFO - Successfully processed gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl -> gtex_v8_data_minor_salivary_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:31:28,578 - INFO - Size reduction: 17.36 GB -> 5.35 GB (69.2% smaller)
-2025-09-12 01:31:29,896 - DEBUG - Processing file 39/54: gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl
-2025-09-12 01:31:29,896 - INFO - Processing file: gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl
-2025-09-12 01:32:06,348 - DEBUG - Loaded data shape: (1460025703, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:32:08,350 - DEBUG - Extracted CCC data shape: (1460025703, 1)
-2025-09-12 01:32:08,350 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_muscle_skeletal-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:36:04,226 - INFO - Successfully processed gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl -> gtex_v8_data_muscle_skeletal-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:36:04,226 - INFO - Size reduction: 19.04 GB -> 6.54 GB (65.7% smaller)
-2025-09-12 01:36:05,600 - DEBUG - Processing file 40/54: gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl
-2025-09-12 01:36:05,601 - INFO - Processing file: gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl
-2025-09-12 01:36:42,425 - DEBUG - Loaded data shape: (1472643585, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:36:44,497 - DEBUG - Extracted CCC data shape: (1472643585, 1)
-2025-09-12 01:36:44,497 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_nerve_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:40:42,722 - INFO - Successfully processed gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl -> gtex_v8_data_nerve_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:40:42,722 - INFO - Size reduction: 19.20 GB -> 6.69 GB (65.2% smaller)
-2025-09-12 01:40:44,106 - DEBUG - Processing file 41/54: gtex_v8_data_ovary-var_pc_log2-all.pkl
-2025-09-12 01:40:44,106 - INFO - Processing file: gtex_v8_data_ovary-var_pc_log2-all.pkl
-2025-09-12 01:41:18,148 - DEBUG - Loaded data shape: (1353222276, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:41:20,048 - DEBUG - Extracted CCC data shape: (1353222276, 1)
-2025-09-12 01:41:20,049 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_ovary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:45:04,730 - INFO - Successfully processed gtex_v8_data_ovary-var_pc_log2-all.pkl -> gtex_v8_data_ovary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:45:04,730 - INFO - Size reduction: 17.65 GB -> 5.56 GB (68.5% smaller)
-2025-09-12 01:45:06,016 - DEBUG - Processing file 42/54: gtex_v8_data_pancreas-var_pc_log2-all.pkl
-2025-09-12 01:45:06,016 - INFO - Processing file: gtex_v8_data_pancreas-var_pc_log2-all.pkl
-2025-09-12 01:46:30,520 - DEBUG - Loaded data shape: (1369711630, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:46:32,480 - DEBUG - Extracted CCC data shape: (1369711630, 1)
-2025-09-12 01:46:32,480 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_pancreas-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:50:13,619 - INFO - Successfully processed gtex_v8_data_pancreas-var_pc_log2-all.pkl -> gtex_v8_data_pancreas-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:50:13,619 - INFO - Size reduction: 17.86 GB -> 5.68 GB (68.2% smaller)
-2025-09-12 01:50:14,880 - DEBUG - Processing file 43/54: gtex_v8_data_pituitary-var_pc_log2-all.pkl
-2025-09-12 01:50:14,880 - INFO - Processing file: gtex_v8_data_pituitary-var_pc_log2-all.pkl
-2025-09-12 01:50:55,006 - DEBUG - Loaded data shape: (1418660011, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:50:57,042 - DEBUG - Extracted CCC data shape: (1418660011, 1)
-2025-09-12 01:50:57,042 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_pituitary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:54:43,653 - INFO - Successfully processed gtex_v8_data_pituitary-var_pc_log2-all.pkl -> gtex_v8_data_pituitary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:54:43,653 - INFO - Size reduction: 18.50 GB -> 6.17 GB (66.6% smaller)
-2025-09-12 01:54:44,998 - DEBUG - Processing file 44/54: gtex_v8_data_prostate-var_pc_log2-all.pkl
-2025-09-12 01:54:44,998 - INFO - Processing file: gtex_v8_data_prostate-var_pc_log2-all.pkl
-2025-09-12 01:55:19,883 - DEBUG - Loaded data shape: (1395161076, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:55:21,840 - DEBUG - Extracted CCC data shape: (1395161076, 1)
-2025-09-12 01:55:21,840 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_prostate-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:59:09,359 - INFO - Successfully processed gtex_v8_data_prostate-var_pc_log2-all.pkl -> gtex_v8_data_prostate-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:59:09,359 - INFO - Size reduction: 18.19 GB -> 5.96 GB (67.2% smaller)
-2025-09-12 01:59:10,703 - DEBUG - Processing file 45/54: gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl
-2025-09-12 01:59:10,703 - INFO - Processing file: gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl
-2025-09-12 01:59:51,917 - DEBUG - Loaded data shape: (1458621066, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:59:53,937 - DEBUG - Extracted CCC data shape: (1458621066, 1)
-2025-09-12 01:59:53,937 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:03:48,454 - INFO - Successfully processed gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl -> gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:03:48,454 - INFO - Size reduction: 19.02 GB -> 6.55 GB (65.6% smaller)
-2025-09-12 02:03:49,884 - DEBUG - Processing file 46/54: gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl
-2025-09-12 02:03:49,884 - INFO - Processing file: gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl
-2025-09-12 02:05:18,157 - DEBUG - Loaded data shape: (1473566328, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:05:20,326 - DEBUG - Extracted CCC data shape: (1473566328, 1)
-2025-09-12 02:05:20,327 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:09:19,847 - INFO - Successfully processed gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl -> gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:09:19,847 - INFO - Size reduction: 19.21 GB -> 6.69 GB (65.2% smaller)
-2025-09-12 02:09:21,270 - DEBUG - Processing file 47/54: gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl
-2025-09-12 02:09:21,270 - INFO - Processing file: gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl
-2025-09-12 02:09:59,324 - DEBUG - Loaded data shape: (1353014190, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:10:01,233 - DEBUG - Extracted CCC data shape: (1353014190, 1)
-2025-09-12 02:10:01,233 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:13:38,031 - INFO - Successfully processed gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl -> gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:13:38,032 - INFO - Size reduction: 17.64 GB -> 5.54 GB (68.6% smaller)
-2025-09-12 02:13:39,310 - DEBUG - Processing file 48/54: gtex_v8_data_spleen-var_pc_log2-all.pkl
-2025-09-12 02:13:39,310 - INFO - Processing file: gtex_v8_data_spleen-var_pc_log2-all.pkl
-2025-09-12 02:14:13,490 - DEBUG - Loaded data shape: (1367095905, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:14:15,383 - DEBUG - Extracted CCC data shape: (1367095905, 1)
-2025-09-12 02:14:15,383 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_spleen-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:17:52,891 - INFO - Successfully processed gtex_v8_data_spleen-var_pc_log2-all.pkl -> gtex_v8_data_spleen-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:17:52,891 - INFO - Size reduction: 17.83 GB -> 5.68 GB (68.1% smaller)
-2025-09-12 02:17:54,199 - DEBUG - Processing file 49/54: gtex_v8_data_stomach-var_pc_log2-all.pkl
-2025-09-12 02:17:54,199 - INFO - Processing file: gtex_v8_data_stomach-var_pc_log2-all.pkl
-2025-09-12 02:18:29,137 - DEBUG - Loaded data shape: (1402248403, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:18:31,077 - DEBUG - Extracted CCC data shape: (1402248403, 1)
-2025-09-12 02:18:31,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_stomach-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:23:07,195 - INFO - Successfully processed gtex_v8_data_stomach-var_pc_log2-all.pkl -> gtex_v8_data_stomach-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:23:07,195 - INFO - Size reduction: 18.28 GB -> 5.99 GB (67.2% smaller)
-2025-09-12 02:23:08,518 - DEBUG - Processing file 50/54: gtex_v8_data_testis-var_pc_log2-all.pkl
-2025-09-12 02:23:08,519 - INFO - Processing file: gtex_v8_data_testis-var_pc_log2-all.pkl
-2025-09-12 02:23:47,944 - DEBUG - Loaded data shape: (1502917725, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:23:50,000 - DEBUG - Extracted CCC data shape: (1502917725, 1)
-2025-09-12 02:23:50,000 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_testis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:27:48,966 - INFO - Successfully processed gtex_v8_data_testis-var_pc_log2-all.pkl -> gtex_v8_data_testis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:27:48,966 - INFO - Size reduction: 19.60 GB -> 7.02 GB (64.2% smaller)
-2025-09-12 02:27:50,385 - DEBUG - Processing file 51/54: gtex_v8_data_thyroid-var_pc_log2-all.pkl
-2025-09-12 02:27:50,385 - INFO - Processing file: gtex_v8_data_thyroid-var_pc_log2-all.pkl
-2025-09-12 02:28:27,204 - DEBUG - Loaded data shape: (1472317980, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:28:29,203 - DEBUG - Extracted CCC data shape: (1472317980, 1)
-2025-09-12 02:28:29,204 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_thyroid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:32:22,715 - INFO - Successfully processed gtex_v8_data_thyroid-var_pc_log2-all.pkl -> gtex_v8_data_thyroid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:32:22,715 - INFO - Size reduction: 19.20 GB -> 6.68 GB (65.2% smaller)
-2025-09-12 02:32:24,123 - DEBUG - Processing file 52/54: gtex_v8_data_uterus-var_pc_log2-all.pkl
-2025-09-12 02:32:24,123 - INFO - Processing file: gtex_v8_data_uterus-var_pc_log2-all.pkl
-2025-09-12 02:32:56,654 - DEBUG - Loaded data shape: (1308289128, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:32:58,515 - DEBUG - Extracted CCC data shape: (1308289128, 1)
-2025-09-12 02:32:58,515 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_uterus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:36:26,336 - INFO - Successfully processed gtex_v8_data_uterus-var_pc_log2-all.pkl -> gtex_v8_data_uterus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:36:26,338 - INFO - Size reduction: 17.06 GB -> 5.16 GB (69.8% smaller)
-2025-09-12 02:36:27,576 - DEBUG - Processing file 53/54: gtex_v8_data_vagina-var_pc_log2-all.pkl
-2025-09-12 02:36:27,576 - INFO - Processing file: gtex_v8_data_vagina-var_pc_log2-all.pkl
-2025-09-12 02:37:09,453 - DEBUG - Loaded data shape: (1328623926, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:37:11,361 - DEBUG - Extracted CCC data shape: (1328623926, 1)
-2025-09-12 02:37:11,361 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_vagina-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:40:41,189 - INFO - Successfully processed gtex_v8_data_vagina-var_pc_log2-all.pkl -> gtex_v8_data_vagina-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:40:41,189 - INFO - Size reduction: 17.32 GB -> 5.33 GB (69.2% smaller)
-2025-09-12 02:40:42,400 - DEBUG - Processing file 54/54: gtex_v8_data_whole_blood-var_pc_log2-all.pkl
-2025-09-12 02:40:42,400 - INFO - Processing file: gtex_v8_data_whole_blood-var_pc_log2-all.pkl
-2025-09-12 02:41:17,012 - DEBUG - Loaded data shape: (1420258456, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:41:19,005 - DEBUG - Extracted CCC data shape: (1420258456, 1)
-2025-09-12 02:41:19,005 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:46:00,216 - INFO - Successfully processed gtex_v8_data_whole_blood-var_pc_log2-all.pkl -> gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:46:00,216 - INFO - Size reduction: 18.52 GB -> 6.12 GB (66.9% smaller)
-2025-09-12 02:46:01,553 - INFO - File processing completed in 3:55:57.451227
-2025-09-12 02:46:01,553 - INFO - Processing complete! Successfully processed: 54/54 files
-2025-09-12 02:46:01,553 - INFO - Total execution time: 3:55:57.451335
-2025-09-12 02:46:01,553 - INFO - Output directory: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet
-2025-09-12 02:46:01,553 - INFO - Log file: /home/haoyu/_database/projs/ccc-gpu/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
diff --git a/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py b/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py
new file mode 100755
index 00000000..bb729d39
--- /dev/null
+++ b/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py
@@ -0,0 +1,459 @@
+#!/usr/bin/env python3
+"""
+Convert GTEx CCC correlation data from pickle format to DuckDB format for efficient storage and fast queries.
+
+This script processes all .pkl files containing gene correlation data and creates optimized
+DuckDB databases that provide:
+- Fast random access to gene pairs (sub-millisecond queries)
+- Significantly reduced storage size
+- SQL query capabilities
+- Minimal memory usage for queries
+"""
+
+import argparse
+import logging
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import gc
+
+import pandas as pd
+import duckdb
+import numpy as np
+from tqdm import tqdm
+
+
+def setup_logging(debug: bool = False) -> str:
+    """Configure root logging to stdout and to a timestamped file in the script's logs/ dir.
+
+    Args:
+        debug: Log at DEBUG level when True, otherwise at INFO.
+
+    Returns:
+        Path of the log file that was created, as a string.
+    """
+    log_dir = Path(__file__).parent / "logs"
+    log_dir.mkdir(exist_ok=True)
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_filename = log_dir / f"process_ccc_to_duckdb_{stamp}.log"
+
+    level = logging.DEBUG if debug else logging.INFO
+    sinks = [logging.FileHandler(log_filename), logging.StreamHandler(sys.stdout)]
+    logging.basicConfig(level=level, format='%(asctime)s - %(levelname)s - %(message)s', handlers=sinks)
+
+    log = logging.getLogger(__name__)
+    log.info("Starting CCC to DuckDB conversion")
+    log.info(f"Log file: {log_filename}")
+
+    return str(log_filename)
+
+
+def convert_pickle_to_duckdb(
+    pkl_file: Path,
+    output_dir: Path,
+    single_db: bool = False,
+    db_con: Optional[duckdb.DuckDBPyConnection] = None,
+    chunk_size: int = 10_000_000
+) -> Dict:
+    """
+    Convert a single pickle file to DuckDB format, keeping only the 'ccc' column.
+
+    Args:
+        pkl_file: Path to input pickle file. It is read with pd.read_pickle, which
+            can execute arbitrary code, so only trusted files should be passed.
+        output_dir: Directory for output database files (used when single_db is False)
+        single_db: If True, write table ccc_<tissue> through db_con; otherwise create
+            <tissue>_ccc.duckdb in output_dir holding a table named ccc_data
+        db_con: Existing DuckDB connection (required if single_db is True); never closed here
+        chunk_size: Currently unused (rows go in with one bulk INSERT); kept for compatibility
+
+    Returns:
+        Dictionary with conversion statistics. Errors are not raised: they are logged
+        and reported under the 'error' key.
+    """
+    logger = logging.getLogger(__name__)
+    stats = {}
+    start_time = datetime.now()
+    con = None  # set inside the try; tells the finally block what is left to close
+
+    # Get tissue name from filename
+    tissue_name = pkl_file.stem.replace('gtex_v8_data_', '').replace('-var_pc_log2-all', '')
+    stats['tissue'] = tissue_name
+    stats['input_file'] = pkl_file.name
+
+    logger.info(f"Processing: {pkl_file.name}")
+
+    def quote(identifier: str) -> str:
+        # Double-quote SQL identifiers so filename-derived names stay valid
+        return '"' + identifier.replace('"', '""') + '"'
+
+    try:
+        if single_db and db_con is None:
+            raise ValueError("db_con must be provided when single_db is True")
+
+        # Load pickle file
+        logger.info("Loading pickle file...")
+        load_start = datetime.now()
+        df = pd.read_pickle(pkl_file)
+        load_time = (datetime.now() - load_start).total_seconds()
+        stats['input_rows'] = len(df)
+        stats['input_size_gb'] = pkl_file.stat().st_size / (1024**3)
+        logger.info(f"Loaded {len(df):,} rows in {load_time:.1f}s ({stats['input_size_gb']:.2f} GB)")
+
+        # Extract only CCC column; the three-name rename needs exactly two index levels
+        logger.info("Preparing data...")
+        df_ccc = df[['ccc']].reset_index()
+        df_ccc.columns = ['gene1', 'gene2', 'ccc']
+
+        # Convert to appropriate types
+        df_ccc['ccc'] = df_ccc['ccc'].astype('float32')
+        df_ccc['gene1'] = df_ccc['gene1'].astype(str)
+        df_ccc['gene2'] = df_ccc['gene2'].astype(str)
+
+        # Clean up original dataframe to free memory
+        del df
+        gc.collect()
+
+        # Create or connect to database
+        if single_db:
+            con = db_con
+            table_name = f"ccc_{tissue_name}"
+        else:
+            db_file = output_dir / f"{tissue_name}_ccc.duckdb"
+            con = duckdb.connect(str(db_file))
+            table_name = "ccc_data"
+        table_sql = quote(table_name)
+
+        # Create table; only the shared database tolerates an existing table
+        logger.info(f"Creating table {table_name}...")
+        if_not_exists = "IF NOT EXISTS " if single_db else ""
+        con.execute(f"""
+            CREATE TABLE {if_not_exists}{table_sql} (
+                gene1 VARCHAR NOT NULL,
+                gene2 VARCHAR NOT NULL,
+                ccc REAL NOT NULL,
+                PRIMARY KEY (gene1, gene2)
+            )
+        """)
+
+        # Insert data efficiently
+        logger.info(f"Inserting {len(df_ccc):,} rows...")
+        insert_start = datetime.now()
+
+        # Bulk insert via register; unregister df_temp even when the INSERT fails
+        con.register('df_temp', df_ccc)
+        try:
+            con.execute(f"INSERT INTO {table_sql} SELECT * FROM df_temp")
+        finally:
+            con.unregister('df_temp')
+
+        insert_time = (datetime.now() - insert_start).total_seconds()
+        stats['insert_time'] = insert_time
+        logger.info(f"Data inserted in {insert_time:.1f}s")
+
+        # Clean up dataframe
+        del df_ccc
+        gc.collect()
+
+        # Create indexes for faster lookups
+        logger.info("Creating indexes...")
+        index_start = datetime.now()
+
+        # gene2 index for reverse lookups, ccc index for range queries
+        con.execute(f"CREATE INDEX {quote(f'idx_{table_name}_gene2')} ON {table_sql}(gene2)")
+        con.execute(f"CREATE INDEX {quote(f'idx_{table_name}_ccc')} ON {table_sql}(ccc)")
+        # Analyze table for query optimization
+        con.execute(f"ANALYZE {table_sql}")
+
+        index_time = (datetime.now() - index_start).total_seconds()
+        stats['index_time'] = index_time
+        # Get final statistics
+        result = con.execute(f"SELECT COUNT(*) FROM {table_sql}").fetchone()
+        stats['output_rows'] = result[0]
+
+        if not single_db:
+            # Close connection and get file size
+            con.close()
+            con = None  # already closed; nothing left for the finally block
+            stats['output_size_gb'] = db_file.stat().st_size / (1024**3)
+
+        stats['total_time'] = (datetime.now() - start_time).total_seconds()
+        if 'output_size_gb' in stats:
+            stats['compression_ratio'] = stats['input_size_gb'] / stats['output_size_gb']
+            logger.info(f"Completed: {stats['input_size_gb']:.2f} GB -> {stats['output_size_gb']:.2f} GB "
+                       f"(compression: {stats['compression_ratio']:.1f}x)")
+
+        logger.info(f"Total time: {stats['total_time']:.1f}s")
+
+    except Exception as e:
+        logger.error(f"Error processing {pkl_file.name}: {e}")
+        stats['error'] = str(e)
+    finally:
+        # Close only a connection opened here; db_con belongs to the caller
+        if not single_db and con is not None:
+            con.close()
+
+    return stats
+
+
+def create_consolidated_database(
+    pkl_files: List[Path],
+    output_dir: Path
+) -> Dict:
+    """
+    Create a single consolidated DuckDB database with all tissues.
+
+    Writes all_tissues_ccc.duckdb in output_dir: one ccc_<tissue> table per converted
+    file, a 'tissues' summary table and, if any tissue succeeded, an 'all_correlations'
+    view. 'tissues' is created without IF NOT EXISTS, so a database file that already
+    contains it makes this function fail.
+
+    Args:
+        pkl_files: List of pickle files to process
+        output_dir: Directory for output database
+
+    Returns:
+        Dictionary with overall statistics ('database', 'tissues_processed',
+        'tissues_failed', 'total_size_gb', and the per-file 'stats' list)
+
+    Raises:
+        Exception: errors raised while building the database are logged and re-raised
+    """
+    logger = logging.getLogger(__name__)
+
+    db_file = output_dir / "all_tissues_ccc.duckdb"
+    logger.info(f"Creating consolidated database: {db_file}")
+
+    def quote(identifier: str) -> str:
+        # Double-quote SQL identifiers so filename-derived names stay valid
+        return '"' + identifier.replace('"', '""') + '"'
+
+    con = duckdb.connect(str(db_file))
+    all_stats = []
+
+    try:
+        # Create master table for tissue metadata
+        con.execute("""
+            CREATE TABLE tissues (
+                tissue_id INTEGER PRIMARY KEY,
+                tissue_name VARCHAR UNIQUE NOT NULL,
+                num_pairs BIGINT,
+                min_ccc REAL,
+                max_ccc REAL,
+                mean_ccc REAL
+            )
+        """)
+
+        tissue_id = 1
+        for pkl_file in tqdm(pkl_files, desc="Processing tissues"):
+            stats = convert_pickle_to_duckdb(
+                pkl_file=pkl_file,
+                output_dir=output_dir,
+                single_db=True,
+                db_con=con
+            )
+            all_stats.append(stats)
+            if 'error' in stats:
+                continue  # failed tissues get no metadata row and consume no id
+
+            # Add tissue metadata
+            tissue_name = stats['tissue']
+            tissue_stats = con.execute(f"""
+                SELECT
+                    COUNT(*) as num_pairs,
+                    MIN(ccc) as min_ccc,
+                    MAX(ccc) as max_ccc,
+                    AVG(ccc) as mean_ccc
+                FROM {quote('ccc_' + tissue_name)}
+            """).fetchone()
+
+            con.execute("""
+                INSERT INTO tissues (tissue_id, tissue_name, num_pairs, min_ccc, max_ccc, mean_ccc)
+                VALUES (?, ?, ?, ?, ?, ?)
+            """, [tissue_id, tissue_name, *tissue_stats])
+            tissue_id += 1
+
+        # Create a view for easy cross-tissue queries
+        logger.info("Creating cross-tissue query views...")
+        tissue_rows = con.execute("SELECT tissue_name FROM tissues").fetchall()
+
+        # One SELECT per tissue table, tagged with the tissue name as a string literal
+        union_parts = []
+        for (tissue_name,) in tissue_rows:
+            tissue_literal = "'" + tissue_name.replace("'", "''") + "'"
+            union_parts.append(
+                f"SELECT {tissue_literal} as tissue, gene1, gene2, ccc "
+                f"FROM {quote('ccc_' + tissue_name)}"
+            )
+
+        if union_parts:
+            union_query = " UNION ALL ".join(union_parts)
+            con.execute(f"CREATE VIEW all_correlations AS {union_query}")
+            logger.info("Created all_correlations view for cross-tissue queries")
+
+        # "PRAGMA optimize" is a SQLite pragma, not a DuckDB one; each table is already
+        # ANALYZEd during conversion, so just flush pending changes to the database file.
+        logger.info("Optimizing database...")
+        con.execute("CHECKPOINT")
+    except Exception as e:
+        logger.error(f"Error creating consolidated database: {e}")
+        raise
+    finally:
+        # Always release the connection, on success and on failure
+        con.close()
+
+    db_size = db_file.stat().st_size / (1024**3)
+    logger.info(f"Consolidated database size: {db_size:.2f} GB")
+
+    return {
+        'database': str(db_file),
+        'tissues_processed': len([s for s in all_stats if 'error' not in s]),
+        'tissues_failed': len([s for s in all_stats if 'error' in s]),
+        'total_size_gb': db_size,
+        'stats': all_stats
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert GTEx CCC data from pickle to DuckDB format"
+    )
+    parser.add_argument(
+        "--source-dir",
+        type=str,
+        default="/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all",
+        help="Source directory containing .pkl files"
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="/mnt/data/proj_data/ccc-gpu/manuscript_data/supplementary_data/ccc_duckdb",
+        help="Output directory for DuckDB files"
+    )
+    parser.add_argument(
+        "--single-db",
+        action="store_true",
+        help="Create a single consolidated database instead of one per tissue"
+    )
+    parser.add_argument(
+        "--tissues",
+        nargs="+",
+        help="Specific tissues to process (default: all)"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be processed without doing it"
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable debug logging"
+    )
+
+    args = parser.parse_args()
+
+    # Setup logging
+    log_file = setup_logging(debug=args.debug)
+    logger = logging.getLogger(__name__)
+
+    # Convert paths
+    source_dir = Path(args.source_dir)
+    output_dir = Path(args.output_dir)
+
+    logger.info(f"Configuration:")
+    logger.info(f"  Source: {source_dir}")
+    logger.info(f"  Output: {output_dir}")
+    logger.info(f"  Single DB: {args.single_db}")
+
+    # Check source directory
+    if not source_dir.exists():
+        logger.error(f"Source directory not found: {source_dir}")
+        sys.exit(1)
+
+    # Get list of pickle files
+    pkl_files = sorted(source_dir.glob("*.pkl"))
+
+    # Filter by specific tissues if requested
+    if args.tissues:
+        filtered = []
+        for tissue in args.tissues:
+            matching = [f for f in pkl_files if tissue in f.name]
+            filtered.extend(matching)
+        pkl_files = filtered
+
+    if not pkl_files:
+        logger.error("No pickle files found to process")
+        sys.exit(1)
+
+    logger.info(f"Found {len(pkl_files)} files to process")
+
+    if args.dry_run:
+        print("\nFiles that would be processed:")
+        for f in pkl_files:
+            size_gb = f.stat().st_size / (1024**3)
+            print(f"  {f.name} ({size_gb:.2f} GB)")
+        print(f"\nOutput would be written to: {output_dir}")
+        return
+
+    # Create output directory
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Process files
+    start_time = datetime.now()
+
+    if args.single_db:
+        # Create single consolidated database
+        results = create_consolidated_database(pkl_files, output_dir)
+
+        print(f"\n{'='*60}")
+        print("PROCESSING COMPLETE")
+        print(f"{'='*60}")
+        print(f"Database: {results['database']}")
+        print(f"Tissues processed: {results['tissues_processed']}")
+        print(f"Tissues failed: {results['tissues_failed']}")
+        print(f"Total size: {results['total_size_gb']:.2f} GB")
+
+    else:
+        # Create individual databases
+        all_stats = []
+
+        for pkl_file in tqdm(pkl_files, desc="Processing files"):
+            stats = convert_pickle_to_duckdb(
+                pkl_file=pkl_file,
+                output_dir=output_dir,
+                single_db=False
+            )
+            all_stats.append(stats)
+
+        # Summary
+        successful = [s for s in all_stats if 'error' not in s]
+        failed = [s for s in all_stats if 'error' in s]
+
+        print(f"\n{'='*60}")
+        print("PROCESSING COMPLETE")
+        print(f"{'='*60}")
+        print(f"Files processed: {len(successful)}/{len(pkl_files)}")
+
+        if successful:
+            total_input = sum(s['input_size_gb'] for s in successful)
+            total_output = sum(s.get('output_size_gb', 0) for s in successful)
+            avg_compression = total_input / total_output if total_output > 0 else 0
+
+            print(f"Total input size: {total_input:.2f} GB")
+            print(f"Total output size: {total_output:.2f} GB")
+            print(f"Average compression: {avg_compression:.1f}x")
+
+        if failed:
+            print(f"\nFailed files ({len(failed)}):")
+            for s in failed:
+                print(f"  {s['input_file']}: {s['error']}")
+
+    total_time = (datetime.now() - start_time).total_seconds()
+    print(f"\nTotal processing time: {total_time/60:.1f} minutes")
+    print(f"Log file: {log_file}")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
\ No newline at end of file
diff --git a/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb b/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb
new file mode 100644
index 00000000..68f2b129
--- /dev/null
+++ b/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb
@@ -0,0 +1,4056 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Description\n",
+    "This notebook demonstrates:\n",
+    "\n",
+    "1. how to compute correlation coefficient values\n",
+    "2. how to correlate gene expression data with categorical metadata\n",
+    "\n",
+    "using CCC GPU with public data from GTEx v8."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Please follow the instructions in the \"Quick Install with pip\" section of the [README](../../README.md) to install CCC-GPU in a conda environment named `ccc-gpu-env`.\n",
+    "\n",
+    "Then activate the environment and start the jupyter notebook server in order to run this notebook.\n",
+    "\n",
+    "```bash\n",
+    "conda activate ccc-gpu-env\n",
+    "pip install notebook\n",
+    "jupyter notebook\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import re\n",
+    "import pandas as pd\n",
+    "import urllib.request\n",
+    "from tqdm import tqdm\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from ccc.utils import simplify_string\n",
+    "from ccc import conf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set this path to the directory where you want to save the intermediate data and results\n",
+    "ANALYSIS_DIR = Path(\"/mnt/data/proj_data/ccc-gpu/data/tutorial\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Fetching and Preprocessing\n",
+    "This section downloads:\n",
+    "1. the public GTEx v8 gene TPMs data (https://www.gtexportal.org/home/downloads/adult-gtex/bulk_tissue_expression)\n",
+    "2. the GTEx sample attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)\n",
+    "3. the GTEx subject attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)\n",
+    "\n",
+    "and performs the preprocessing needed to prepare the data for the analysis."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Download GTEx v8 gene expression data and split by tissue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gtex_all_sample_ids_with_expr_data already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz\n",
+      "gtex_sample_attrs already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt\n",
+      "Downloading gtex_subject_attrs to /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt\n",
+      "Download completed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create analysis directory if it doesn't exist\n",
+    "os.makedirs(ANALYSIS_DIR, exist_ok=True)\n",
+    "\n",
+    "# Define files to download\n",
+    "files_to_download = {\n",
+    "    \"gtex_all_sample_ids_with_expr_data\": \"https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz\",\n",
+    "    \"gtex_sample_attrs\": \"https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt\",\n",
+    "    \"gtex_subject_attrs\": \"https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt\"\n",
+    "}\n",
+    "\n",
+    "# Dictionary to store file paths\n",
+    "file_paths = {}\n",
+    "\n",
+    "# Download files\n",
+    "for var_name, url in files_to_download.items():\n",
+    "    filename = Path(url).name\n",
+    "    file_path = Path(ANALYSIS_DIR) / filename\n",
+    "    file_paths[var_name] = file_path\n",
+    "    \n",
+    "    if not file_path.exists():\n",
+    "        print(f\"Downloading {var_name} to {file_path}\")\n",
+    "        urllib.request.urlretrieve(url, file_path)\n",
+    "        print(\"Download completed!\")\n",
+    "    else:\n",
+    "        print(f\"{var_name} already exists at {file_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GTEx sample attributes shape: (22951, 63)\n",
+      "GTEx sample attributes columns: Index(['SAMPID', 'SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'SMRIN', 'SMTS', 'SMTSD',\n",
+      "       'SMUBRID', 'SMTSISCH', 'SMTSPAX', 'SMNABTCH', 'SMNABTCHT', 'SMNABTCHD',\n",
+      "       'SMGEBTCH', 'SMGEBTCHD', 'SMGEBTCHT', 'SMAFRZE', 'SMGTC', 'SME2MPRT',\n",
+      "       'SMCHMPRS', 'SMNTRART', 'SMNUMGPS', 'SMMAPRT', 'SMEXNCRT', 'SM550NRM',\n",
+      "       'SMGNSDTC', 'SMUNMPRT', 'SM350NRM', 'SMRDLGTH', 'SMMNCPB', 'SME1MMRT',\n",
+      "       'SMSFLGTH', 'SMESTLBS', 'SMMPPD', 'SMNTERRT', 'SMRRNANM', 'SMRDTTL',\n",
+      "       'SMVQCFL', 'SMMNCV', 'SMTRSCPT', 'SMMPPDPR', 'SMCGLGTH', 'SMGAPPCT',\n",
+      "       'SMUNPDRD', 'SMNTRNRT', 'SMMPUNRT', 'SMEXPEFF', 'SMMPPDUN', 'SME2MMRT',\n",
+      "       'SME2ANTI', 'SMALTALG', 'SME2SNSE', 'SMMFLGTH', 'SME1ANTI', 'SMSPLTRD',\n",
+      "       'SMBSMMRT', 'SME1SNSE', 'SME1PCTS', 'SMRRNART', 'SME1MPRT', 'SMNUM5CD',\n",
+      "       'SMDPMPRT', 'SME2PCTS'],\n",
+      "      dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "gtex_sample_attrs = pd.read_csv(file_paths[\"gtex_sample_attrs\"], sep=\"\\t\")\n",
+    "print(f\"GTEx sample attributes shape: {gtex_sample_attrs.shape}\")\n",
+    "print(f\"GTEx sample attributes columns: {gtex_sample_attrs.columns}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "55\n",
+      "['Whole Blood' 'Brain - Frontal Cortex (BA9)' 'Adipose - Subcutaneous'\n",
+      " 'Muscle - Skeletal' 'Artery - Tibial' 'Artery - Coronary'\n",
+      " 'Heart - Atrial Appendage' 'Adipose - Visceral (Omentum)' 'Ovary'\n",
+      " 'Uterus' 'Vagina' 'Breast - Mammary Tissue'\n",
+      " 'Skin - Not Sun Exposed (Suprapubic)' 'Minor Salivary Gland'\n",
+      " 'Brain - Cortex' 'Adrenal Gland' 'Thyroid' 'Lung' 'Spleen' 'Pancreas'\n",
+      " 'Esophagus - Muscularis' 'Esophagus - Mucosa'\n",
+      " 'Esophagus - Gastroesophageal Junction' 'Stomach' 'Colon - Sigmoid'\n",
+      " 'Small Intestine - Terminal Ileum' 'Colon - Transverse' 'Prostate'\n",
+      " 'Testis' 'Skin - Sun Exposed (Lower leg)' 'Nerve - Tibial'\n",
+      " 'Heart - Left Ventricle' 'Pituitary' 'Brain - Cerebellum'\n",
+      " 'Cells - Cultured fibroblasts' 'Artery - Aorta'\n",
+      " 'Cells - EBV-transformed lymphocytes' 'Brain - Cerebellar Hemisphere'\n",
+      " 'Brain - Caudate (basal ganglia)'\n",
+      " 'Brain - Nucleus accumbens (basal ganglia)'\n",
+      " 'Brain - Putamen (basal ganglia)' 'Brain - Hypothalamus'\n",
+      " 'Brain - Spinal cord (cervical c-1)' 'Liver' 'Brain - Hippocampus'\n",
+      " 'Brain - Anterior cingulate cortex (BA24)' 'Brain - Substantia nigra'\n",
+      " 'Kidney - Cortex' 'Brain - Amygdala' 'Cervix - Ectocervix'\n",
+      " 'Fallopian Tube' 'Cervix - Endocervix' 'Bladder' 'Kidney - Medulla'\n",
+      " 'Cells - Leukemia cell line (CML)']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get tissue names\n",
+    "gtex_tissues = gtex_sample_attrs[\"SMTSD\"].unique()\n",
+    "print(len(gtex_tissues))\n",
+    "print(gtex_tissues)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Get sample IDs for each tissue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of samples with expression data: 17382\n",
+      "Sample IDs with expression data: ['GTEX-1HFI7-2426-SM-B2LXV', 'GTEX-11TTK-0226-SM-5N9EC', 'GTEX-11UD2-1226-SM-5EQMI', 'GTEX-X4EO-0006-SM-3P5ZF', 'GTEX-13O21-0326-SM-5J1N9', 'GTEX-XBED-1526-SM-4AT5W', 'GTEX-13NZ8-0011-R8b-SM-5KM48', 'GTEX-1H3O1-0005-SM-ACKV8', 'GTEX-13JVG-0011-R5a-SM-5MR4O', 'GTEX-1F88F-1126-SM-7MKHL']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# first, get all sample IDs with expression data\n",
+    "gtex_all_sample_ids_with_expr_data = set(\n",
+    "    pd.read_csv(\n",
+    "        file_paths[\"gtex_all_sample_ids_with_expr_data\"],\n",
+    "        sep=\"\\t\",\n",
+    "        skiprows=2,\n",
+    "        nrows=1,\n",
+    "        usecols=lambda x: x not in (\"Name\", \"Description\"),\n",
+    "    ).columns\n",
+    ")\n",
+    "\n",
+    "print(f\"Number of samples with expression data: {len(gtex_all_sample_ids_with_expr_data)}\")\n",
+    "print(f\"Sample IDs with expression data: {list(gtex_all_sample_ids_with_expr_data)[:10]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get sample IDs by tissue\n",
+    "sample_ids_by_tissue = {\n",
+    "    tissue_name: sorted(\n",
+    "        list(\n",
+    "            gtex_all_sample_ids_with_expr_data.intersection(\n",
+    "                set(\n",
+    "                    gtex_sample_attrs[gtex_sample_attrs[\"SMTSD\"] == tissue_name][\n",
+    "                        \"SAMPID\"\n",
+    "                    ].tolist()\n",
+    "                )\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    "    for tissue_name in gtex_tissues\n",
+    "}\n",
+    "\n",
+    "assert len(gtex_tissues) == len(sample_ids_by_tissue)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['GTEX-111YS-0006-SM-5NQBE',\n",
+       " 'GTEX-1122O-0005-SM-5O99J',\n",
+       " 'GTEX-1128S-0005-SM-5P9HI',\n",
+       " 'GTEX-113IC-0006-SM-5NQ9C',\n",
+       " 'GTEX-113JC-0006-SM-5O997',\n",
+       " 'GTEX-117XS-0005-SM-5PNU6',\n",
+       " 'GTEX-117YW-0005-SM-5NQ8Z',\n",
+       " 'GTEX-1192W-0005-SM-5NQBQ',\n",
+       " 'GTEX-1192X-0005-SM-5NQC3',\n",
+       " 'GTEX-11DXW-0006-SM-5NQ7Y']"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_ids_by_tissue[\"Whole Blood\"][:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensure all IDs are unique\n",
+    "assert all(\n",
+    "    [\n",
+    "        len(sample_ids_by_tissue[tissue_name])\n",
+    "        == len(set(sample_ids_by_tissue[tissue_name]))\n",
+    "        for tissue_name in sample_ids_by_tissue.keys()\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Show sample size by tissue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>tissue</th>\n",
+       "      <th>sample_size</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Muscle - Skeletal</td>\n",
+       "      <td>803</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Whole Blood</td>\n",
+       "      <td>755</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>Skin - Sun Exposed (Lower leg)</td>\n",
+       "      <td>701</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Artery - Tibial</td>\n",
+       "      <td>663</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Adipose - Subcutaneous</td>\n",
+       "      <td>663</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>Thyroid</td>\n",
+       "      <td>653</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>Nerve - Tibial</td>\n",
+       "      <td>619</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Skin - Not Sun Exposed (Suprapubic)</td>\n",
+       "      <td>604</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>Lung</td>\n",
+       "      <td>578</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>Esophagus - Mucosa</td>\n",
+       "      <td>555</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Adipose - Visceral (Omentum)</td>\n",
+       "      <td>541</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>Esophagus - Muscularis</td>\n",
+       "      <td>515</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>Cells - Cultured fibroblasts</td>\n",
+       "      <td>504</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Breast - Mammary Tissue</td>\n",
+       "      <td>459</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>Heart - Left Ventricle</td>\n",
+       "      <td>432</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>Artery - Aorta</td>\n",
+       "      <td>432</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Heart - Atrial Appendage</td>\n",
+       "      <td>429</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>Colon - Transverse</td>\n",
+       "      <td>406</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>Esophagus - Gastroesophageal Junction</td>\n",
+       "      <td>375</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>Colon - Sigmoid</td>\n",
+       "      <td>373</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>Testis</td>\n",
+       "      <td>361</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>Stomach</td>\n",
+       "      <td>359</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>Pancreas</td>\n",
+       "      <td>328</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>Pituitary</td>\n",
+       "      <td>283</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Adrenal Gland</td>\n",
+       "      <td>258</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Brain - Cortex</td>\n",
+       "      <td>255</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38</th>\n",
+       "      <td>Brain - Caudate (basal ganglia)</td>\n",
+       "      <td>246</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>Brain - Nucleus accumbens (basal ganglia)</td>\n",
+       "      <td>246</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>Prostate</td>\n",
+       "      <td>245</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>Spleen</td>\n",
+       "      <td>241</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>Brain - Cerebellum</td>\n",
+       "      <td>241</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Artery - Coronary</td>\n",
+       "      <td>240</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43</th>\n",
+       "      <td>Liver</td>\n",
+       "      <td>226</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>Brain - Cerebellar Hemisphere</td>\n",
+       "      <td>215</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Brain - Frontal Cortex (BA9)</td>\n",
+       "      <td>209</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>40</th>\n",
+       "      <td>Brain - Putamen (basal ganglia)</td>\n",
+       "      <td>205</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>41</th>\n",
+       "      <td>Brain - Hypothalamus</td>\n",
+       "      <td>202</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>44</th>\n",
+       "      <td>Brain - Hippocampus</td>\n",
+       "      <td>197</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>Small Intestine - Terminal Ileum</td>\n",
+       "      <td>187</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Ovary</td>\n",
+       "      <td>180</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45</th>\n",
+       "      <td>Brain - Anterior cingulate cortex (BA24)</td>\n",
+       "      <td>176</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>Cells - EBV-transformed lymphocytes</td>\n",
+       "      <td>174</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Minor Salivary Gland</td>\n",
+       "      <td>162</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>42</th>\n",
+       "      <td>Brain - Spinal cord (cervical c-1)</td>\n",
+       "      <td>159</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Vagina</td>\n",
+       "      <td>156</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>48</th>\n",
+       "      <td>Brain - Amygdala</td>\n",
+       "      <td>152</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Uterus</td>\n",
+       "      <td>142</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>46</th>\n",
+       "      <td>Brain - Substantia nigra</td>\n",
+       "      <td>139</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>47</th>\n",
+       "      <td>Kidney - Cortex</td>\n",
+       "      <td>85</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>52</th>\n",
+       "      <td>Bladder</td>\n",
+       "      <td>21</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>51</th>\n",
+       "      <td>Cervix - Endocervix</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50</th>\n",
+       "      <td>Fallopian Tube</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>49</th>\n",
+       "      <td>Cervix - Ectocervix</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>53</th>\n",
+       "      <td>Kidney - Medulla</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54</th>\n",
+       "      <td>Cells - Leukemia cell line (CML)</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                       tissue  sample_size\n",
+       "3                           Muscle - Skeletal          803\n",
+       "0                                 Whole Blood          755\n",
+       "29             Skin - Sun Exposed (Lower leg)          701\n",
+       "4                             Artery - Tibial          663\n",
+       "2                      Adipose - Subcutaneous          663\n",
+       "16                                    Thyroid          653\n",
+       "30                             Nerve - Tibial          619\n",
+       "12        Skin - Not Sun Exposed (Suprapubic)          604\n",
+       "17                                       Lung          578\n",
+       "21                         Esophagus - Mucosa          555\n",
+       "7                Adipose - Visceral (Omentum)          541\n",
+       "20                     Esophagus - Muscularis          515\n",
+       "34               Cells - Cultured fibroblasts          504\n",
+       "11                    Breast - Mammary Tissue          459\n",
+       "31                     Heart - Left Ventricle          432\n",
+       "35                             Artery - Aorta          432\n",
+       "6                    Heart - Atrial Appendage          429\n",
+       "26                         Colon - Transverse          406\n",
+       "22      Esophagus - Gastroesophageal Junction          375\n",
+       "24                            Colon - Sigmoid          373\n",
+       "28                                     Testis          361\n",
+       "23                                    Stomach          359\n",
+       "19                                   Pancreas          328\n",
+       "32                                  Pituitary          283\n",
+       "15                              Adrenal Gland          258\n",
+       "14                             Brain - Cortex          255\n",
+       "38            Brain - Caudate (basal ganglia)          246\n",
+       "39  Brain - Nucleus accumbens (basal ganglia)          246\n",
+       "27                                   Prostate          245\n",
+       "18                                     Spleen          241\n",
+       "33                         Brain - Cerebellum          241\n",
+       "5                           Artery - Coronary          240\n",
+       "43                                      Liver          226\n",
+       "37              Brain - Cerebellar Hemisphere          215\n",
+       "1                Brain - Frontal Cortex (BA9)          209\n",
+       "40            Brain - Putamen (basal ganglia)          205\n",
+       "41                       Brain - Hypothalamus          202\n",
+       "44                        Brain - Hippocampus          197\n",
+       "25           Small Intestine - Terminal Ileum          187\n",
+       "8                                       Ovary          180\n",
+       "45   Brain - Anterior cingulate cortex (BA24)          176\n",
+       "36        Cells - EBV-transformed lymphocytes          174\n",
+       "13                       Minor Salivary Gland          162\n",
+       "42         Brain - Spinal cord (cervical c-1)          159\n",
+       "10                                     Vagina          156\n",
+       "48                           Brain - Amygdala          152\n",
+       "9                                      Uterus          142\n",
+       "46                   Brain - Substantia nigra          139\n",
+       "47                            Kidney - Cortex           85\n",
+       "52                                    Bladder           21\n",
+       "51                        Cervix - Endocervix           10\n",
+       "50                             Fallopian Tube            9\n",
+       "49                        Cervix - Ectocervix            9\n",
+       "53                           Kidney - Medulla            4\n",
+       "54           Cells - Leukemia cell line (CML)            0"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "tissue_sample_size = pd.DataFrame(\n",
+    "    [{\"tissue\": k, \"sample_size\": len(v)} for k, v in sample_ids_by_tissue.items()]\n",
+    ")\n",
+    "\n",
+    "tissue_sample_size = tissue_sample_size.sort_values(\"sample_size\", ascending=False)\n",
+    "display(tissue_sample_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Simple validations\n",
+    "_tmp = tissue_sample_size.set_index(\"tissue\").squeeze()\n",
+    "assert _tmp.loc[\"Muscle - Skeletal\"] == 803\n",
+    "assert _tmp.loc[\"Whole Blood\"] == 755\n",
+    "assert _tmp.loc[\"Skin - Not Sun Exposed (Suprapubic)\"] == 604\n",
+    "assert _tmp.loc[\"Kidney - Medulla\"] == 4"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "These numbers match those you can find here: https://gtexportal.org/home/tissueSummaryPage#sampleCountsPerTissue"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Split expression data by tissue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Cells - Leukemia cell line (CML): 100%|█████████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 4357.51it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping Muscle - Skeletal - file already exists\n",
+      "Skipping Whole Blood - file already exists\n",
+      "Skipping Skin - Sun Exposed (Lower leg) - file already exists\n",
+      "Skipping Artery - Tibial - file already exists\n",
+      "Skipping Adipose - Subcutaneous - file already exists\n",
+      "Skipping Thyroid - file already exists\n",
+      "Skipping Nerve - Tibial - file already exists\n",
+      "Skipping Skin - Not Sun Exposed (Suprapubic) - file already exists\n",
+      "Skipping Lung - file already exists\n",
+      "Skipping Esophagus - Mucosa - file already exists\n",
+      "Skipping Adipose - Visceral (Omentum) - file already exists\n",
+      "Skipping Esophagus - Muscularis - file already exists\n",
+      "Skipping Cells - Cultured fibroblasts - file already exists\n",
+      "Skipping Breast - Mammary Tissue - file already exists\n",
+      "Skipping Heart - Left Ventricle - file already exists\n",
+      "Skipping Artery - Aorta - file already exists\n",
+      "Skipping Heart - Atrial Appendage - file already exists\n",
+      "Skipping Colon - Transverse - file already exists\n",
+      "Skipping Esophagus - Gastroesophageal Junction - file already exists\n",
+      "Skipping Colon - Sigmoid - file already exists\n",
+      "Skipping Testis - file already exists\n",
+      "Skipping Stomach - file already exists\n",
+      "Skipping Pancreas - file already exists\n",
+      "Skipping Pituitary - file already exists\n",
+      "Skipping Adrenal Gland - file already exists\n",
+      "Skipping Brain - Cortex - file already exists\n",
+      "Skipping Brain - Caudate (basal ganglia) - file already exists\n",
+      "Skipping Brain - Nucleus accumbens (basal ganglia) - file already exists\n",
+      "Skipping Prostate - file already exists\n",
+      "Skipping Spleen - file already exists\n",
+      "Skipping Brain - Cerebellum - file already exists\n",
+      "Skipping Artery - Coronary - file already exists\n",
+      "Skipping Liver - file already exists\n",
+      "Skipping Brain - Cerebellar Hemisphere - file already exists\n",
+      "Skipping Brain - Frontal Cortex (BA9) - file already exists\n",
+      "Skipping Brain - Putamen (basal ganglia) - file already exists\n",
+      "Skipping Brain - Hypothalamus - file already exists\n",
+      "Skipping Brain - Hippocampus - file already exists\n",
+      "Skipping Small Intestine - Terminal Ileum - file already exists\n",
+      "Skipping Ovary - file already exists\n",
+      "Skipping Brain - Anterior cingulate cortex (BA24) - file already exists\n",
+      "Skipping Cells - EBV-transformed lymphocytes - file already exists\n",
+      "Skipping Minor Salivary Gland - file already exists\n",
+      "Skipping Brain - Spinal cord (cervical c-1) - file already exists\n",
+      "Skipping Vagina - file already exists\n",
+      "Skipping Brain - Amygdala - file already exists\n",
+      "Skipping Uterus - file already exists\n",
+      "Skipping Brain - Substantia nigra - file already exists\n",
+      "Skipping Kidney - Cortex - file already exists\n",
+      "Skipping Bladder - file already exists\n",
+      "Skipping Cervix - Endocervix - file already exists\n",
+      "Skipping Fallopian Tube - file already exists\n",
+      "Skipping Cervix - Ectocervix - file already exists\n",
+      "Skipping Kidney - Medulla - file already exists\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "TISSUE_DATA_DIR = ANALYSIS_DIR / \"data_by_tissue\"\n",
+    "TISSUE_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "pbar = tqdm(tissue_sample_size[\"tissue\"])\n",
+    "\n",
+    "gene_id_symbol_map_tuples = set()\n",
+    "\n",
+    "for tissue_name in pbar:\n",
+    "    pbar.set_description(tissue_name)\n",
+    "\n",
+    "    tissue_ids = sample_ids_by_tissue[tissue_name]\n",
+    "    if len(tissue_ids) == 0:\n",
+    "        continue\n",
+    "\n",
+    "    # Generate output filename\n",
+    "    tissue_name_simple = simplify_string(simplify_string(tissue_name.lower()))\n",
+    "    output_file = TISSUE_DATA_DIR / f\"gtex_v8_data_{tissue_name_simple}.pkl\"\n",
+    "    output_gene_mappings = ANALYSIS_DIR / \"gtex_gene_id_symbol_mappings.pkl\"\n",
+    "    \n",
+    "    # Skip only if both this tissue's file and the gene mappings file already exist\n",
+    "    if output_file.exists() and output_gene_mappings.exists():\n",
+    "        print(f\"Skipping {tissue_name} - file already exists\")\n",
+    "        continue\n",
+    "\n",
+    "    try:\n",
+    "        tissue_data = pd.read_csv(\n",
+    "            file_paths[\"gtex_all_sample_ids_with_expr_data\"],\n",
+    "            sep=\"\\t\",\n",
+    "            skiprows=2,\n",
+    "            usecols=[\"Name\", \"Description\"] + tissue_ids,\n",
+    "        )\n",
+    "\n",
+    "        tissue_data = tissue_data.rename(\n",
+    "            columns={\n",
+    "                \"Name\": \"gene_ens_id\",\n",
+    "                \"Description\": \"gene_symbol\",\n",
+    "            }\n",
+    "        )\n",
+    "\n",
+    "        # Validate data before processing\n",
+    "        if tissue_data.empty:\n",
+    "            print(f\"Warning: No data found for {tissue_name}\")\n",
+    "            continue\n",
+    "\n",
+    "        # add gene id / gene symbol to mapping variable\n",
+    "        gene_id_symbol_map_tuples.update(\n",
+    "            tissue_data[[\"gene_ens_id\", \"gene_symbol\"]].itertuples(index=False)\n",
+    "        )\n",
+    "\n",
+    "        tissue_data = tissue_data.drop(columns=[\"gene_symbol\"]).set_index(\"gene_ens_id\")\n",
+    "\n",
+    "        # Data quality checks\n",
+    "        assert not tissue_data.isna().any().any(), f\"NaN values found in {tissue_name}\"\n",
+    "        assert tissue_data.index.is_unique, f\"Non-unique gene IDs in {tissue_name}\"\n",
+    "        assert tissue_data.columns.is_unique, f\"Non-unique sample IDs in {tissue_name}\"\n",
+    "\n",
+    "        # save\n",
+    "        tissue_data.to_pickle(path=output_file)\n",
+    "        \n",
+    "    except Exception as e:\n",
+    "        print(f\"Error processing {tissue_name}: {str(e)}\")\n",
+    "        continue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Simple validations\n",
+    "_tmp = pd.read_pickle(TISSUE_DATA_DIR / \"gtex_v8_data_brain_cerebellar_hemisphere.pkl\")\n",
+    "\n",
+    "assert \"GTEX-11DXY-0011-R11a-SM-DNZZN\" in _tmp.columns\n",
+    "assert \"GTEX-WL46-0011-R11A-SM-3MJFT\" in _tmp.columns\n",
+    "assert \"GTEX-ZF28-0011-R11a-SM-4WWEI\" in _tmp.columns\n",
+    "\n",
+    "_v = _tmp.loc[\"ENSG00000223972.5\", \"GTEX-11DXY-0011-R11a-SM-DNZZN\"]\n",
+    "assert _v == 0.04045, _v\n",
+    "_v = _tmp.loc[\"ENSG00000278267.1\", \"GTEX-11DXY-0011-R11a-SM-DNZZN\"]\n",
+    "assert _v == 0.0, _v\n",
+    "\n",
+    "_v = _tmp.loc[\"ENSG00000233327.10\", \"GTEX-WL46-0011-R11A-SM-3MJFT\"]\n",
+    "assert _v == 146.4000, _v\n",
+    "_v = _tmp.loc[\"ENSG00000237118.2\", \"GTEX-WL46-0011-R11A-SM-3MJFT\"]\n",
+    "assert _v == 0.3357, _v\n",
+    "\n",
+    "_v = _tmp.loc[\"ENSG00000233327.10\", \"GTEX-ZF28-0011-R11a-SM-4WWEI\"]\n",
+    "assert _v == 30.7200, _v\n",
+    "_v = _tmp.loc[\"ENSG00000186907.7\", \"GTEX-ZF28-0011-R11a-SM-4WWEI\"]\n",
+    "assert _v == 0.94720, _v"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save gene mappings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded existing gene mappings from /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n",
+      "gene_mappings.shape: (56200, 2)\n",
+      "          gene_ens_id  gene_symbol\n",
+      "0  ENSG00000144278.14      GALNT13\n",
+      "1   ENSG00000260976.1    LINC01633\n",
+      "2  ENSG00000186660.14        ZFP91\n",
+      "3  ENSG00000123560.13         PLP1\n",
+      "4   ENSG00000227371.1  RP11-3L10.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "output_gene_mappings = ANALYSIS_DIR / \"gtex_gene_id_symbol_mappings.pkl\"\n",
+    "\n",
+    "if output_gene_mappings.exists():\n",
+    "    gene_mappings = pd.read_pickle(output_gene_mappings)\n",
+    "    print(f\"Loaded existing gene mappings from {output_gene_mappings}\")\n",
+    "else:\n",
+    "    gene_mappings = pd.DataFrame(sorted(gene_id_symbol_map_tuples), columns=[\"gene_ens_id\", \"gene_symbol\"])  # sorted: set iteration order is not reproducible across runs\n",
+    "    gene_mappings.to_pickle(output_gene_mappings)\n",
+    "    print(f\"Created and saved gene mappings to {output_gene_mappings}\")\n",
+    "\n",
+    "print(f\"gene_mappings.shape: {gene_mappings.shape}\")\n",
+    "print(gene_mappings.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Simple validations\n",
+    "# no null\n",
+    "assert gene_mappings.dropna(how=\"any\").shape == gene_mappings.shape\n",
+    "# no duplicates\n",
+    "assert gene_mappings.drop_duplicates().shape == gene_mappings.shape\n",
+    "\n",
+    "_tmp = gene_mappings.set_index(\"gene_ens_id\").squeeze()\n",
+    "assert _tmp.loc[\"ENSG00000223972.5\"] == \"DDX11L1\"\n",
+    "assert _tmp.loc[\"ENSG00000243485.5\"] == \"MIR1302-2HG\"\n",
+    "assert _tmp.loc[\"ENSG00000274059.1\"] == \"5S_rRNA\"  # repeated gene\n",
+    "assert _tmp.loc[\"ENSG00000275305.1\"] == \"5S_rRNA\"  # repeated gene"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compute correlation coefficients"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We provide a command-line tool for computing CCC, Spearman, and Pearson correlations between two genes in a given tissue.\n",
+    "\n",
+    "```bash\n",
+    "usage: compute_single_gene_pair_correlations_cli.py [-h] [--tissue TISSUE] [--data-dir DATA_DIR] [--gene-mapping GENE_MAPPING] [--list-tissues] [--show-genes TISSUE] [--n-genes N_GENES] [--debug] [genes ...]\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 11:33:38,498 - root] INFO: Loading tissue data from: /mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue/gtex_v8_data_whole_blood.pkl\n",
+      "[2025-09-25 11:33:38,644 - root] INFO: Tissue data shape: (56200, 755)\n",
+      "[2025-09-25 11:33:38,644 - root] INFO: Loading gene mapping from: /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n",
+      "[2025-09-25 11:33:38,649 - root] INFO: Loaded 56200 gene mappings\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=== Tissue: whole_blood ===\n",
+      "Total genes: 56,200\n",
+      "Total samples: 755\n",
+      "\n",
+      "First 20 genes:\n",
+      "------------------------------------------------------------\n",
+      "#    Gene Symbol     Ensembl ID          \n",
+      "------------------------------------------------------------\n",
+      "1    DDX11L1         ENSG00000223972.5   \n",
+      "2    WASH7P          ENSG00000227232.5   \n",
+      "3    MIR6859-1       ENSG00000278267.1   \n",
+      "4    MIR1302-2HG     ENSG00000243485.5   \n",
+      "5    FAM138A         ENSG00000237613.2   \n",
+      "6    OR4G4P          ENSG00000268020.3   \n",
+      "7    OR4G11P         ENSG00000240361.1   \n",
+      "8    OR4F5           ENSG00000186092.4   \n",
+      "9    RP11-34P13.7    ENSG00000238009.6   \n",
+      "10   CICP27          ENSG00000233750.3   \n",
+      "11   RP11-34P13.15   ENSG00000268903.1   \n",
+      "12   RP11-34P13.16   ENSG00000269981.1   \n",
+      "13   RP11-34P13.14   ENSG00000239906.1   \n",
+      "14   RP11-34P13.13   ENSG00000241860.6   \n",
+      "15   RNU6-1100P      ENSG00000222623.1   \n",
+      "16   RP11-34P13.9    ENSG00000241599.1   \n",
+      "17   ABC7-43046700E7.1 ENSG00000279928.2   \n",
+      "18   RP11-34P13.18   ENSG00000279457.4   \n",
+      "19   MIR6859-2       ENSG00000273874.1   \n",
+      "20   AP006222.2      ENSG00000228463.9   \n",
+      "... and 56,180 more genes\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Make sure you start the notebook from the ROOT directory of the project\n",
+    "\n",
+    "# Preview genes in a tissue\n",
+    "%run ./nbs/common/compute_single_gene_pair_correlations_cli.py --show-genes whole_blood --data-dir {TISSUE_DATA_DIR} --gene-mapping {ANALYSIS_DIR}/gtex_gene_id_symbol_mappings.pkl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 11:33:38,676 - root] INFO: Loading gene mapping from: /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n",
+      "[2025-09-25 11:33:38,681 - root] INFO: Loaded 56200 gene mappings\n",
+      "[2025-09-25 11:33:38,686 - root] INFO: Loading tissue data from: /mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue/gtex_v8_data_whole_blood.pkl\n",
+      "[2025-09-25 11:33:38,824 - root] INFO: Tissue data shape: (56200, 755)\n",
+      "[2025-09-25 11:33:38,827 - root] INFO: Computing correlations for 755 samples\n",
+      "[2025-09-25 11:33:38,832 - root] INFO: Computing CCC correlation...\n",
+      "[2025-09-25 11:33:38,857 - root] INFO: Computing Pearson correlation...\n",
+      "[2025-09-25 11:33:38,871 - root] INFO: Computing Spearman correlation...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "============================================================\n",
+      "GENE PAIR CORRELATION RESULTS\n",
+      "============================================================\n",
+      "Gene 1: DDX11L1 (ENSG00000223972.5)\n",
+      "Gene 2: WASH7P (ENSG00000227232.5)\n",
+      "Tissue: whole_blood\n",
+      "Samples: 755\n",
+      "------------------------------------------------------------\n",
+      "         CCC: 0.005060\n",
+      "     PEARSON: 0.063041\n",
+      "    SPEARMAN: 0.040069\n",
+      "============================================================\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Compute CCC, Spearman, and Pearson correlations between two genes in a given tissue\n",
+    "%run ./nbs/common/compute_single_gene_pair_correlations_cli.py DDX11L1 WASH7P --tissue whole_blood --data-dir {TISSUE_DATA_DIR} --gene-mapping {ANALYSIS_DIR}/gtex_gene_id_symbol_mappings.pkl"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Metadata Correlation\n",
+    "We will compute correlations between gene expression and the sample/subject metadata for each tissue. The metadata can be downloaded from: https://www.gtexportal.org/home/downloads/adult-gtex/metadata"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data Preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(22951, 62)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load GTEx samples info\n",
+    "gtex_samples = pd.read_csv(file_paths[\"gtex_sample_attrs\"], sep=\"\\t\", index_col=\"SAMPID\")\n",
+    "print(gtex_samples.shape)\n",
+    "assert gtex_samples.index.is_unique"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(980, 4)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load GTEx subject attributes (read without an index column, so subjects are identified by the SUBJID column)\n",
+    "gtex_phenotypes = pd.read_csv(file_paths[\"gtex_subject_attrs\"], sep=\"\\t\")\n",
+    "print(gtex_phenotypes.shape)\n",
+    "assert gtex_phenotypes[\"SUBJID\"].is_unique, \"duplicated SUBJID in subject attributes\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['GTEX-1117F-0003-SM-58Q7G', 'GTEX-1117F-0003-SM-5DWSB', 'GTEX-1117F-0003-SM-6WBT7', 'GTEX-1117F-0011-R10a-SM-AHZ7F', 'GTEX-1117F-0011-R10b-SM-CYKQ8']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get the list of GTEx sample IDs\n",
+    "gtex_samples_ids = gtex_samples.index.to_list()\n",
+    "print(gtex_samples_ids[:5])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0             GTEX-1117F-0003-SM-58Q7G\n",
+       "1             GTEX-1117F-0003-SM-5DWSB\n",
+       "2             GTEX-1117F-0003-SM-6WBT7\n",
+       "3        GTEX-1117F-0011-R10a-SM-AHZ7F\n",
+       "4        GTEX-1117F-0011-R10b-SM-CYKQ8\n",
+       "                     ...              \n",
+       "22946                   K-562-SM-E9EZC\n",
+       "22947                   K-562-SM-E9EZI\n",
+       "22948                   K-562-SM-E9EZO\n",
+       "22949                   K-562-SM-E9EZT\n",
+       "22950                   K-562-SM-E9EZZ\n",
+       "Name: SAMPID, Length: 22951, dtype: object"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gtex_samples_ids = pd.Series(gtex_samples_ids).rename(\"SAMPID\")\n",
+    "gtex_samples_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0        GTEX-1117F\n",
+       "1        GTEX-1117F\n",
+       "2        GTEX-1117F\n",
+       "3        GTEX-1117F\n",
+       "4        GTEX-1117F\n",
+       "            ...    \n",
+       "22946         K-562\n",
+       "22947         K-562\n",
+       "22948         K-562\n",
+       "22949         K-562\n",
+       "22950         K-562\n",
+       "Name: SUBJID, Length: 22951, dtype: object"
+      ]
+     },
+     "execution_count": 91,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gtex_subjects_ids = gtex_samples_ids.str.extract(\n",
+    "    r\"([\\w\\d]+\\-[\\w\\d]+)\", flags=re.IGNORECASE, expand=True\n",
+    ")[0].rename(\"SUBJID\")\n",
+    "\n",
+    "gtex_subjects_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SAMPID</th>\n",
+       "      <th>SUBJID</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GTEX-1117F-0003-SM-58Q7G</td>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GTEX-1117F-0003-SM-5DWSB</td>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GTEX-1117F-0003-SM-6WBT7</td>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GTEX-1117F-0011-R10a-SM-AHZ7F</td>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GTEX-1117F-0011-R10b-SM-CYKQ8</td>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22946</th>\n",
+       "      <td>K-562-SM-E9EZC</td>\n",
+       "      <td>K-562</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22947</th>\n",
+       "      <td>K-562-SM-E9EZI</td>\n",
+       "      <td>K-562</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22948</th>\n",
+       "      <td>K-562-SM-E9EZO</td>\n",
+       "      <td>K-562</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22949</th>\n",
+       "      <td>K-562-SM-E9EZT</td>\n",
+       "      <td>K-562</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22950</th>\n",
+       "      <td>K-562-SM-E9EZZ</td>\n",
+       "      <td>K-562</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>22951 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                              SAMPID      SUBJID\n",
+       "0           GTEX-1117F-0003-SM-58Q7G  GTEX-1117F\n",
+       "1           GTEX-1117F-0003-SM-5DWSB  GTEX-1117F\n",
+       "2           GTEX-1117F-0003-SM-6WBT7  GTEX-1117F\n",
+       "3      GTEX-1117F-0011-R10a-SM-AHZ7F  GTEX-1117F\n",
+       "4      GTEX-1117F-0011-R10b-SM-CYKQ8  GTEX-1117F\n",
+       "...                              ...         ...\n",
+       "22946                 K-562-SM-E9EZC       K-562\n",
+       "22947                 K-562-SM-E9EZI       K-562\n",
+       "22948                 K-562-SM-E9EZO       K-562\n",
+       "22949                 K-562-SM-E9EZT       K-562\n",
+       "22950                 K-562-SM-E9EZZ       K-562\n",
+       "\n",
+       "[22951 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 92,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gtex_metadata = pd.concat([gtex_samples_ids, gtex_subjects_ids], axis=1)\n",
+    "gtex_metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SUBJID</th>\n",
+       "      <th>SEX</th>\n",
+       "      <th>AGE</th>\n",
+       "      <th>DTHHRDY</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>2</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GTEX-111CU</td>\n",
+       "      <td>1</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GTEX-111FC</td>\n",
+       "      <td>1</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GTEX-111VG</td>\n",
+       "      <td>1</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>3.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GTEX-111YS</td>\n",
+       "      <td>1</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>975</th>\n",
+       "      <td>GTEX-ZYY3</td>\n",
+       "      <td>2</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>976</th>\n",
+       "      <td>GTEX-ZZ64</td>\n",
+       "      <td>1</td>\n",
+       "      <td>20-29</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>977</th>\n",
+       "      <td>GTEX-ZZPT</td>\n",
+       "      <td>1</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>978</th>\n",
+       "      <td>GTEX-ZZPU</td>\n",
+       "      <td>2</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>979</th>\n",
+       "      <td>K-562</td>\n",
+       "      <td>2</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>980 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         SUBJID  SEX    AGE  DTHHRDY\n",
+       "0    GTEX-1117F    2  60-69      4.0\n",
+       "1    GTEX-111CU    1  50-59      0.0\n",
+       "2    GTEX-111FC    1  60-69      1.0\n",
+       "3    GTEX-111VG    1  60-69      3.0\n",
+       "4    GTEX-111YS    1  60-69      0.0\n",
+       "..          ...  ...    ...      ...\n",
+       "975   GTEX-ZYY3    2  60-69      4.0\n",
+       "976   GTEX-ZZ64    1  20-29      0.0\n",
+       "977   GTEX-ZZPT    1  50-59      4.0\n",
+       "978   GTEX-ZZPU    2  50-59      0.0\n",
+       "979       K-562    2  50-59      NaN\n",
+       "\n",
+       "[980 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 93,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gtex_phenotypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SUBJID</th>\n",
+       "      <th>SEX</th>\n",
+       "      <th>AGE</th>\n",
+       "      <th>DTHHRDY</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SAMPID</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0003-SM-58Q7G</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>2</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0003-SM-5DWSB</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>2</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0003-SM-6WBT7</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>2</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0011-R10a-SM-AHZ7F</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>2</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0011-R10b-SM-CYKQ8</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>2</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>K-562-SM-E9EZC</th>\n",
+       "      <td>K-562</td>\n",
+       "      <td>2</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>K-562-SM-E9EZI</th>\n",
+       "      <td>K-562</td>\n",
+       "      <td>2</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>K-562-SM-E9EZO</th>\n",
+       "      <td>K-562</td>\n",
+       "      <td>2</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>K-562-SM-E9EZT</th>\n",
+       "      <td>K-562</td>\n",
+       "      <td>2</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>K-562-SM-E9EZZ</th>\n",
+       "      <td>K-562</td>\n",
+       "      <td>2</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>22951 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                   SUBJID  SEX    AGE  DTHHRDY\n",
+       "SAMPID                                                        \n",
+       "GTEX-1117F-0003-SM-58Q7G       GTEX-1117F    2  60-69      4.0\n",
+       "GTEX-1117F-0003-SM-5DWSB       GTEX-1117F    2  60-69      4.0\n",
+       "GTEX-1117F-0003-SM-6WBT7       GTEX-1117F    2  60-69      4.0\n",
+       "GTEX-1117F-0011-R10a-SM-AHZ7F  GTEX-1117F    2  60-69      4.0\n",
+       "GTEX-1117F-0011-R10b-SM-CYKQ8  GTEX-1117F    2  60-69      4.0\n",
+       "...                                   ...  ...    ...      ...\n",
+       "K-562-SM-E9EZC                      K-562    2  50-59      NaN\n",
+       "K-562-SM-E9EZI                      K-562    2  50-59      NaN\n",
+       "K-562-SM-E9EZO                      K-562    2  50-59      NaN\n",
+       "K-562-SM-E9EZT                      K-562    2  50-59      NaN\n",
+       "K-562-SM-E9EZZ                      K-562    2  50-59      NaN\n",
+       "\n",
+       "[22951 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 94,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gtex_metadata = pd.merge(gtex_metadata, gtex_phenotypes, on=\"SUBJID\", validate=\"many_to_one\").set_index(\"SAMPID\")  # many samples per subject; fails fast if a SUBJID is duplicated in the phenotypes\n",
+    "gtex_metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SUBJID</th>\n",
+       "      <th>SEX</th>\n",
+       "      <th>AGE</th>\n",
+       "      <th>DTHHRDY</th>\n",
+       "      <th>SMATSSCR</th>\n",
+       "      <th>SMCENTER</th>\n",
+       "      <th>SMPTHNTS</th>\n",
+       "      <th>SMRIN</th>\n",
+       "      <th>SMTS</th>\n",
+       "      <th>SMTSD</th>\n",
+       "      <th>...</th>\n",
+       "      <th>SME1ANTI</th>\n",
+       "      <th>SMSPLTRD</th>\n",
+       "      <th>SMBSMMRT</th>\n",
+       "      <th>SME1SNSE</th>\n",
+       "      <th>SME1PCTS</th>\n",
+       "      <th>SMRRNART</th>\n",
+       "      <th>SME1MPRT</th>\n",
+       "      <th>SMNUM5CD</th>\n",
+       "      <th>SMDPMPRT</th>\n",
+       "      <th>SME2PCTS</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SAMPID</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0003-SM-58Q7G</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>B1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Blood</td>\n",
+       "      <td>Whole Blood</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0003-SM-5DWSB</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>B1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Blood</td>\n",
+       "      <td>Whole Blood</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0003-SM-6WBT7</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>B1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Blood</td>\n",
+       "      <td>Whole Blood</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0011-R10a-SM-AHZ7F</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>B1, A1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Brain</td>\n",
+       "      <td>Brain - Frontal Cortex (BA9)</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GTEX-1117F-0011-R10b-SM-CYKQ8</th>\n",
+       "      <td>GTEX-1117F</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>B1, A1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>7.2</td>\n",
+       "      <td>Brain</td>\n",
+       "      <td>Brain - Frontal Cortex (BA9)</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 66 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                   SUBJID     SEX    AGE  DTHHRDY  SMATSSCR  \\\n",
+       "SAMPID                                                                        \n",
+       "GTEX-1117F-0003-SM-58Q7G       GTEX-1117F  Female  60-69      4.0       NaN   \n",
+       "GTEX-1117F-0003-SM-5DWSB       GTEX-1117F  Female  60-69      4.0       NaN   \n",
+       "GTEX-1117F-0003-SM-6WBT7       GTEX-1117F  Female  60-69      4.0       NaN   \n",
+       "GTEX-1117F-0011-R10a-SM-AHZ7F  GTEX-1117F  Female  60-69      4.0       NaN   \n",
+       "GTEX-1117F-0011-R10b-SM-CYKQ8  GTEX-1117F  Female  60-69      4.0       NaN   \n",
+       "\n",
+       "                              SMCENTER SMPTHNTS  SMRIN   SMTS  \\\n",
+       "SAMPID                                                          \n",
+       "GTEX-1117F-0003-SM-58Q7G            B1      NaN    NaN  Blood   \n",
+       "GTEX-1117F-0003-SM-5DWSB            B1      NaN    NaN  Blood   \n",
+       "GTEX-1117F-0003-SM-6WBT7            B1      NaN    NaN  Blood   \n",
+       "GTEX-1117F-0011-R10a-SM-AHZ7F   B1, A1      NaN    NaN  Brain   \n",
+       "GTEX-1117F-0011-R10b-SM-CYKQ8   B1, A1      NaN    7.2  Brain   \n",
+       "\n",
+       "                                                      SMTSD  ... SME1ANTI  \\\n",
+       "SAMPID                                                       ...            \n",
+       "GTEX-1117F-0003-SM-58Q7G                        Whole Blood  ...      NaN   \n",
+       "GTEX-1117F-0003-SM-5DWSB                        Whole Blood  ...      NaN   \n",
+       "GTEX-1117F-0003-SM-6WBT7                        Whole Blood  ...      NaN   \n",
+       "GTEX-1117F-0011-R10a-SM-AHZ7F  Brain - Frontal Cortex (BA9)  ...      NaN   \n",
+       "GTEX-1117F-0011-R10b-SM-CYKQ8  Brain - Frontal Cortex (BA9)  ...      NaN   \n",
+       "\n",
+       "                               SMSPLTRD  SMBSMMRT SME1SNSE SME1PCTS SMRRNART  \\\n",
+       "SAMPID                                                                         \n",
+       "GTEX-1117F-0003-SM-58Q7G            NaN       NaN      NaN      NaN      NaN   \n",
+       "GTEX-1117F-0003-SM-5DWSB            NaN       NaN      NaN      NaN      NaN   \n",
+       "GTEX-1117F-0003-SM-6WBT7            NaN       NaN      NaN      NaN      NaN   \n",
+       "GTEX-1117F-0011-R10a-SM-AHZ7F       NaN       NaN      NaN      NaN      NaN   \n",
+       "GTEX-1117F-0011-R10b-SM-CYKQ8       NaN       NaN      NaN      NaN      NaN   \n",
+       "\n",
+       "                              SME1MPRT SMNUM5CD SMDPMPRT SME2PCTS  \n",
+       "SAMPID                                                             \n",
+       "GTEX-1117F-0003-SM-58Q7G           NaN      NaN      NaN      NaN  \n",
+       "GTEX-1117F-0003-SM-5DWSB           NaN      NaN      NaN      NaN  \n",
+       "GTEX-1117F-0003-SM-6WBT7           NaN      NaN      NaN      NaN  \n",
+       "GTEX-1117F-0011-R10a-SM-AHZ7F      NaN      NaN      NaN      NaN  \n",
+       "GTEX-1117F-0011-R10b-SM-CYKQ8      NaN      NaN      NaN      NaN  \n",
+       "\n",
+       "[5 rows x 66 columns]"
+      ]
+     },
+     "execution_count": 95,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gtex_metadata = pd.merge(gtex_metadata, gtex_samples, left_index=True, right_index=True)\n",
+    "\n",
+    "gtex_metadata = gtex_metadata.replace(\n",
+    "    {\n",
+    "        \"SEX\": {\n",
+    "            1: \"Male\",\n",
+    "            2: \"Female\",\n",
+    "        }\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "gtex_metadata = gtex_metadata.sort_index()\n",
+    "\n",
+    "gtex_metadata.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Simple validations\n",
+    "assert not gtex_metadata[\"SUBJID\"].isna().any()\n",
+    "\n",
+    "assert not gtex_metadata[\"SMTS\"].isna().any()\n",
+    "assert not gtex_metadata[\"SMTSD\"].isna().any()\n",
+    "\n",
+    "assert not gtex_metadata[\"SEX\"].isna().any()\n",
+    "assert gtex_metadata[\"SEX\"].unique().shape[0] == 2\n",
+    "assert set(gtex_metadata[\"SEX\"].unique()) == {\"Female\", \"Male\"}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metadata\n",
+    "gtex_metadatadata_filename = ANALYSIS_DIR / \"gtex_v8-sample_metadata.pkl\"\n",
+    "gtex_metadata.to_pickle(gtex_metadatadata_filename)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Metadata correlation\n",
+    "We also provide a command-line tool, `nbs/common/metadata_corr_cli.py`, that computes the CCC between the expression of one or more genes and every metadata column, separately for each tissue. Its help message is shown below:\n",
+    "\n",
+    "```text\n",
+    "usage: metadata_corr_cli.py [-h] [--expr-data-dir EXPR_DATA_DIR] [--include [INCLUDE ...]] [--exclude [EXCLUDE ...]] [--permutations PERMUTATIONS]\n",
+    "                            [--n-jobs N_JOBS] [--list-metadata-columns] [--list-tissues] [--output-dir OUTPUT_DIR] [--quiet] [--no-csv-output]\n",
+    "                            [--no-individual-logs] [--data-dir DATA_DIR]\n",
+    "                            gene_symbols [gene_symbols ...]\n",
+    "\n",
+    "Analyze gene expression correlations with metadata using CCC across multiple tissues\n",
+    "\n",
+    "positional arguments:\n",
+    "  gene_symbols          Gene symbol(s) to analyze (e.g., RASSF2 TP53 BRCA1)\n",
+    "\n",
+    "options:\n",
+    "  -h, --help            show this help message and exit\n",
+    "  --expr-data-dir EXPR_DATA_DIR\n",
+    "                        Directory containing expression data files (default: /pividori_lab/haoyu_projects/ccc-gpu/data/gtex/gene_selection/all)\n",
+    "  --include [INCLUDE ...]\n",
+    "                        Include only tissues matching these patterns (fuzzy match on tissue name) (default: None)\n",
+    "  --exclude [EXCLUDE ...]\n",
+    "                        Exclude tissues matching these patterns (fuzzy match on tissue name) (default: None)\n",
+    "  --permutations PERMUTATIONS\n",
+    "                        Number of permutations for p-value calculation (default: 100000)\n",
+    "  --n-jobs N_JOBS       Number of parallel jobs for computation (default: 4)\n",
+    "  --list-metadata-columns\n",
+    "                        List available metadata columns and exit (default: False)\n",
+    "  --list-tissues        List available tissue files and exit (default: False)\n",
+    "  --output-dir OUTPUT_DIR\n",
+    "                        Directory to save output files (default: current directory) (default: .)\n",
+    "  --quiet               Reduce output verbosity for batch processing (default: False)\n",
+    "  --no-csv-output       Skip CSV file generation (only create pickle files) (default: False)\n",
+    "  --no-individual-logs  Skip individual tissue log files (only keep summary logs) (default: False)\n",
+    "  --data-dir DATA_DIR   Directory containing GTEx data files (metadata and gene mappings) (default: /pividori_lab/haoyu_projects/ccc-gpu/data/gtex)\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "METADATA_CORRELATIONS_RESULT_DIR = ANALYSIS_DIR / \"metadata_correlations\"\n",
+    "os.makedirs(METADATA_CORRELATIONS_RESULT_DIR, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:17,840 - summary] INFO: Output directory: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations\n",
+      "[2025-09-25 13:05:17,840 - summary] INFO: Summary log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+      "[2025-09-25 13:05:17,840 - summary] INFO: Summary tables file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n",
+      "[2025-09-25 13:05:17,840 - summary] INFO: Gene symbols to analyze: RASSF2, CYTIP\n",
+      "[2025-09-25 13:05:17,857 - summary] INFO: \n",
+      "====================================================================================================\n",
+      "[2025-09-25 13:05:17,858 - summary] INFO: PROCESSING GENE 1/2: RASSF2\n",
+      "[2025-09-25 13:05:17,858 - summary] INFO: ====================================================================================================\n",
+      "[2025-09-25 13:05:17,858 - summary] INFO: \n",
+      "[1/1] Starting processing for RASSF2 in whole_blood...\n",
+      "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: \n",
+      "============================================================\n",
+      "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: Processing tissue: whole_blood\n",
+      "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: File: gtex_v8_data_whole_blood.pkl\n",
+      "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+      "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: ============================================================\n",
+      "[2025-09-25 13:05:17,860 - tissue_RASSF2_whole_blood] INFO: Loading expression data...\n",
+      "[2025-09-25 13:05:18,013 - tissue_RASSF2_whole_blood] INFO: Expression data shape: (56200, 755)\n",
+      "[2025-09-25 13:05:18,016 - tissue_RASSF2_whole_blood] INFO: Gene ID for RASSF2: ENSG00000101265.15\n",
+      "[2025-09-25 13:05:18,019 - tissue_RASSF2_whole_blood] INFO: Number of samples: 755\n",
+      "[2025-09-25 13:05:18,021 - tissue_RASSF2_whole_blood] INFO: Common samples: 755\n",
+      "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Computing CCC between RASSF2 expression and all metadata columns...\n",
+      "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Using 100000 permutations and 4 jobs\n",
+      "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Processing 66 metadata columns...\n",
+      "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Processing column 1/66: SUBJID\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Output directory: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations\n",
+      "Summary log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+      "Summary tables file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n",
+      "Gene symbols to analyze: RASSF2, CYTIP\n",
+      "Found 1 expression files to process:\n",
+      "  whole_blood: gtex_v8_data_whole_blood.pkl\n",
+      "Loading metadata and gene mapping files...\n",
+      "Loaded metadata: (22951, 66)\n",
+      "Loaded gene mapping: (56200, 2)\n",
+      "\n",
+      "====================================================================================================\n",
+      "PROCESSING GENE 1/2: RASSF2\n",
+      "====================================================================================================\n",
+      "\n",
+      "[1/1] Starting processing for RASSF2 in whole_blood...\n",
+      "\n",
+      "============================================================\n",
+      "Processing tissue: whole_blood\n",
+      "File: gtex_v8_data_whole_blood.pkl\n",
+      "Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+      "============================================================\n",
+      "Loading expression data...\n",
+      "Expression data shape: (56200, 755)\n",
+      "Gene ID for RASSF2: ENSG00000101265.15\n",
+      "Number of samples: 755\n",
+      "Common samples: 755\n",
+      "Computing CCC between RASSF2 expression and all metadata columns...\n",
+      "Using 100000 permutations and 4 jobs\n",
+      "Processing 66 metadata columns...\n",
+      "Processing column 1/66: SUBJID\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:18,143 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.000000, p-value: 1.00e+00\n",
+      "[2025-09-25 13:05:18,144 - tissue_RASSF2_whole_blood] INFO: Processing column 2/66: SEX\n",
+      "[2025-09-25 13:05:18,217 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.007134, p-value: 1.23e-02\n",
+      "[2025-09-25 13:05:18,217 - tissue_RASSF2_whole_blood] INFO: Processing column 3/66: AGE\n",
+      "[2025-09-25 13:05:18,291 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.039824, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:18,291 - tissue_RASSF2_whole_blood] INFO: Processing column 4/66: DTHHRDY\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.000000, p-value: 1.00e+00\n",
+      "Processing column 2/66: SEX\n",
+      "  CCC: 0.007134, p-value: 1.23e-02\n",
+      "Processing column 3/66: AGE\n",
+      "  CCC: 0.039824, p-value: 1.00e-05\n",
+      "Processing column 4/66: DTHHRDY\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:18,547 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.464582, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Processing column 5/66: SMATSSCR\n",
+      "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO:   Skipping SMATSSCR: all values are NaN\n",
+      "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Processing column 6/66: SMCENTER\n",
+      "[2025-09-25 13:05:18,618 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.108148, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:18,618 - tissue_RASSF2_whole_blood] INFO: Processing column 7/66: SMPTHNTS\n",
+      "[2025-09-25 13:05:18,619 - tissue_RASSF2_whole_blood] INFO:   Skipping SMPTHNTS: all values are NaN\n",
+      "[2025-09-25 13:05:18,619 - tissue_RASSF2_whole_blood] INFO: Processing column 8/66: SMRIN\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.464582, p-value: 1.00e-05\n",
+      "Processing column 5/66: SMATSSCR\n",
+      "  Skipping SMATSSCR: all values are NaN\n",
+      "Processing column 6/66: SMCENTER\n",
+      "  CCC: 0.108148, p-value: 1.00e-05\n",
+      "Processing column 7/66: SMPTHNTS\n",
+      "  Skipping SMPTHNTS: all values are NaN\n",
+      "Processing column 8/66: SMRIN\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:18,872 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.048847, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:18,872 - tissue_RASSF2_whole_blood] INFO: Processing column 9/66: SMTS\n",
+      "[2025-09-25 13:05:18,873 - tissue_RASSF2_whole_blood] INFO:   Skipping SMTS: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Processing column 10/66: SMTSD\n",
+      "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO:   Skipping SMTSD: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Processing column 11/66: SMUBRID\n",
+      "[2025-09-25 13:05:18,875 - tissue_RASSF2_whole_blood] INFO:   Skipping SMUBRID: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:18,875 - tissue_RASSF2_whole_blood] INFO: Processing column 12/66: SMTSISCH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.048847, p-value: 1.00e-05\n",
+      "Processing column 9/66: SMTS\n",
+      "  Skipping SMTS: only 1 unique value(s)\n",
+      "Processing column 10/66: SMTSD\n",
+      "  Skipping SMTSD: only 1 unique value(s)\n",
+      "Processing column 11/66: SMUBRID\n",
+      "  Skipping SMUBRID: only 1 unique value(s)\n",
+      "Processing column 12/66: SMTSISCH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:19,129 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.528125, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Processing column 13/66: SMTSPAX\n",
+      "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO:   Skipping SMTSPAX: all values are NaN\n",
+      "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Processing column 14/66: SMNABTCH\n",
+      "[2025-09-25 13:05:19,194 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.000884, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:19,195 - tissue_RASSF2_whole_blood] INFO: Processing column 15/66: SMNABTCHT\n",
+      "[2025-09-25 13:05:19,196 - tissue_RASSF2_whole_blood] INFO:   Skipping SMNABTCHT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:19,196 - tissue_RASSF2_whole_blood] INFO: Processing column 16/66: SMNABTCHD\n",
+      "[2025-09-25 13:05:19,259 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.000900, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:19,259 - tissue_RASSF2_whole_blood] INFO: Processing column 17/66: SMGEBTCH\n",
+      "[2025-09-25 13:05:19,316 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.003663, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:19,316 - tissue_RASSF2_whole_blood] INFO: Processing column 18/66: SMGEBTCHD\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.528125, p-value: 1.00e-05\n",
+      "Processing column 13/66: SMTSPAX\n",
+      "  Skipping SMTSPAX: all values are NaN\n",
+      "Processing column 14/66: SMNABTCH\n",
+      "  CCC: 0.000884, p-value: 1.00e-05\n",
+      "Processing column 15/66: SMNABTCHT\n",
+      "  Skipping SMNABTCHT: only 1 unique value(s)\n",
+      "Processing column 16/66: SMNABTCHD\n",
+      "  CCC: 0.000900, p-value: 1.00e-05\n",
+      "Processing column 17/66: SMGEBTCH\n",
+      "  CCC: 0.003663, p-value: 1.00e-05\n",
+      "Processing column 18/66: SMGEBTCHD\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:19,374 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.005827, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:19,374 - tissue_RASSF2_whole_blood] INFO: Processing column 19/66: SMGEBTCHT\n",
+      "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO:   Skipping SMGEBTCHT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO: Processing column 20/66: SMAFRZE\n",
+      "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO:   Skipping SMAFRZE: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Processing column 21/66: SMGTC\n",
+      "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO:   Skipping SMGTC: all values are NaN\n",
+      "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Processing column 22/66: SME2MPRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.005827, p-value: 1.00e-05\n",
+      "Processing column 19/66: SMGEBTCHT\n",
+      "  Skipping SMGEBTCHT: only 1 unique value(s)\n",
+      "Processing column 20/66: SMAFRZE\n",
+      "  Skipping SMAFRZE: only 1 unique value(s)\n",
+      "Processing column 21/66: SMGTC\n",
+      "  Skipping SMGTC: all values are NaN\n",
+      "Processing column 22/66: SME2MPRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:19,629 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.172974, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:19,629 - tissue_RASSF2_whole_blood] INFO: Processing column 23/66: SMCHMPRS\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.172974, p-value: 1.00e-05\n",
+      "Processing column 23/66: SMCHMPRS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:19,882 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.143365, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:19,882 - tissue_RASSF2_whole_blood] INFO: Processing column 24/66: SMNTRART\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.143365, p-value: 1.00e-05\n",
+      "Processing column 24/66: SMNTRART\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:20,136 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.243071, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Processing column 25/66: SMNUMGPS\n",
+      "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO:   Skipping SMNUMGPS: all values are NaN\n",
+      "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Processing column 26/66: SMMAPRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.243071, p-value: 1.00e-05\n",
+      "Processing column 25/66: SMNUMGPS\n",
+      "  Skipping SMNUMGPS: all values are NaN\n",
+      "Processing column 26/66: SMMAPRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:20,392 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.168576, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:20,393 - tissue_RASSF2_whole_blood] INFO: Processing column 27/66: SMEXNCRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.168576, p-value: 1.00e-05\n",
+      "Processing column 27/66: SMEXNCRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:20,646 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.040140, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:20,647 - tissue_RASSF2_whole_blood] INFO: Processing column 28/66: SM550NRM\n",
+      "[2025-09-25 13:05:20,647 - tissue_RASSF2_whole_blood] INFO:   Skipping SM550NRM: all values are NaN\n",
+      "[2025-09-25 13:05:20,648 - tissue_RASSF2_whole_blood] INFO: Processing column 29/66: SMGNSDTC\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.040140, p-value: 1.00e-05\n",
+      "Processing column 28/66: SM550NRM\n",
+      "  Skipping SM550NRM: all values are NaN\n",
+      "Processing column 29/66: SMGNSDTC\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:20,902 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.043013, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Processing column 30/66: SMUNMPRT\n",
+      "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO:   Skipping SMUNMPRT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Processing column 31/66: SM350NRM\n",
+      "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO:   Skipping SM350NRM: all values are NaN\n",
+      "[2025-09-25 13:05:20,904 - tissue_RASSF2_whole_blood] INFO: Processing column 32/66: SMRDLGTH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.043013, p-value: 1.00e-05\n",
+      "Processing column 30/66: SMUNMPRT\n",
+      "  Skipping SMUNMPRT: only 1 unique value(s)\n",
+      "Processing column 31/66: SM350NRM\n",
+      "  Skipping SM350NRM: all values are NaN\n",
+      "Processing column 32/66: SMRDLGTH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:21,156 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.000028, p-value: 1.73e-01\n",
+      "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Processing column 33/66: SMMNCPB\n",
+      "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO:   Skipping SMMNCPB: all values are NaN\n",
+      "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Processing column 34/66: SME1MMRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.000028, p-value: 1.73e-01\n",
+      "Processing column 33/66: SMMNCPB\n",
+      "  Skipping SMMNCPB: all values are NaN\n",
+      "Processing column 34/66: SME1MMRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:21,411 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.018125, p-value: 1.40e-04\n",
+      "[2025-09-25 13:05:21,412 - tissue_RASSF2_whole_blood] INFO: Processing column 35/66: SMSFLGTH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.018125, p-value: 1.40e-04\n",
+      "Processing column 35/66: SMSFLGTH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:21,665 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.047258, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Processing column 36/66: SMESTLBS\n",
+      "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO:   Skipping SMESTLBS: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Processing column 37/66: SMMPPD\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.047258, p-value: 1.00e-05\n",
+      "Processing column 36/66: SMESTLBS\n",
+      "  Skipping SMESTLBS: only 1 unique value(s)\n",
+      "Processing column 37/66: SMMPPD\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:21,921 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.007761, p-value: 3.43e-02\n",
+      "[2025-09-25 13:05:21,921 - tissue_RASSF2_whole_blood] INFO: Processing column 38/66: SMNTERRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007761, p-value: 3.43e-02\n",
+      "Processing column 38/66: SMNTERRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:22,175 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.250997, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:22,175 - tissue_RASSF2_whole_blood] INFO: Processing column 39/66: SMRRNANM\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.250997, p-value: 1.00e-05\n",
+      "Processing column 39/66: SMRRNANM\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:22,430 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.036631, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:22,430 - tissue_RASSF2_whole_blood] INFO: Processing column 40/66: SMRDTTL\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.036631, p-value: 1.00e-05\n",
+      "Processing column 40/66: SMRDTTL\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:22,686 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.010388, p-value: 6.63e-03\n",
+      "[2025-09-25 13:05:22,686 - tissue_RASSF2_whole_blood] INFO: Processing column 41/66: SMVQCFL\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.010388, p-value: 6.63e-03\n",
+      "Processing column 41/66: SMVQCFL\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:22,941 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.001442, p-value: 9.24e-01\n",
+      "[2025-09-25 13:05:22,942 - tissue_RASSF2_whole_blood] INFO: Processing column 42/66: SMMNCV\n",
+      "[2025-09-25 13:05:22,943 - tissue_RASSF2_whole_blood] INFO:   Skipping SMMNCV: all values are NaN\n",
+      "[2025-09-25 13:05:22,943 - tissue_RASSF2_whole_blood] INFO: Processing column 43/66: SMTRSCPT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.001442, p-value: 9.24e-01\n",
+      "Processing column 42/66: SMMNCV\n",
+      "  Skipping SMMNCV: all values are NaN\n",
+      "Processing column 43/66: SMTRSCPT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:23,199 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.042714, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:23,199 - tissue_RASSF2_whole_blood] INFO: Processing column 44/66: SMMPPDPR\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.042714, p-value: 1.00e-05\n",
+      "Processing column 44/66: SMMPPDPR\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:23,453 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.007761, p-value: 3.48e-02\n",
+      "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Processing column 45/66: SMCGLGTH\n",
+      "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO:   Skipping SMCGLGTH: all values are NaN\n",
+      "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Processing column 46/66: SMGAPPCT\n",
+      "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO:   Skipping SMGAPPCT: all values are NaN\n",
+      "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Processing column 47/66: SMUNPDRD\n",
+      "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO:   Skipping SMUNPDRD: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Processing column 48/66: SMNTRNRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007761, p-value: 3.48e-02\n",
+      "Processing column 45/66: SMCGLGTH\n",
+      "  Skipping SMCGLGTH: all values are NaN\n",
+      "Processing column 46/66: SMGAPPCT\n",
+      "  Skipping SMGAPPCT: all values are NaN\n",
+      "Processing column 47/66: SMUNPDRD\n",
+      "  Skipping SMUNPDRD: only 1 unique value(s)\n",
+      "Processing column 48/66: SMNTRNRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:23,710 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.202936, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:23,710 - tissue_RASSF2_whole_blood] INFO: Processing column 49/66: SMMPUNRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.202936, p-value: 1.00e-05\n",
+      "Processing column 49/66: SMMPUNRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:23,964 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.168576, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:23,964 - tissue_RASSF2_whole_blood] INFO: Processing column 50/66: SMEXPEFF\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.168576, p-value: 1.00e-05\n",
+      "Processing column 50/66: SMEXPEFF\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:24,219 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.059931, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:24,219 - tissue_RASSF2_whole_blood] INFO: Processing column 51/66: SMMPPDUN\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.059931, p-value: 1.00e-05\n",
+      "Processing column 51/66: SMMPPDUN\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:24,474 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.007761, p-value: 3.43e-02\n",
+      "[2025-09-25 13:05:24,474 - tissue_RASSF2_whole_blood] INFO: Processing column 52/66: SME2MMRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007761, p-value: 3.43e-02\n",
+      "Processing column 52/66: SME2MMRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:24,730 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.003990, p-value: 4.05e-01\n",
+      "[2025-09-25 13:05:24,731 - tissue_RASSF2_whole_blood] INFO: Processing column 53/66: SME2ANTI\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.003990, p-value: 4.05e-01\n",
+      "Processing column 53/66: SME2ANTI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:24,987 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.020742, p-value: 1.10e-04\n",
+      "[2025-09-25 13:05:24,988 - tissue_RASSF2_whole_blood] INFO: Processing column 54/66: SMALTALG\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.020742, p-value: 1.10e-04\n",
+      "Processing column 54/66: SMALTALG\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:25,242 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.177009, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:25,242 - tissue_RASSF2_whole_blood] INFO: Processing column 55/66: SME2SNSE\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.177009, p-value: 1.00e-05\n",
+      "Processing column 55/66: SME2SNSE\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:25,496 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.019048, p-value: 1.80e-04\n",
+      "[2025-09-25 13:05:25,497 - tissue_RASSF2_whole_blood] INFO: Processing column 56/66: SMMFLGTH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.019048, p-value: 1.80e-04\n",
+      "Processing column 56/66: SMMFLGTH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:25,751 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.019296, p-value: 1.50e-04\n",
+      "[2025-09-25 13:05:25,751 - tissue_RASSF2_whole_blood] INFO: Processing column 57/66: SME1ANTI\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.019296, p-value: 1.50e-04\n",
+      "Processing column 57/66: SME1ANTI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:26,007 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.021058, p-value: 1.20e-04\n",
+      "[2025-09-25 13:05:26,008 - tissue_RASSF2_whole_blood] INFO: Processing column 58/66: SMSPLTRD\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.021058, p-value: 1.20e-04\n",
+      "Processing column 58/66: SMSPLTRD\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:26,263 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.057786, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:26,264 - tissue_RASSF2_whole_blood] INFO: Processing column 59/66: SMBSMMRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.057786, p-value: 1.00e-05\n",
+      "Processing column 59/66: SMBSMMRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:26,518 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.005333, p-value: 1.83e-01\n",
+      "[2025-09-25 13:05:26,518 - tissue_RASSF2_whole_blood] INFO: Processing column 60/66: SME1SNSE\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.005333, p-value: 1.83e-01\n",
+      "Processing column 60/66: SME1SNSE\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:26,773 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.022008, p-value: 5.00e-05\n",
+      "[2025-09-25 13:05:26,773 - tissue_RASSF2_whole_blood] INFO: Processing column 61/66: SME1PCTS\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.022008, p-value: 5.00e-05\n",
+      "Processing column 61/66: SME1PCTS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:27,030 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.032073, p-value: 2.00e-05\n",
+      "[2025-09-25 13:05:27,030 - tissue_RASSF2_whole_blood] INFO: Processing column 62/66: SMRRNART\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.032073, p-value: 2.00e-05\n",
+      "Processing column 62/66: SMRRNART\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:27,285 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.048437, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:27,286 - tissue_RASSF2_whole_blood] INFO: Processing column 63/66: SME1MPRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.048437, p-value: 1.00e-05\n",
+      "Processing column 63/66: SME1MPRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.181940, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO: Processing column 64/66: SMNUM5CD\n",
+      "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO:   Skipping SMNUM5CD: all values are NaN\n",
+      "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Processing column 65/66: SMDPMPRT\n",
+      "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO:   Skipping SMDPMPRT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Processing column 66/66: SME2PCTS\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.181940, p-value: 1.00e-05\n",
+      "Processing column 64/66: SMNUM5CD\n",
+      "  Skipping SMNUM5CD: all values are NaN\n",
+      "Processing column 65/66: SMDPMPRT\n",
+      "  Skipping SMDPMPRT: only 1 unique value(s)\n",
+      "Processing column 66/66: SME2PCTS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:27,796 - tissue_RASSF2_whole_blood] INFO:   CCC: 0.029344, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: \n",
+      "Completed processing whole_blood:\n",
+      "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO:   Total metadata columns: 66\n",
+      "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO:   Successful analyses: 44\n",
+      "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO:   Skipped/Failed: 22\n",
+      "[2025-09-25 13:05:27,821 - summary] INFO: Results for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood_correlation_results.pkl\n",
+      "[2025-09-25 13:05:27,821 - summary] INFO: Log file for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+      "[2025-09-25 13:05:27,822 - summary] INFO: Runtime for RASSF2 in whole_blood: 9.96 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:27,823 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:27,823 - summary] INFO: COMBINED RESULTS SUMMARY\n",
+      "[2025-09-25 13:05:27,823 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:27,824 - summary] INFO: Gene Symbol: RASSF2\n",
+      "[2025-09-25 13:05:27,824 - summary] INFO: Gene ID: ENSG00000101265.15\n",
+      "[2025-09-25 13:05:27,824 - summary] INFO: Permutations: 100,000\n",
+      "[2025-09-25 13:05:27,824 - summary] INFO: Tissues processed: 1\n",
+      "[2025-09-25 13:05:27,825 - summary] INFO: Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.pkl\n",
+      "[2025-09-25 13:05:27,825 - summary] INFO: Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.csv\n",
+      "[2025-09-25 13:05:27,826 - summary] INFO: \n",
+      "Total successful analyses across all tissues: 44\n",
+      "[2025-09-25 13:05:27,826 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:27,826 - summary] INFO: TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+      "[2025-09-25 13:05:27,826 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:27,827 - summary] INFO: Tissue               Metadata Column           CCC Value    P-value      Significance   \n",
+      "[2025-09-25 13:05:27,827 - summary] INFO: ------------------------------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood          SMTSISCH                    0.528125    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood          DTHHRDY                     0.464582    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood          SMNTERRT                    0.250997    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood          SMNTRART                    0.243071    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood          SMNTRNRT                    0.202936    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood          SME1MPRT                    0.181940    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood          SMALTALG                    0.177009    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood          SME2MPRT                    0.172974    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood          SMMPUNRT                    0.168576    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood          SMMAPRT                     0.168576    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood          SMCHMPRS                    0.143365    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,831 - summary] INFO: whole_blood          SMCENTER                    0.108148    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,831 - summary] INFO: whole_blood          SMEXPEFF                    0.059931    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood          SMSPLTRD                    0.057786    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood          SMRIN                       0.048847    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood          SMRRNART                    0.048437    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood          SMSFLGTH                    0.047258    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood          SMGNSDTC                    0.043013    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood          SMTRSCPT                    0.042714    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood          SMEXNCRT                    0.040140    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:27,834 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:27,834 - summary] INFO: SUMMARY BY TISSUE\n",
+      "[2025-09-25 13:05:27,834 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:27,834 - summary] INFO: Tissue               N Samples  Successful   Mean |CCC|   Max |CCC|   \n",
+      "[2025-09-25 13:05:27,835 - summary] INFO: ----------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:27,835 - summary] INFO: whole_blood          755        44           0.079987     0.528125    \n",
+      "[2025-09-25 13:05:27,835 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:27,835 - summary] INFO: RUNTIME SUMMARY\n",
+      "[2025-09-25 13:05:27,836 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:27,836 - summary] INFO: Total runtime: 9.96 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:27,836 - summary] INFO: Average runtime per tissue: 9.96 seconds\n",
+      "[2025-09-25 13:05:27,836 - summary] INFO: \n",
+      "Runtime by tissue:\n",
+      "[2025-09-25 13:05:27,837 - summary] INFO: Tissue                    Runtime (sec)   Runtime (min)   Status    \n",
+      "[2025-09-25 13:05:27,837 - summary] INFO: ----------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:27,837 - summary] INFO: whole_blood               9.96            0.17            Success   \n",
+      "[2025-09-25 13:05:27,837 - summary] INFO: \n",
+      "Fastest: whole_blood (9.96 seconds)\n",
+      "[2025-09-25 13:05:27,837 - summary] INFO: Slowest: whole_blood (9.96 seconds)\n",
+      "[2025-09-25 13:05:27,838 - summary] INFO: Speed ratio: 1.0x\n",
+      "[2025-09-25 13:05:27,838 - summary] INFO: Runtime for RASSF2: 9.96 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:27,838 - summary] INFO: \n",
+      "====================================================================================================\n",
+      "[2025-09-25 13:05:27,838 - summary] INFO: PROCESSING GENE 2/2: CYTIP\n",
+      "[2025-09-25 13:05:27,838 - summary] INFO: ====================================================================================================\n",
+      "[2025-09-25 13:05:27,839 - summary] INFO: \n",
+      "[1/1] Starting processing for CYTIP in whole_blood...\n",
+      "[2025-09-25 13:05:27,840 - tissue_CYTIP_whole_blood] INFO: \n",
+      "============================================================\n",
+      "[2025-09-25 13:05:27,840 - tissue_CYTIP_whole_blood] INFO: Processing tissue: whole_blood\n",
+      "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: File: gtex_v8_data_whole_blood.pkl\n",
+      "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+      "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: ============================================================\n",
+      "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: Loading expression data...\n",
+      "[2025-09-25 13:05:27,981 - tissue_CYTIP_whole_blood] INFO: Expression data shape: (56200, 755)\n",
+      "[2025-09-25 13:05:27,984 - tissue_CYTIP_whole_blood] INFO: Gene ID for CYTIP: ENSG00000115165.9\n",
+      "[2025-09-25 13:05:27,986 - tissue_CYTIP_whole_blood] INFO: Number of samples: 755\n",
+      "[2025-09-25 13:05:27,987 - tissue_CYTIP_whole_blood] INFO: Common samples: 755\n",
+      "[2025-09-25 13:05:27,988 - tissue_CYTIP_whole_blood] INFO: Computing CCC between CYTIP expression and all metadata columns...\n",
+      "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Using 100000 permutations and 4 jobs\n",
+      "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Processing 66 metadata columns...\n",
+      "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Processing column 1/66: SUBJID\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.029344, p-value: 1.00e-05\n",
+      "\n",
+      "Completed processing whole_blood:\n",
+      "  Total metadata columns: 66\n",
+      "  Successful analyses: 44\n",
+      "  Skipped/Failed: 22\n",
+      "Results for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood_correlation_results.pkl\n",
+      "Log file for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+      "Runtime for RASSF2 in whole_blood: 9.96 seconds (0.17 minutes)\n",
+      "\n",
+      "================================================================================\n",
+      "COMBINED RESULTS SUMMARY\n",
+      "================================================================================\n",
+      "Gene Symbol: RASSF2\n",
+      "Gene ID: ENSG00000101265.15\n",
+      "Permutations: 100,000\n",
+      "Tissues processed: 1\n",
+      "Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.pkl\n",
+      "Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.csv\n",
+      "\n",
+      "Total successful analyses across all tissues: 44\n",
+      "\n",
+      "================================================================================\n",
+      "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+      "================================================================================\n",
+      "Tissue               Metadata Column           CCC Value    P-value      Significance   \n",
+      "------------------------------------------------------------------------------------------\n",
+      "whole_blood          SMTSISCH                    0.528125    1.00e-05  ***            \n",
+      "whole_blood          DTHHRDY                     0.464582    1.00e-05  ***            \n",
+      "whole_blood          SMNTERRT                    0.250997    1.00e-05  ***            \n",
+      "whole_blood          SMNTRART                    0.243071    1.00e-05  ***            \n",
+      "whole_blood          SMNTRNRT                    0.202936    1.00e-05  ***            \n",
+      "whole_blood          SME1MPRT                    0.181940    1.00e-05  ***            \n",
+      "whole_blood          SMALTALG                    0.177009    1.00e-05  ***            \n",
+      "whole_blood          SME2MPRT                    0.172974    1.00e-05  ***            \n",
+      "whole_blood          SMMPUNRT                    0.168576    1.00e-05  ***            \n",
+      "whole_blood          SMMAPRT                     0.168576    1.00e-05  ***            \n",
+      "whole_blood          SMCHMPRS                    0.143365    1.00e-05  ***            \n",
+      "whole_blood          SMCENTER                    0.108148    1.00e-05  ***            \n",
+      "whole_blood          SMEXPEFF                    0.059931    1.00e-05  ***            \n",
+      "whole_blood          SMSPLTRD                    0.057786    1.00e-05  ***            \n",
+      "whole_blood          SMRIN                       0.048847    1.00e-05  ***            \n",
+      "whole_blood          SMRRNART                    0.048437    1.00e-05  ***            \n",
+      "whole_blood          SMSFLGTH                    0.047258    1.00e-05  ***            \n",
+      "whole_blood          SMGNSDTC                    0.043013    1.00e-05  ***            \n",
+      "whole_blood          SMTRSCPT                    0.042714    1.00e-05  ***            \n",
+      "whole_blood          SMEXNCRT                    0.040140    1.00e-05  ***            \n",
+      "\n",
+      "================================================================================\n",
+      "SUMMARY BY TISSUE\n",
+      "================================================================================\n",
+      "Tissue               N Samples  Successful   Mean |CCC|   Max |CCC|   \n",
+      "----------------------------------------------------------------------\n",
+      "whole_blood          755        44           0.079987     0.528125    \n",
+      "\n",
+      "================================================================================\n",
+      "RUNTIME SUMMARY\n",
+      "================================================================================\n",
+      "Total runtime: 9.96 seconds (0.17 minutes)\n",
+      "Average runtime per tissue: 9.96 seconds\n",
+      "\n",
+      "Runtime by tissue:\n",
+      "Tissue                    Runtime (sec)   Runtime (min)   Status    \n",
+      "----------------------------------------------------------------------\n",
+      "whole_blood               9.96            0.17            Success   \n",
+      "\n",
+      "Fastest: whole_blood (9.96 seconds)\n",
+      "Slowest: whole_blood (9.96 seconds)\n",
+      "Speed ratio: 1.0x\n",
+      "Runtime for RASSF2: 9.96 seconds (0.17 minutes)\n",
+      "\n",
+      "====================================================================================================\n",
+      "PROCESSING GENE 2/2: CYTIP\n",
+      "====================================================================================================\n",
+      "\n",
+      "[1/1] Starting processing for CYTIP in whole_blood...\n",
+      "\n",
+      "============================================================\n",
+      "Processing tissue: whole_blood\n",
+      "File: gtex_v8_data_whole_blood.pkl\n",
+      "Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+      "============================================================\n",
+      "Loading expression data...\n",
+      "Expression data shape: (56200, 755)\n",
+      "Gene ID for CYTIP: ENSG00000115165.9\n",
+      "Number of samples: 755\n",
+      "Common samples: 755\n",
+      "Computing CCC between CYTIP expression and all metadata columns...\n",
+      "Using 100000 permutations and 4 jobs\n",
+      "Processing 66 metadata columns...\n",
+      "Processing column 1/66: SUBJID\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:28,086 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.000000, p-value: 1.00e+00\n",
+      "[2025-09-25 13:05:28,086 - tissue_CYTIP_whole_blood] INFO: Processing column 2/66: SEX\n",
+      "[2025-09-25 13:05:28,156 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.001409, p-value: 3.98e-01\n",
+      "[2025-09-25 13:05:28,157 - tissue_CYTIP_whole_blood] INFO: Processing column 3/66: AGE\n",
+      "[2025-09-25 13:05:28,228 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.018997, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:28,228 - tissue_CYTIP_whole_blood] INFO: Processing column 4/66: DTHHRDY\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.000000, p-value: 1.00e+00\n",
+      "Processing column 2/66: SEX\n",
+      "  CCC: 0.001409, p-value: 3.98e-01\n",
+      "Processing column 3/66: AGE\n",
+      "  CCC: 0.018997, p-value: 1.00e-05\n",
+      "Processing column 4/66: DTHHRDY\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:28,481 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.184226, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Processing column 5/66: SMATSSCR\n",
+      "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO:   Skipping SMATSSCR: all values are NaN\n",
+      "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Processing column 6/66: SMCENTER\n",
+      "[2025-09-25 13:05:28,551 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.084684, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO: Processing column 7/66: SMPTHNTS\n",
+      "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO:   Skipping SMPTHNTS: all values are NaN\n",
+      "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO: Processing column 8/66: SMRIN\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.184226, p-value: 1.00e-05\n",
+      "Processing column 5/66: SMATSSCR\n",
+      "  Skipping SMATSSCR: all values are NaN\n",
+      "Processing column 6/66: SMCENTER\n",
+      "  CCC: 0.084684, p-value: 1.00e-05\n",
+      "Processing column 7/66: SMPTHNTS\n",
+      "  Skipping SMPTHNTS: all values are NaN\n",
+      "Processing column 8/66: SMRIN\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:28,806 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.003196, p-value: 5.68e-01\n",
+      "[2025-09-25 13:05:28,806 - tissue_CYTIP_whole_blood] INFO: Processing column 9/66: SMTS\n",
+      "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO:   Skipping SMTS: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO: Processing column 10/66: SMTSD\n",
+      "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO:   Skipping SMTSD: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Processing column 11/66: SMUBRID\n",
+      "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO:   Skipping SMUBRID: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Processing column 12/66: SMTSISCH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.003196, p-value: 5.68e-01\n",
+      "Processing column 9/66: SMTS\n",
+      "  Skipping SMTS: only 1 unique value(s)\n",
+      "Processing column 10/66: SMTSD\n",
+      "  Skipping SMTSD: only 1 unique value(s)\n",
+      "Processing column 11/66: SMUBRID\n",
+      "  Skipping SMUBRID: only 1 unique value(s)\n",
+      "Processing column 12/66: SMTSISCH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:29,062 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.215092, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:29,062 - tissue_CYTIP_whole_blood] INFO: Processing column 13/66: SMTSPAX\n",
+      "[2025-09-25 13:05:29,063 - tissue_CYTIP_whole_blood] INFO:   Skipping SMTSPAX: all values are NaN\n",
+      "[2025-09-25 13:05:29,063 - tissue_CYTIP_whole_blood] INFO: Processing column 14/66: SMNABTCH\n",
+      "[2025-09-25 13:05:29,128 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.000304, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:29,128 - tissue_CYTIP_whole_blood] INFO: Processing column 15/66: SMNABTCHT\n",
+      "[2025-09-25 13:05:29,129 - tissue_CYTIP_whole_blood] INFO:   Skipping SMNABTCHT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:29,129 - tissue_CYTIP_whole_blood] INFO: Processing column 16/66: SMNABTCHD\n",
+      "[2025-09-25 13:05:29,197 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.000256, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:29,197 - tissue_CYTIP_whole_blood] INFO: Processing column 17/66: SMGEBTCH\n",
+      "[2025-09-25 13:05:29,258 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.001533, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:29,258 - tissue_CYTIP_whole_blood] INFO: Processing column 18/66: SMGEBTCHD\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.215092, p-value: 1.00e-05\n",
+      "Processing column 13/66: SMTSPAX\n",
+      "  Skipping SMTSPAX: all values are NaN\n",
+      "Processing column 14/66: SMNABTCH\n",
+      "  CCC: 0.000304, p-value: 1.00e-05\n",
+      "Processing column 15/66: SMNABTCHT\n",
+      "  Skipping SMNABTCHT: only 1 unique value(s)\n",
+      "Processing column 16/66: SMNABTCHD\n",
+      "  CCC: 0.000256, p-value: 1.00e-05\n",
+      "Processing column 17/66: SMGEBTCH\n",
+      "  CCC: 0.001533, p-value: 1.00e-05\n",
+      "Processing column 18/66: SMGEBTCHD\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:29,317 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.002104, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:29,318 - tissue_CYTIP_whole_blood] INFO: Processing column 19/66: SMGEBTCHT\n",
+      "[2025-09-25 13:05:29,318 - tissue_CYTIP_whole_blood] INFO:   Skipping SMGEBTCHT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Processing column 20/66: SMAFRZE\n",
+      "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO:   Skipping SMAFRZE: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Processing column 21/66: SMGTC\n",
+      "[2025-09-25 13:05:29,320 - tissue_CYTIP_whole_blood] INFO:   Skipping SMGTC: all values are NaN\n",
+      "[2025-09-25 13:05:29,320 - tissue_CYTIP_whole_blood] INFO: Processing column 22/66: SME2MPRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.002104, p-value: 1.00e-05\n",
+      "Processing column 19/66: SMGEBTCHT\n",
+      "  Skipping SMGEBTCHT: only 1 unique value(s)\n",
+      "Processing column 20/66: SMAFRZE\n",
+      "  Skipping SMAFRZE: only 1 unique value(s)\n",
+      "Processing column 21/66: SMGTC\n",
+      "  Skipping SMGTC: all values are NaN\n",
+      "Processing column 22/66: SME2MPRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:29,573 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.021744, p-value: 5.00e-05\n",
+      "[2025-09-25 13:05:29,574 - tissue_CYTIP_whole_blood] INFO: Processing column 23/66: SMCHMPRS\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.021744, p-value: 5.00e-05\n",
+      "Processing column 23/66: SMCHMPRS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:29,828 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.015946, p-value: 4.50e-04\n",
+      "[2025-09-25 13:05:29,828 - tissue_CYTIP_whole_blood] INFO: Processing column 24/66: SMNTRART\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.015946, p-value: 4.50e-04\n",
+      "Processing column 24/66: SMNTRART\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:30,082 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.024407, p-value: 3.00e-05\n",
+      "[2025-09-25 13:05:30,083 - tissue_CYTIP_whole_blood] INFO: Processing column 25/66: SMNUMGPS\n",
+      "[2025-09-25 13:05:30,083 - tissue_CYTIP_whole_blood] INFO:   Skipping SMNUMGPS: all values are NaN\n",
+      "[2025-09-25 13:05:30,084 - tissue_CYTIP_whole_blood] INFO: Processing column 26/66: SMMAPRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.024407, p-value: 3.00e-05\n",
+      "Processing column 25/66: SMNUMGPS\n",
+      "  Skipping SMNUMGPS: all values are NaN\n",
+      "Processing column 26/66: SMMAPRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:30,338 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.021052, p-value: 7.00e-05\n",
+      "[2025-09-25 13:05:30,339 - tissue_CYTIP_whole_blood] INFO: Processing column 27/66: SMEXNCRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.021052, p-value: 7.00e-05\n",
+      "Processing column 27/66: SMEXNCRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.126241, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO: Processing column 28/66: SM550NRM\n",
+      "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO:   Skipping SM550NRM: all values are NaN\n",
+      "[2025-09-25 13:05:30,594 - tissue_CYTIP_whole_blood] INFO: Processing column 29/66: SMGNSDTC\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.126241, p-value: 1.00e-05\n",
+      "Processing column 28/66: SM550NRM\n",
+      "  Skipping SM550NRM: all values are NaN\n",
+      "Processing column 29/66: SMGNSDTC\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:30,847 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.050841, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:30,847 - tissue_CYTIP_whole_blood] INFO: Processing column 30/66: SMUNMPRT\n",
+      "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO:   Skipping SMUNMPRT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO: Processing column 31/66: SM350NRM\n",
+      "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO:   Skipping SM350NRM: all values are NaN\n",
+      "[2025-09-25 13:05:30,849 - tissue_CYTIP_whole_blood] INFO: Processing column 32/66: SMRDLGTH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.050841, p-value: 1.00e-05\n",
+      "Processing column 30/66: SMUNMPRT\n",
+      "  Skipping SMUNMPRT: only 1 unique value(s)\n",
+      "Processing column 31/66: SM350NRM\n",
+      "  Skipping SM350NRM: all values are NaN\n",
+      "Processing column 32/66: SMRDLGTH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:31,103 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.000003, p-value: 9.42e-01\n",
+      "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Processing column 33/66: SMMNCPB\n",
+      "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO:   Skipping SMMNCPB: all values are NaN\n",
+      "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Processing column 34/66: SME1MMRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.000003, p-value: 9.42e-01\n",
+      "Processing column 33/66: SMMNCPB\n",
+      "  Skipping SMMNCPB: all values are NaN\n",
+      "Processing column 34/66: SME1MMRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:31,359 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.005089, p-value: 2.16e-01\n",
+      "[2025-09-25 13:05:31,359 - tissue_CYTIP_whole_blood] INFO: Processing column 35/66: SMSFLGTH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.005089, p-value: 2.16e-01\n",
+      "Processing column 35/66: SMSFLGTH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:31,614 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.015322, p-value: 7.10e-04\n",
+      "[2025-09-25 13:05:31,615 - tissue_CYTIP_whole_blood] INFO: Processing column 36/66: SMESTLBS\n",
+      "[2025-09-25 13:05:31,615 - tissue_CYTIP_whole_blood] INFO:   Skipping SMESTLBS: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:31,616 - tissue_CYTIP_whole_blood] INFO: Processing column 37/66: SMMPPD\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.015322, p-value: 7.10e-04\n",
+      "Processing column 36/66: SMESTLBS\n",
+      "  Skipping SMESTLBS: only 1 unique value(s)\n",
+      "Processing column 37/66: SMMPPD\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:31,870 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.007147, p-value: 5.32e-02\n",
+      "[2025-09-25 13:05:31,870 - tissue_CYTIP_whole_blood] INFO: Processing column 38/66: SMNTERRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007147, p-value: 5.32e-02\n",
+      "Processing column 38/66: SMNTERRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:32,125 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.023433, p-value: 4.00e-05\n",
+      "[2025-09-25 13:05:32,126 - tissue_CYTIP_whole_blood] INFO: Processing column 39/66: SMRRNANM\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.023433, p-value: 4.00e-05\n",
+      "Processing column 39/66: SMRRNANM\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:32,379 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.005677, p-value: 1.45e-01\n",
+      "[2025-09-25 13:05:32,379 - tissue_CYTIP_whole_blood] INFO: Processing column 40/66: SMRDTTL\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.005677, p-value: 1.45e-01\n",
+      "Processing column 40/66: SMRDTTL\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:32,632 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.008033, p-value: 2.92e-02\n",
+      "[2025-09-25 13:05:32,633 - tissue_CYTIP_whole_blood] INFO: Processing column 41/66: SMVQCFL\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.008033, p-value: 2.92e-02\n",
+      "Processing column 41/66: SMVQCFL\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:32,887 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.003136, p-value: 6.00e-01\n",
+      "[2025-09-25 13:05:32,888 - tissue_CYTIP_whole_blood] INFO: Processing column 42/66: SMMNCV\n",
+      "[2025-09-25 13:05:32,889 - tissue_CYTIP_whole_blood] INFO:   Skipping SMMNCV: all values are NaN\n",
+      "[2025-09-25 13:05:32,889 - tissue_CYTIP_whole_blood] INFO: Processing column 43/66: SMTRSCPT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.003136, p-value: 6.00e-01\n",
+      "Processing column 42/66: SMMNCV\n",
+      "  Skipping SMMNCV: all values are NaN\n",
+      "Processing column 43/66: SMTRSCPT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:33,142 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.051533, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:33,143 - tissue_CYTIP_whole_blood] INFO: Processing column 44/66: SMMPPDPR\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.051533, p-value: 1.00e-05\n",
+      "Processing column 44/66: SMMPPDPR\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.005880, p-value: 1.29e-01\n",
+      "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO: Processing column 45/66: SMCGLGTH\n",
+      "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO:   Skipping SMCGLGTH: all values are NaN\n",
+      "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Processing column 46/66: SMGAPPCT\n",
+      "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO:   Skipping SMGAPPCT: all values are NaN\n",
+      "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Processing column 47/66: SMUNPDRD\n",
+      "[2025-09-25 13:05:33,399 - tissue_CYTIP_whole_blood] INFO:   Skipping SMUNPDRD: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:33,399 - tissue_CYTIP_whole_blood] INFO: Processing column 48/66: SMNTRNRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.005880, p-value: 1.29e-01\n",
+      "Processing column 45/66: SMCGLGTH\n",
+      "  Skipping SMCGLGTH: all values are NaN\n",
+      "Processing column 46/66: SMGAPPCT\n",
+      "  Skipping SMGAPPCT: all values are NaN\n",
+      "Processing column 47/66: SMUNPDRD\n",
+      "  Skipping SMUNPDRD: only 1 unique value(s)\n",
+      "Processing column 48/66: SMNTRNRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:33,653 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.261762, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:33,653 - tissue_CYTIP_whole_blood] INFO: Processing column 49/66: SMMPUNRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.261762, p-value: 1.00e-05\n",
+      "Processing column 49/66: SMMPUNRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:33,907 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.021052, p-value: 7.00e-05\n",
+      "[2025-09-25 13:05:33,907 - tissue_CYTIP_whole_blood] INFO: Processing column 50/66: SMEXPEFF\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.021052, p-value: 7.00e-05\n",
+      "Processing column 50/66: SMEXPEFF\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:34,163 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.086945, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:34,163 - tissue_CYTIP_whole_blood] INFO: Processing column 51/66: SMMPPDUN\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.086945, p-value: 1.00e-05\n",
+      "Processing column 51/66: SMMPPDUN\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:34,419 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.007147, p-value: 5.32e-02\n",
+      "[2025-09-25 13:05:34,419 - tissue_CYTIP_whole_blood] INFO: Processing column 52/66: SME2MMRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007147, p-value: 5.32e-02\n",
+      "Processing column 52/66: SME2MMRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:34,674 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.004187, p-value: 3.68e-01\n",
+      "[2025-09-25 13:05:34,675 - tissue_CYTIP_whole_blood] INFO: Processing column 53/66: SME2ANTI\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.004187, p-value: 3.68e-01\n",
+      "Processing column 53/66: SME2ANTI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:34,929 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.007334, p-value: 4.67e-02\n",
+      "[2025-09-25 13:05:34,930 - tissue_CYTIP_whole_blood] INFO: Processing column 54/66: SMALTALG\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007334, p-value: 4.67e-02\n",
+      "Processing column 54/66: SMALTALG\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:35,186 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.038381, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:35,187 - tissue_CYTIP_whole_blood] INFO: Processing column 55/66: SME2SNSE\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.038381, p-value: 1.00e-05\n",
+      "Processing column 55/66: SME2SNSE\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:35,441 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.006734, p-value: 7.08e-02\n",
+      "[2025-09-25 13:05:35,442 - tissue_CYTIP_whole_blood] INFO: Processing column 56/66: SMMFLGTH\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.006734, p-value: 7.08e-02\n",
+      "Processing column 56/66: SMMFLGTH\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:35,696 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.010863, p-value: 4.63e-03\n",
+      "[2025-09-25 13:05:35,696 - tissue_CYTIP_whole_blood] INFO: Processing column 57/66: SME1ANTI\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.010863, p-value: 4.63e-03\n",
+      "Processing column 57/66: SME1ANTI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:35,950 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.007210, p-value: 5.04e-02\n",
+      "[2025-09-25 13:05:35,951 - tissue_CYTIP_whole_blood] INFO: Processing column 58/66: SMSPLTRD\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007210, p-value: 5.04e-02\n",
+      "Processing column 58/66: SMSPLTRD\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:36,208 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.030117, p-value: 1.00e-05\n",
+      "[2025-09-25 13:05:36,208 - tissue_CYTIP_whole_blood] INFO: Processing column 59/66: SMBSMMRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.030117, p-value: 1.00e-05\n",
+      "Processing column 59/66: SMBSMMRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:36,464 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.004293, p-value: 3.48e-01\n",
+      "[2025-09-25 13:05:36,465 - tissue_CYTIP_whole_blood] INFO: Processing column 60/66: SME1SNSE\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.004293, p-value: 3.48e-01\n",
+      "Processing column 60/66: SME1SNSE\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:36,719 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.007285, p-value: 4.87e-02\n",
+      "[2025-09-25 13:05:36,719 - tissue_CYTIP_whole_blood] INFO: Processing column 61/66: SME1PCTS\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007285, p-value: 4.87e-02\n",
+      "Processing column 61/66: SME1PCTS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:36,973 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.004663, p-value: 2.81e-01\n",
+      "[2025-09-25 13:05:36,973 - tissue_CYTIP_whole_blood] INFO: Processing column 62/66: SMRRNART\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.004663, p-value: 2.81e-01\n",
+      "Processing column 62/66: SMRRNART\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:37,228 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.013729, p-value: 1.10e-03\n",
+      "[2025-09-25 13:05:37,229 - tissue_CYTIP_whole_blood] INFO: Processing column 63/66: SME1MPRT\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.013729, p-value: 1.10e-03\n",
+      "Processing column 63/66: SME1MPRT\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:37,483 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.021952, p-value: 4.00e-05\n",
+      "[2025-09-25 13:05:37,483 - tissue_CYTIP_whole_blood] INFO: Processing column 64/66: SMNUM5CD\n",
+      "[2025-09-25 13:05:37,484 - tissue_CYTIP_whole_blood] INFO:   Skipping SMNUM5CD: all values are NaN\n",
+      "[2025-09-25 13:05:37,484 - tissue_CYTIP_whole_blood] INFO: Processing column 65/66: SMDPMPRT\n",
+      "[2025-09-25 13:05:37,485 - tissue_CYTIP_whole_blood] INFO:   Skipping SMDPMPRT: only 1 unique value(s)\n",
+      "[2025-09-25 13:05:37,485 - tissue_CYTIP_whole_blood] INFO: Processing column 66/66: SME2PCTS\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.021952, p-value: 4.00e-05\n",
+      "Processing column 64/66: SMNUM5CD\n",
+      "  Skipping SMNUM5CD: all values are NaN\n",
+      "Processing column 65/66: SMDPMPRT\n",
+      "  Skipping SMDPMPRT: only 1 unique value(s)\n",
+      "Processing column 66/66: SME2PCTS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2025-09-25 13:05:37,739 - tissue_CYTIP_whole_blood] INFO:   CCC: 0.007812, p-value: 3.38e-02\n",
+      "[2025-09-25 13:05:37,740 - tissue_CYTIP_whole_blood] INFO: \n",
+      "Completed processing whole_blood:\n",
+      "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO:   Total metadata columns: 66\n",
+      "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO:   Successful analyses: 44\n",
+      "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO:   Skipped/Failed: 22\n",
+      "[2025-09-25 13:05:37,765 - summary] INFO: Results for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood_correlation_results.pkl\n",
+      "[2025-09-25 13:05:37,765 - summary] INFO: Log file for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+      "[2025-09-25 13:05:37,766 - summary] INFO: Runtime for CYTIP in whole_blood: 9.93 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:37,767 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:37,767 - summary] INFO: COMBINED RESULTS SUMMARY\n",
+      "[2025-09-25 13:05:37,767 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:37,768 - summary] INFO: Gene Symbol: CYTIP\n",
+      "[2025-09-25 13:05:37,768 - summary] INFO: Gene ID: ENSG00000115165.9\n",
+      "[2025-09-25 13:05:37,768 - summary] INFO: Permutations: 100,000\n",
+      "[2025-09-25 13:05:37,768 - summary] INFO: Tissues processed: 1\n",
+      "[2025-09-25 13:05:37,769 - summary] INFO: Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.pkl\n",
+      "[2025-09-25 13:05:37,769 - summary] INFO: Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.csv\n",
+      "[2025-09-25 13:05:37,769 - summary] INFO: \n",
+      "Total successful analyses across all tissues: 44\n",
+      "[2025-09-25 13:05:37,769 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:37,770 - summary] INFO: TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+      "[2025-09-25 13:05:37,771 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:37,771 - summary] INFO: Tissue               Metadata Column           CCC Value    P-value      Significance   \n",
+      "[2025-09-25 13:05:37,771 - summary] INFO: ------------------------------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood          SMNTRNRT                    0.261762    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood          SMTSISCH                    0.215092    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood          DTHHRDY                     0.184226    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood          SMEXNCRT                    0.126241    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood          SMEXPEFF                    0.086945    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood          SMCENTER                    0.084684    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood          SMTRSCPT                    0.051533    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood          SMGNSDTC                    0.050841    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood          SMALTALG                    0.038381    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood          SMSPLTRD                    0.030117    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood          SMNTRART                    0.024407    3.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood          SMNTERRT                    0.023433    4.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood          SME1MPRT                    0.021952    4.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood          SME2MPRT                    0.021744    5.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood          SMMAPRT                     0.021052    7.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood          SMMPUNRT                    0.021052    7.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood          AGE                         0.018997    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood          SMCHMPRS                    0.015946    4.50e-04  ***            \n",
+      "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood          SMSFLGTH                    0.015322    7.10e-04  ***            \n",
+      "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood          SMRRNART                    0.013729    1.10e-03  **             \n",
+      "[2025-09-25 13:05:37,777 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:37,778 - summary] INFO: SUMMARY BY TISSUE\n",
+      "[2025-09-25 13:05:37,778 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:37,778 - summary] INFO: Tissue               N Samples  Successful   Mean |CCC|   Max |CCC|   \n",
+      "[2025-09-25 13:05:37,778 - summary] INFO: ----------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:37,779 - summary] INFO: whole_blood          755        44           0.032699     0.261762    \n",
+      "[2025-09-25 13:05:37,779 - summary] INFO: \n",
+      "================================================================================\n",
+      "[2025-09-25 13:05:37,780 - summary] INFO: RUNTIME SUMMARY\n",
+      "[2025-09-25 13:05:37,780 - summary] INFO: ================================================================================\n",
+      "[2025-09-25 13:05:37,780 - summary] INFO: Total runtime: 9.93 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:37,780 - summary] INFO: Average runtime per tissue: 9.93 seconds\n",
+      "[2025-09-25 13:05:37,780 - summary] INFO: \n",
+      "Runtime by tissue:\n",
+      "[2025-09-25 13:05:37,781 - summary] INFO: Tissue                    Runtime (sec)   Runtime (min)   Status    \n",
+      "[2025-09-25 13:05:37,781 - summary] INFO: ----------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:37,781 - summary] INFO: whole_blood               9.93            0.17            Success   \n",
+      "[2025-09-25 13:05:37,781 - summary] INFO: \n",
+      "Fastest: whole_blood (9.93 seconds)\n",
+      "[2025-09-25 13:05:37,781 - summary] INFO: Slowest: whole_blood (9.93 seconds)\n",
+      "[2025-09-25 13:05:37,781 - summary] INFO: Speed ratio: 1.0x\n",
+      "[2025-09-25 13:05:37,782 - summary] INFO: Runtime for CYTIP: 9.93 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:37,782 - summary] INFO: \n",
+      "====================================================================================================\n",
+      "[2025-09-25 13:05:37,782 - summary] INFO: OVERALL RESULTS SUMMARY\n",
+      "[2025-09-25 13:05:37,782 - summary] INFO: ====================================================================================================\n",
+      "[2025-09-25 13:05:37,783 - summary] INFO: Gene symbols processed: RASSF2, CYTIP\n",
+      "[2025-09-25 13:05:37,783 - summary] INFO: Total genes: 2\n",
+      "[2025-09-25 13:05:37,783 - summary] INFO: Permutations: 100,000\n",
+      "[2025-09-25 13:05:37,783 - summary] INFO: Tissues per gene: 1\n",
+      "[2025-09-25 13:05:37,784 - summary] INFO: All genes combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.pkl\n",
+      "[2025-09-25 13:05:37,785 - summary] INFO: All genes combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.csv\n",
+      "[2025-09-25 13:05:37,785 - summary] INFO: \n",
+      "Log files created:\n",
+      "[2025-09-25 13:05:37,785 - summary] INFO:   RASSF2 - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+      "[2025-09-25 13:05:37,786 - summary] INFO:   CYTIP - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+      "[2025-09-25 13:05:37,786 - summary] INFO: \n",
+      "Total successful analyses across all genes and tissues: 88\n",
+      "[2025-09-25 13:05:37,786 - summary] INFO: \n",
+      "====================================================================================================\n",
+      "[2025-09-25 13:05:37,787 - summary] INFO: TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)\n",
+      "[2025-09-25 13:05:37,787 - summary] INFO: ====================================================================================================\n",
+      "[2025-09-25 13:05:37,788 - summary] INFO: Gene         Tissue               Metadata Column           CCC Value    P-value      Significance   \n",
+      "[2025-09-25 13:05:37,788 - summary] INFO: --------------------------------------------------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:37,788 - summary] INFO: RASSF2       whole_blood          SMTSISCH                    0.528125    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,788 - summary] INFO: RASSF2       whole_blood          DTHHRDY                     0.464582    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,789 - summary] INFO: CYTIP        whole_blood          SMNTRNRT                    0.261762    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,789 - summary] INFO: RASSF2       whole_blood          SMNTERRT                    0.250997    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,789 - summary] INFO: RASSF2       whole_blood          SMNTRART                    0.243071    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,789 - summary] INFO: CYTIP        whole_blood          SMTSISCH                    0.215092    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,790 - summary] INFO: RASSF2       whole_blood          SMNTRNRT                    0.202936    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,790 - summary] INFO: CYTIP        whole_blood          DTHHRDY                     0.184226    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,790 - summary] INFO: RASSF2       whole_blood          SME1MPRT                    0.181940    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2       whole_blood          SMALTALG                    0.177009    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2       whole_blood          SME2MPRT                    0.172974    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2       whole_blood          SMMPUNRT                    0.168576    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2       whole_blood          SMMAPRT                     0.168576    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2       whole_blood          SMCHMPRS                    0.143365    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,792 - summary] INFO: CYTIP        whole_blood          SMEXNCRT                    0.126241    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,792 - summary] INFO: RASSF2       whole_blood          SMCENTER                    0.108148    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,792 - summary] INFO: CYTIP        whole_blood          SMEXPEFF                    0.086945    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,793 - summary] INFO: CYTIP        whole_blood          SMCENTER                    0.084684    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,793 - summary] INFO: RASSF2       whole_blood          SMEXPEFF                    0.059931    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,793 - summary] INFO: RASSF2       whole_blood          SMSPLTRD                    0.057786    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,793 - summary] INFO: CYTIP        whole_blood          SMTRSCPT                    0.051533    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,794 - summary] INFO: CYTIP        whole_blood          SMGNSDTC                    0.050841    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2       whole_blood          SMRIN                       0.048847    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2       whole_blood          SMRRNART                    0.048437    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2       whole_blood          SMSFLGTH                    0.047258    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2       whole_blood          SMGNSDTC                    0.043013    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2       whole_blood          SMTRSCPT                    0.042714    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2       whole_blood          SMEXNCRT                    0.040140    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2       whole_blood          AGE                         0.039824    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,796 - summary] INFO: CYTIP        whole_blood          SMALTALG                    0.038381    1.00e-05  ***            \n",
+      "[2025-09-25 13:05:37,796 - summary] INFO: \n",
+      "====================================================================================================\n",
+      "[2025-09-25 13:05:37,796 - summary] INFO: SUMMARY BY GENE\n",
+      "[2025-09-25 13:05:37,796 - summary] INFO: ====================================================================================================\n",
+      "[2025-09-25 13:05:37,797 - summary] INFO: \n",
+      "Gene: RASSF2 (ID: ENSG00000101265.15)\n",
+      "[2025-09-25 13:05:37,797 - summary] INFO:   Tissues processed: 1\n",
+      "[2025-09-25 13:05:37,797 - summary] INFO:   Successful analyses: 44\n",
+      "[2025-09-25 13:05:37,797 - summary] INFO:   Mean |CCC|: 0.079987\n",
+      "[2025-09-25 13:05:37,797 - summary] INFO:   Max |CCC|: 0.528125\n",
+      "[2025-09-25 13:05:37,798 - summary] INFO:   Top correlation: SMTSISCH in whole_blood (CCC: 0.528125, p: 1.00e-05)\n",
+      "[2025-09-25 13:05:37,798 - summary] INFO:   Runtime: 9.96 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:37,799 - summary] INFO: \n",
+      "Gene: CYTIP (ID: ENSG00000115165.9)\n",
+      "[2025-09-25 13:05:37,799 - summary] INFO:   Tissues processed: 1\n",
+      "[2025-09-25 13:05:37,799 - summary] INFO:   Successful analyses: 44\n",
+      "[2025-09-25 13:05:37,800 - summary] INFO:   Mean |CCC|: 0.032699\n",
+      "[2025-09-25 13:05:37,800 - summary] INFO:   Max |CCC|: 0.261762\n",
+      "[2025-09-25 13:05:37,800 - summary] INFO:   Top correlation: SMNTRNRT in whole_blood (CCC: 0.261762, p: 1.00e-05)\n",
+      "[2025-09-25 13:05:37,801 - summary] INFO:   Runtime: 9.93 seconds (0.17 minutes)\n",
+      "[2025-09-25 13:05:37,801 - summary] INFO: \n",
+      "====================================================================================================\n",
+      "[2025-09-25 13:05:37,801 - summary] INFO: SUMMARY BY TISSUE (across all genes)\n",
+      "[2025-09-25 13:05:37,801 - summary] INFO: ====================================================================================================\n",
+      "[2025-09-25 13:05:37,801 - summary] INFO: Tissue                    N Genes    Successful   Mean |CCC|   Max |CCC|   \n",
+      "[2025-09-25 13:05:37,801 - summary] INFO: ---------------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:37,802 - summary] INFO: whole_blood               2          88           0.056343     0.528125    \n",
+      "[2025-09-25 13:05:37,802 - summary] INFO: \n",
+      "====================================================================================================\n",
+      "[2025-09-25 13:05:37,803 - summary] INFO: RUNTIME SUMMARY\n",
+      "[2025-09-25 13:05:37,803 - summary] INFO: ====================================================================================================\n",
+      "[2025-09-25 13:05:37,803 - summary] INFO: Total runtime: 19.92 seconds (0.33 minutes)\n",
+      "[2025-09-25 13:05:37,803 - summary] INFO: Average runtime per gene: 9.96 seconds\n",
+      "[2025-09-25 13:05:37,803 - summary] INFO: Total gene-tissue combinations: 2\n",
+      "[2025-09-25 13:05:37,803 - summary] INFO: \n",
+      "Runtime by gene:\n",
+      "[2025-09-25 13:05:37,803 - summary] INFO: Gene            Runtime (sec)   Runtime (min)   Tissues    Successful  \n",
+      "[2025-09-25 13:05:37,804 - summary] INFO: ---------------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:37,804 - summary] INFO: RASSF2          9.96            0.17            1          1           \n",
+      "[2025-09-25 13:05:37,804 - summary] INFO: CYTIP           9.93            0.17            1          1           \n",
+      "[2025-09-25 13:05:37,804 - summary] INFO: \n",
+      "Average runtime by tissue (across all genes):\n",
+      "[2025-09-25 13:05:37,805 - summary] INFO: Tissue                    Avg Runtime (sec)  Avg Runtime (min)  N Runs   Min        Max       \n",
+      "[2025-09-25 13:05:37,805 - summary] INFO: -----------------------------------------------------------------------------------------------\n",
+      "[2025-09-25 13:05:37,805 - summary] INFO: whole_blood               9.94               0.17               2        9.93       9.96      \n",
+      "[2025-09-25 13:05:37,805 - summary] INFO: \n",
+      "Fastest tissue (avg): whole_blood (9.94 seconds)\n",
+      "[2025-09-25 13:05:37,805 - summary] INFO: Slowest tissue (avg): whole_blood (9.94 seconds)\n",
+      "[2025-09-25 13:05:37,806 - summary] INFO: Speed ratio: 1.0x\n",
+      "[2025-09-25 13:05:37,806 - summary] INFO: \n",
+      "Summary log saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+      "[2025-09-25 13:05:37,806 - summary] INFO: Summary tables saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  CCC: 0.007812, p-value: 3.38e-02\n",
+      "\n",
+      "Completed processing whole_blood:\n",
+      "  Total metadata columns: 66\n",
+      "  Successful analyses: 44\n",
+      "  Skipped/Failed: 22\n",
+      "Results for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood_correlation_results.pkl\n",
+      "Log file for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+      "Runtime for CYTIP in whole_blood: 9.93 seconds (0.17 minutes)\n",
+      "\n",
+      "================================================================================\n",
+      "COMBINED RESULTS SUMMARY\n",
+      "================================================================================\n",
+      "Gene Symbol: CYTIP\n",
+      "Gene ID: ENSG00000115165.9\n",
+      "Permutations: 100,000\n",
+      "Tissues processed: 1\n",
+      "Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.pkl\n",
+      "Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.csv\n",
+      "\n",
+      "Total successful analyses across all tissues: 44\n",
+      "\n",
+      "================================================================================\n",
+      "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+      "================================================================================\n",
+      "Tissue               Metadata Column           CCC Value    P-value      Significance   \n",
+      "------------------------------------------------------------------------------------------\n",
+      "whole_blood          SMNTRNRT                    0.261762    1.00e-05  ***            \n",
+      "whole_blood          SMTSISCH                    0.215092    1.00e-05  ***            \n",
+      "whole_blood          DTHHRDY                     0.184226    1.00e-05  ***            \n",
+      "whole_blood          SMEXNCRT                    0.126241    1.00e-05  ***            \n",
+      "whole_blood          SMEXPEFF                    0.086945    1.00e-05  ***            \n",
+      "whole_blood          SMCENTER                    0.084684    1.00e-05  ***            \n",
+      "whole_blood          SMTRSCPT                    0.051533    1.00e-05  ***            \n",
+      "whole_blood          SMGNSDTC                    0.050841    1.00e-05  ***            \n",
+      "whole_blood          SMALTALG                    0.038381    1.00e-05  ***            \n",
+      "whole_blood          SMSPLTRD                    0.030117    1.00e-05  ***            \n",
+      "whole_blood          SMNTRART                    0.024407    3.00e-05  ***            \n",
+      "whole_blood          SMNTERRT                    0.023433    4.00e-05  ***            \n",
+      "whole_blood          SME1MPRT                    0.021952    4.00e-05  ***            \n",
+      "whole_blood          SME2MPRT                    0.021744    5.00e-05  ***            \n",
+      "whole_blood          SMMAPRT                     0.021052    7.00e-05  ***            \n",
+      "whole_blood          SMMPUNRT                    0.021052    7.00e-05  ***            \n",
+      "whole_blood          AGE                         0.018997    1.00e-05  ***            \n",
+      "whole_blood          SMCHMPRS                    0.015946    4.50e-04  ***            \n",
+      "whole_blood          SMSFLGTH                    0.015322    7.10e-04  ***            \n",
+      "whole_blood          SMRRNART                    0.013729    1.10e-03  **             \n",
+      "\n",
+      "================================================================================\n",
+      "SUMMARY BY TISSUE\n",
+      "================================================================================\n",
+      "Tissue               N Samples  Successful   Mean |CCC|   Max |CCC|   \n",
+      "----------------------------------------------------------------------\n",
+      "whole_blood          755        44           0.032699     0.261762    \n",
+      "\n",
+      "================================================================================\n",
+      "RUNTIME SUMMARY\n",
+      "================================================================================\n",
+      "Total runtime: 9.93 seconds (0.17 minutes)\n",
+      "Average runtime per tissue: 9.93 seconds\n",
+      "\n",
+      "Runtime by tissue:\n",
+      "Tissue                    Runtime (sec)   Runtime (min)   Status    \n",
+      "----------------------------------------------------------------------\n",
+      "whole_blood               9.93            0.17            Success   \n",
+      "\n",
+      "Fastest: whole_blood (9.93 seconds)\n",
+      "Slowest: whole_blood (9.93 seconds)\n",
+      "Speed ratio: 1.0x\n",
+      "Runtime for CYTIP: 9.93 seconds (0.17 minutes)\n",
+      "\n",
+      "====================================================================================================\n",
+      "OVERALL RESULTS SUMMARY\n",
+      "====================================================================================================\n",
+      "Gene symbols processed: RASSF2, CYTIP\n",
+      "Total genes: 2\n",
+      "Permutations: 100,000\n",
+      "Tissues per gene: 1\n",
+      "All genes combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.pkl\n",
+      "All genes combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.csv\n",
+      "\n",
+      "Log files created:\n",
+      "  RASSF2 - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+      "  CYTIP - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+      "\n",
+      "Total successful analyses across all genes and tissues: 88\n",
+      "\n",
+      "====================================================================================================\n",
+      "TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)\n",
+      "====================================================================================================\n",
+      "Gene         Tissue               Metadata Column           CCC Value    P-value      Significance   \n",
+      "--------------------------------------------------------------------------------------------------------------\n",
+      "RASSF2       whole_blood          SMTSISCH                    0.528125    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          DTHHRDY                     0.464582    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMNTRNRT                    0.261762    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMNTERRT                    0.250997    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMNTRART                    0.243071    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMTSISCH                    0.215092    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMNTRNRT                    0.202936    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          DTHHRDY                     0.184226    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SME1MPRT                    0.181940    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMALTALG                    0.177009    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SME2MPRT                    0.172974    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMMPUNRT                    0.168576    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMMAPRT                     0.168576    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMCHMPRS                    0.143365    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMEXNCRT                    0.126241    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMCENTER                    0.108148    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMEXPEFF                    0.086945    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMCENTER                    0.084684    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMEXPEFF                    0.059931    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMSPLTRD                    0.057786    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMTRSCPT                    0.051533    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMGNSDTC                    0.050841    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMRIN                       0.048847    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMRRNART                    0.048437    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMSFLGTH                    0.047258    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMGNSDTC                    0.043013    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMTRSCPT                    0.042714    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          SMEXNCRT                    0.040140    1.00e-05  ***            \n",
+      "RASSF2       whole_blood          AGE                         0.039824    1.00e-05  ***            \n",
+      "CYTIP        whole_blood          SMALTALG                    0.038381    1.00e-05  ***            \n",
+      "\n",
+      "====================================================================================================\n",
+      "SUMMARY BY GENE\n",
+      "====================================================================================================\n",
+      "\n",
+      "Gene: RASSF2 (ID: ENSG00000101265.15)\n",
+      "  Tissues processed: 1\n",
+      "  Successful analyses: 44\n",
+      "  Mean |CCC|: 0.079987\n",
+      "  Max |CCC|: 0.528125\n",
+      "  Top correlation: SMTSISCH in whole_blood (CCC: 0.528125, p: 1.00e-05)\n",
+      "  Runtime: 9.96 seconds (0.17 minutes)\n",
+      "\n",
+      "Gene: CYTIP (ID: ENSG00000115165.9)\n",
+      "  Tissues processed: 1\n",
+      "  Successful analyses: 44\n",
+      "  Mean |CCC|: 0.032699\n",
+      "  Max |CCC|: 0.261762\n",
+      "  Top correlation: SMNTRNRT in whole_blood (CCC: 0.261762, p: 1.00e-05)\n",
+      "  Runtime: 9.93 seconds (0.17 minutes)\n",
+      "\n",
+      "====================================================================================================\n",
+      "SUMMARY BY TISSUE (across all genes)\n",
+      "====================================================================================================\n",
+      "Tissue                    N Genes    Successful   Mean |CCC|   Max |CCC|   \n",
+      "---------------------------------------------------------------------------\n",
+      "whole_blood               2          88           0.056343     0.528125    \n",
+      "\n",
+      "====================================================================================================\n",
+      "RUNTIME SUMMARY\n",
+      "====================================================================================================\n",
+      "Total runtime: 19.92 seconds (0.33 minutes)\n",
+      "Average runtime per gene: 9.96 seconds\n",
+      "Total gene-tissue combinations: 2\n",
+      "\n",
+      "Runtime by gene:\n",
+      "Gene            Runtime (sec)   Runtime (min)   Tissues    Successful  \n",
+      "---------------------------------------------------------------------------\n",
+      "RASSF2          9.96            0.17            1          1           \n",
+      "CYTIP           9.93            0.17            1          1           \n",
+      "\n",
+      "Average runtime by tissue (across all genes):\n",
+      "Tissue                    Avg Runtime (sec)  Avg Runtime (min)  N Runs   Min        Max       \n",
+      "-----------------------------------------------------------------------------------------------\n",
+      "whole_blood               9.94               0.17               2        9.93       9.96      \n",
+      "\n",
+      "Fastest tissue (avg): whole_blood (9.94 seconds)\n",
+      "Slowest tissue (avg): whole_blood (9.94 seconds)\n",
+      "Speed ratio: 1.0x\n",
+      "\n",
+      "Summary log saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+      "Summary tables saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n"
+     ]
+    }
+   ],
+   "source": [
+    "%run  ./nbs/common/metadata_corr_cli.py RASSF2 CYTIP --include whole_blood --expr-data-dir {TISSUE_DATA_DIR} --data-dir {ANALYSIS_DIR} --output-dir {METADATA_CORRELATIONS_RESULT_DIR}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['RASSF2_all_tissues_correlation_results.csv',\n",
+       " 'CYTIP_whole_blood.log',\n",
+       " 'RASSF2_whole_blood_correlation_results.pkl',\n",
+       " 'RASSF2_all_tissues_correlation_results.pkl',\n",
+       " 'RASSF2_whole_blood.log',\n",
+       " 'CYTIP_all_tissues_correlation_results.pkl',\n",
+       " 'CYTIP_whole_blood_correlation_results.pkl',\n",
+       " '_all_genes_all_tissues_correlation_results.csv',\n",
+       " '_RASSF2_CYTIP_summary_tables.log',\n",
+       " '_RASSF2_CYTIP_summary_execution.log',\n",
+       " 'CYTIP_all_tissues_correlation_results.csv',\n",
+       " '_all_genes_all_tissues_correlation_results.pkl']"
+      ]
+     },
+     "execution_count": 105,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# You can find the results in the `METADATA_CORRELATIONS_RESULT_DIR` directory\n",
+    "os.listdir(METADATA_CORRELATIONS_RESULT_DIR)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/nbs/common/README.md b/nbs/common/README.md
new file mode 100644
index 00000000..ea9f6262
--- /dev/null
+++ b/nbs/common/README.md
@@ -0,0 +1,527 @@
+# Common Analysis Tools
+
+This directory contains command-line tools for gene expression analysis using the CCC-GPU package.
+
+## Available Tools
+
+1. **[Single Gene Pair Correlation Analysis](#single-gene-pair-correlation-analysis)** (`compute_single_gene_pair_correlations_cli.py`)
+2. **[Gene Expression-Metadata Correlation Analysis](#gene-expression-metadata-correlation-analysis)** (`metadata_corr_cli.py`)
+
+---
+
+# Single Gene Pair Correlation Analysis
+
+A command-line tool for exploring gene expression data and computing correlations between specific gene pairs using CCC (Clustermatch Correlation Coefficient), Spearman, and Pearson correlation methods.
+
+## Features
+
+- **Data Exploration**: Browse available tissues and genes with their symbols
+- **Gene Pair Correlation**: Compute three correlation coefficients (CCC, Pearson, Spearman) for any gene pair
+- **Flexible Gene Input**: Accept both gene symbols (e.g., TP53) and Ensembl IDs (e.g., ENSG00000141510.16)
+- **Tissue-Specific Analysis**: Analyze correlations within specific tissue contexts
+- **Robust Gene Resolution**: Handle version numbers and case-insensitive matching
+- **Comprehensive Error Handling**: Clear error messages and debugging support
+
+## Installation Requirements
+
+```bash
+# Required packages
+pip install pandas numpy
+# CCC-GPU package (install from source as per project instructions)
+```
+
+## Quick Start
+
+### 1. Explore Available Data
+
+```bash
+# List all available tissues
+python compute_single_gene_pair_correlations_cli.py --list-tissues
+
+# Show genes available in whole blood tissue
+python compute_single_gene_pair_correlations_cli.py --show-genes whole_blood
+
+# Show more genes (default is 20)
+python compute_single_gene_pair_correlations_cli.py --show-genes liver --n-genes 50
+```
+
+### 2. Compute Gene Pair Correlations
+
+```bash
+# Basic correlation analysis between TP53 and BRCA1 in whole blood
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood
+
+# Use Ensembl IDs instead of symbols
+python compute_single_gene_pair_correlations_cli.py ENSG00000141510.16 ENSG00000012048.20 --tissue liver
+
+# Mixed input (symbol and Ensembl ID)
+python compute_single_gene_pair_correlations_cli.py TP53 ENSG00000012048.20 --tissue brain_cortex
+```
+
+### 3. Save Results and Logs
+
+```bash
+# Save results and logs to a specific directory
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood \
+    --output-dir ./results
+
+# Combine with debug logging for detailed output
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \
+    --output-dir ./detailed_analysis --debug
+```
+
+### 4. Custom Data Paths
+
+```bash
+# Use custom data directory and gene mapping file
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood \
+    --data-dir /custom/path/to/tissue/data \
+    --gene-mapping /custom/path/to/gene_mappings.pkl \
+    --output-dir ./custom_results
+```
+
+## Command Line Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
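+| `genes` | str+ | Required | Two gene symbols or Ensembl IDs for correlation analysis (not needed with `--list-tissues` or `--show-genes`) |
+| `--tissue` | str | Required | Tissue name for correlation analysis (not needed with `--list-tissues` or `--show-genes`) |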
+| `--data-dir` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue` | Directory containing tissue expression data |
+| `--gene-mapping` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl` | Gene mapping file path |
+| `--output-dir` | str | None | Directory to save output files and logs (optional) |
+| `--list-tissues` | flag | False | List all available tissues and exit |
+| `--show-genes` | str | None | Show genes for specified tissue and exit |
+| `--n-genes` | int | 20 | Number of genes to display |
+| `--debug` | flag | False | Enable debug logging |
+
+## Output Format
+
+### Tissue and Gene Discovery
+```
+=== Available Tissues (49) ===
+ 1. adipose_subcutaneous
+ 2. adipose_visceral_omentum
+ 3. adrenal_gland
+ ...
+
+=== Tissue: whole_blood ===
+Total genes: 56,200
+Total samples: 755
+
+First 20 genes:
+------------------------------------------------------------
+#    Gene Symbol     Ensembl ID          
+------------------------------------------------------------
+1    DDX11L1         ENSG00000223972.5   
+2    WASH7P          ENSG00000227232.5   
+3    MIR6859-1       ENSG00000278267.1   
+...
+```
+
+### Correlation Results
+```
+============================================================
+GENE PAIR CORRELATION RESULTS
+============================================================
+Gene 1: TP53 (ENSG00000141510.16)
+Gene 2: BRCA1 (ENSG00000012048.20)
+Tissue: whole_blood
+Samples: 755
+------------------------------------------------------------
+         CCC: 0.123456
+     PEARSON: 0.234567
+    SPEARMAN: 0.345678
+============================================================
+Results saved to:
+  JSON: TP53_BRCA1_whole_blood_20240925_143022_correlation_results.json
+  Pickle: TP53_BRCA1_whole_blood_20240925_143022_correlation_results.pkl
+Log file: gene_pair_correlation_analysis_20240925_143022.log
+```
+
+### Output Files (when --output-dir is used)
+
+1. **JSON Results File**: `{gene1}_{gene2}_{tissue}_{timestamp}_correlation_results.json`
+   - Human-readable format with all correlation results
+   - Can be easily imported into other tools or scripts
+
+2. **Pickle Results File**: `{gene1}_{gene2}_{tissue}_{timestamp}_correlation_results.pkl`
+   - Python-specific format preserving exact data types
+   - Optimal for downstream analysis in Python
+
+3. **Log File**: `gene_pair_correlation_analysis_{timestamp}.log`
+   - Detailed processing information and debug messages
+   - Useful for troubleshooting and audit trails
+
+Example JSON output:
+```json
+{
+  "gene1_symbol": "TP53",
+  "gene1_ensembl_id": "ENSG00000141510.16",
+  "gene2_symbol": "BRCA1", 
+  "gene2_ensembl_id": "ENSG00000012048.20",
+  "tissue": "whole_blood",
+  "n_samples": 755,
+  "ccc": 0.123456,
+  "pearson": 0.234567,
+  "spearman": 0.345678
+}
+```
+
+## Input Data Format
+
+### Tissue Expression Files
+- **Format**: Pickle (.pkl) files
+- **Naming**: `gtex_v8_data_{tissue_name}.pkl`
+- **Structure**: DataFrame with Ensembl gene IDs as index, sample IDs as columns
+- **Content**: Log2-transformed gene expression values
+
+### Gene Mapping File
+- **Format**: Pickle (.pkl) file
+- **Structure**: DataFrame with columns `gene_ens_id` and `gene_symbol`
+- **Content**: Mapping between Ensembl gene IDs and HUGO gene symbols
+
+## Statistical Methods
+
+### Correlation Coefficients
+
+1. **CCC (Clustermatch Correlation Coefficient)**
+   - GPU-accelerated implementation
+   - Robust to outliers; captures non-linear as well as linear relationships
+   - Particularly suited for detecting complex correlation patterns
+
+2. **Pearson Correlation**
+   - Standard linear correlation coefficient
+   - Measures linear relationship strength
+
+3. **Spearman Correlation**
+   - Rank-based correlation coefficient
+   - Robust to outliers; captures monotonic (not only linear) relationships
+
+## Example Workflows
+
+### 1. Cancer Gene Analysis
+```bash
+# List the available brain tissues
+python compute_single_gene_pair_correlations_cli.py --list-tissues | grep brain
+
+# Analyze TP53 interactions in different brain regions with output saving
+python compute_single_gene_pair_correlations_cli.py TP53 MDM2 --tissue brain_cortex \
+    --output-dir ./cancer_gene_analysis --debug
+python compute_single_gene_pair_correlations_cli.py TP53 CDKN1A --tissue brain_hippocampus \
+    --output-dir ./cancer_gene_analysis --debug
+```
+
+### 2. Housekeeping Gene Analysis
+```bash
+# Compare expression correlation of housekeeping genes
+python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue whole_blood
+python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue liver
+python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue muscle_skeletal
+```
+
+### 3. Tissue-Specific Gene Discovery
+```bash
+# Search the first 100 genes listed for a tissue, then analyze a gene pair of interest
+python compute_single_gene_pair_correlations_cli.py --show-genes heart_left_ventricle --n-genes 100 | grep MYH
+python compute_single_gene_pair_correlations_cli.py MYH6 MYH7 --tissue heart_left_ventricle
+```
+
+## Error Handling
+
+The tool provides comprehensive error handling:
+
+- **Gene not found**: Suggestions to check spelling or use `--show-genes`
+- **Tissue not found**: List of available tissues
+- **Data issues**: Clear messages about insufficient samples or missing data
+- **Path issues**: Validation of data directory and gene mapping file
+
+## Performance Considerations
+
+- **Memory usage**: ~100-500MB depending on tissue size
+- **Computation time**: 1-5 seconds per gene pair
+- **CCC computation**: GPU-accelerated when available
+
+---
+
+# Gene Expression-Metadata Correlation Analysis
+
+## Overview
+
+This tool computes correlations between the expression levels of specific genes and all available metadata columns across multiple GTEx tissues. It uses the **Clustermatch Correlation Coefficient (CCC)**, which is particularly suited for detecting non-linear relationships and complex correlation patterns.
+
+### Key Features
+
+- **Multi-gene Analysis**: Process multiple genes simultaneously
+- **Cross-tissue Analysis**: Analyze correlations across all available GTEx tissues
+- **Comprehensive Metadata Coverage**: Correlate against all metadata columns automatically
+- **Statistical Significance**: Permutation-based p-value calculation with customizable iterations
+- **Flexible Tissue Filtering**: Include/exclude tissues using pattern matching
+- **Parallel Processing**: Multi-threaded computation support
+- **Detailed Logging**: Individual logs per gene-tissue combination plus comprehensive summaries
+- **Multiple Output Formats**: Results in both pickle (.pkl) and CSV formats
+- **Runtime Tracking**: Detailed performance monitoring and optimization insights
+
+## Requirements
+
+### Dependencies
+
+```text
+pandas
+numpy
+ccc  # Clustermatch Correlation Coefficient library
+```
+
+### Required Data Files
+
+The tool expects specific data files in predetermined locations:
+
+1. **Expression Data**: GTEx v8 expression files in the format `gtex_v8_data_{tissue_name}-var_pc_log2.pkl`
+2. **Metadata**: GTEx v8 sample metadata (`gtex_v8-sample_metadata.pkl`)
+3. **Gene Mappings**: Gene ID to symbol mappings (`gtex_gene_id_symbol_mappings.pkl`)
+
+## Installation
+
+```bash
+# Clone or download the script
+# Ensure all required Python packages are installed
+pip install pandas numpy ccc
+```
+
+## Usage
+
+### Basic Usage
+
+```bash
+# Analyze single gene across all tissues
+python metadata_corr_cli.py RASSF2
+
+# Analyze multiple genes
+python metadata_corr_cli.py RASSF2 TP53 BRCA1
+
+# Specify custom output directory
+python metadata_corr_cli.py RASSF2 --output-dir ./results
+```
+
+### Advanced Usage
+
+```bash
+# Include only specific tissues (pattern matching)
+python metadata_corr_cli.py RASSF2 --include brain liver
+
+# Exclude specific tissues
+python metadata_corr_cli.py RASSF2 --exclude cells brain
+
+# Custom permutation settings and parallel processing
+python metadata_corr_cli.py RASSF2 --permutations 500000 --n-jobs 16
+
+# Combined filtering and custom settings
+python metadata_corr_cli.py TP53 BRCA1 \
+    --include muscle heart \
+    --exclude cells \
+    --permutations 1000000 \
+    --n-jobs 32 \
+    --output-dir ./tp53_brca1_analysis
+```
+
+### Discovery Commands
+
+```bash
+# List all available tissues
+python metadata_corr_cli.py GENE --list-tissues
+
+# List all available metadata columns
+python metadata_corr_cli.py GENE --list-metadata-columns
+```
+
+## Command Line Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `gene_symbols` | str+ | Required | Gene symbol(s) to analyze (e.g., RASSF2 TP53) |
+| `--expr-data-dir` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue` | Directory containing expression data files |
+| `--include` | str* | None | Include only tissues matching these patterns |
+| `--exclude` | str* | None | Exclude tissues matching these patterns |
+| `--permutations` | int | 100,000 | Number of permutations for p-value calculation |
+| `--n-jobs` | int | 24 | Number of parallel jobs for computation |
+| `--output-dir` | str | `.` | Directory to save output files |
+| `--list-metadata-columns` | flag | False | List available metadata columns and exit |
+| `--list-tissues` | flag | False | List available tissue files and exit |
+
+## Input File Formats
+
+### Expression Data Files
+- **Format**: Pickle (.pkl) files
+- **Structure**: DataFrame with genes as rows, samples as columns
+- **Naming**: `gtex_v8_data_{tissue_name}-var_pc_log2.pkl`
+- **Content**: Log2-transformed, variance-filtered gene expression data
+
+### Metadata File
+- **Format**: Pickle (.pkl) file
+- **Structure**: DataFrame with samples as rows, metadata columns as columns
+- **Content**: All GTEx v8 sample metadata including demographics, sampling info, etc.
+
+### Gene Mapping File
+- **Format**: Pickle (.pkl) file
+- **Structure**: DataFrame with columns `gene_ens_id` and `gene_symbol`
+- **Content**: Mapping between Ensembl gene IDs and gene symbols
+
+## Output Files
+
+### Per Gene-Tissue Results
+- **Individual Results**: `{gene}_{tissue}_correlation_results.pkl`
+- **Individual Logs**: `{gene}_{tissue}.log`
+- **Content**: Correlation results for each metadata column
+
+### Per Gene Summaries
+- **Combined Results**: `{gene}_all_tissues_correlation_results.pkl`
+- **Combined CSV**: `{gene}_all_tissues_correlation_results.csv`
+- **Content**: All tissues combined for single gene
+
+### Overall Results
+- **Mega Results**: `_all_genes_all_tissues_correlation_results.pkl`
+- **Mega CSV**: `_all_genes_all_tissues_correlation_results.csv`
+- **Summary Log**: `_{genes}_summary_execution.log`
+- **Summary Tables**: `_{genes}_summary_tables.log`
+
+### Result DataFrame Structure
+
+```python
+# Each results DataFrame contains:
+{
+    'ccc_value': float,      # CCC correlation coefficient
+    'p_value': float,        # Permutation-based p-value
+    'status': str,           # 'success', 'all_nan', 'insufficient_variation', or 'error'
+    'tissue': str,           # Tissue name
+    'gene_symbol': str,      # Gene symbol
+    'gene_id': str,         # Ensembl gene ID
+    'n_samples': int        # Number of samples used
+}
+```
+
+## Analysis Workflow
+
+### 1. **Gene Discovery**
+- Converts gene symbols to Ensembl IDs using gene mapping
+- Validates gene existence across tissues
+
+### 2. **Tissue Processing**
+- Loads expression data for each tissue
+- Filters to common samples between expression and metadata
+- Handles missing data and insufficient variation gracefully
+
+### 3. **Correlation Analysis**
+- Computes CCC between gene expression and each metadata column
+- Calculates statistical significance via permutation testing
+- Handles various data types and edge cases
+
+### 4. **Results Compilation**
+- Aggregates results across tissues and genes
+- Generates comprehensive summary statistics
+- Creates ranked lists of strongest correlations
+
+### 5. **Performance Monitoring**
+- Tracks runtime for each gene-tissue combination
+- Identifies computational bottlenecks
+- Provides optimization recommendations
+
+## Statistical Methods
+
+### Clustermatch Correlation Coefficient (CCC)
+- **Purpose**: Detects both linear and non-linear relationships
+- **Advantages**: Robust to outliers, captures complex patterns
+- **Implementation**: Uses permutation-based significance testing
+
+### Significance Levels
+- `***`: p < 0.001 (highly significant)
+- `**`: p < 0.01 (significant)
+- `*`: p < 0.05 (marginally significant)
+- `ns`: p ≥ 0.05 (not significant)
+
+## Performance Considerations
+
+### Computational Requirements
+- **Memory**: ~2-8 GB depending on tissue size and number of genes
+- **CPU**: Benefits from multi-core systems (default: 24 cores)
+- **Time**: ~1-5 minutes per gene-tissue combination
+
+### Optimization Tips
+- **Parallel Processing**: Increase `--n-jobs` for faster computation
+- **Permutations**: Reduce `--permutations` for faster (less precise) p-values
+- **Tissue Filtering**: Use `--include`/`--exclude` to focus on relevant tissues
+- **Batch Processing**: Process multiple genes together for efficiency
+
+## Example Workflows
+
+### 1. Cancer Gene Analysis
+```bash
+# Analyze tumor suppressor genes across cancer-relevant tissues
+python metadata_corr_cli.py TP53 BRCA1 BRCA2 PTEN \
+    --include breast ovary lung liver \
+    --permutations 1000000 \
+    --n-jobs 32 \
+    --output-dir ./cancer_genes_analysis
+```
+
+### 2. Brain-Specific Gene Study
+```bash
+# Focus on brain tissues for neurological genes
+python metadata_corr_cli.py APOE MAPT SNCA \
+    --include brain \
+    --exclude cells \
+    --output-dir ./brain_genes
+```
+
+### 3. Exploratory Analysis
+```bash
+# Quick exploration with reduced permutations
+python metadata_corr_cli.py GENE_OF_INTEREST \
+    --permutations 10000 \
+    --n-jobs 8 \
+    --output-dir ./exploratory
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Gene Not Found**: Check gene symbol spelling and availability in gene mapping
+2. **No Expression Data**: Verify gene is expressed in selected tissues
+3. **Memory Errors**: Reduce number of parallel jobs or process fewer genes at once
+4. **File Not Found**: Ensure all required data files exist in expected locations
+
+### Error Codes
+- **Gene symbol not found**: Gene not in mapping file
+- **No common samples**: Expression and metadata samples don't overlap
+- **All NaN values**: Metadata column contains only missing values
+- **Insufficient variation**: Metadata column has at most one unique value
+
+## Output Interpretation
+
+### Top Results Tables
+- Results ranked by absolute CCC value
+- Include significance levels and tissue information
+- Show strongest correlations across all analyses
+
+### Summary Statistics
+- **Mean |CCC|**: Average absolute correlation strength
+- **Max |CCC|**: Strongest correlation found
+- **Success Rate**: Proportion of successful analyses
+- **Runtime Metrics**: Performance characteristics
+
+## Citation
+
+If you use this tool in your research, please cite the CCC method and relevant GTEx publications.
+
+## Version Information
+
+- **Script**: metadata_corr_cli.py
+- **Converted from**: 00-data-exploration.ipynb
+- **GTEx Version**: v8
+- **CCC Implementation**: Uses ccc.coef module
+
+## Support
+
+For issues related to:
+- **CCC Method**: Refer to CCC library documentation
+- **GTEx Data**: Consult GTEx consortium resources
+- **Script Usage**: Check this README or examine log files for detailed error messages 
\ No newline at end of file
diff --git a/nbs/common/compute_single_gene_pair_correlations_cli.py b/nbs/common/compute_single_gene_pair_correlations_cli.py
new file mode 100755
index 00000000..2cd2c690
--- /dev/null
+++ b/nbs/common/compute_single_gene_pair_correlations_cli.py
@@ -0,0 +1,624 @@
+#!/usr/bin/env python3
+"""
+Single Gene Pair Correlation Analysis Tool
+
+A command-line tool for exploring gene expression data and computing correlations
+between specific gene pairs using CCC (Clustermatch Correlation Coefficient),
+Spearman, and Pearson correlation methods.
+
+This script provides two main functionalities:
+1. Data exploration: Show available genes and their symbols for a tissue
+2. Correlation analysis: Compute correlations for a specific gene pair in a tissue
+
+Author: Generated for CCC-GPU project
+Version: 1.0
+"""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Union
+
+import pandas as pd
+import numpy as np
+
+# Import correlation methods
+try:
+    from ccc.corr import ccc_gpu, pearson, spearman
+except ImportError:
+    print("Error: CCC library not found. Please install the ccc package.")
+    sys.exit(1)
+
+
+def setup_logging(debug: bool = False, output_dir: Optional[Path] = None) -> Optional[Path]:
+    """Configure logging for the script.
+
+    Replaces the root logger configuration with a console handler on stdout
+    and, when `output_dir` is given, a timestamped log file in that directory
+    (the directory is created if missing).
+
+    Args:
+        debug: Enable debug level logging if True
+        output_dir: Directory to write log files to (optional)
+
+    Returns:
+        Path to log file if output_dir provided, None otherwise
+    """
+    level = logging.DEBUG if debug else logging.INFO
+
+    # Setup formatters
+    console_formatter = logging.Formatter(
+        '%(asctime)s - %(levelname)s - %(message)s',
+        datefmt='%H:%M:%S'
+    )
+    file_formatter = logging.Formatter(
+        '%(asctime)s - %(levelname)s - %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+
+    # Console handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(console_formatter)
+    handlers = [console_handler]
+
+    # File handler (if output directory provided)
+    log_file = None
+    if output_dir:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # The timestamp in the name gives each run its own log file
+        from datetime import datetime
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        log_file = output_dir / f"gene_pair_correlation_analysis_{timestamp}.log"
+
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(file_formatter)
+        handlers.append(file_handler)
+
+    # force=True makes basicConfig remove AND close the handlers of any earlier
+    # configuration. Detaching them by hand first (without close()) would hide
+    # them from basicConfig and leave previously opened log files open.
+    logging.basicConfig(
+        level=level,
+        handlers=handlers,
+        force=True
+    )
+
+    if log_file:
+        logging.info(f"Log file created: {log_file}")
+
+    return log_file
+
+
+class GeneExpressionAnalyzer:
+    """Main class for gene expression analysis and correlation computation."""
+
+    def __init__(self, data_dir: str, gene_mapping_file: str):
+        """Store the input locations and check that both exist.
+
+        Args:
+            data_dir: Directory containing `gtex_v8_data_*.pkl` tissue files
+            gene_mapping_file: Path to gene ID to symbol mapping file
+        """
+        self.data_dir = Path(data_dir)
+        self.gene_mapping_file = Path(gene_mapping_file)
+        self._gene_mapping = None  # filled lazily by the `gene_mapping` property
+        self._validate_inputs()
+
+    def _validate_inputs(self) -> None:
+        """Raise FileNotFoundError if the data directory or mapping file is missing."""
+        if not self.data_dir.exists():
+            raise FileNotFoundError(f"Data directory not found: {self.data_dir}")
+
+        if not self.gene_mapping_file.exists():
+            raise FileNotFoundError(f"Gene mapping file not found: {self.gene_mapping_file}")
+
+    @property
+    def gene_mapping(self) -> pd.DataFrame:
+        """Gene mapping table, unpickled (trusted files only) on first access, then cached."""
+        if self._gene_mapping is None:
+            logging.info(f"Loading gene mapping from: {self.gene_mapping_file}")
+            self._gene_mapping = pd.read_pickle(self.gene_mapping_file)
+            logging.info(f"Loaded {len(self._gene_mapping)} gene mappings")
+        return self._gene_mapping
+
+    def list_available_tissues(self) -> list:
+        """Get the sorted names of the tissues that have an expression file.
+
+        Returns:
+            Names taken from the `gtex_v8_data_<tissue>.pkl` files in the data directory
+        """
+        prefix = "gtex_v8_data_"
+        # Strip only the leading prefix; replace() would also edit the tissue name.
+        return sorted(f.stem[len(prefix):] for f in self.data_dir.glob(f"{prefix}*.pkl"))
+
+    def _find_tissue_file(self, tissue: str) -> Path:
+        """Find the tissue file for a given tissue name.
+
+        An exact `gtex_v8_data_{tissue}.pkl` file wins; otherwise `tissue` is
+        used as a substring (glob) pattern and must match exactly one file.
+
+        Args:
+            tissue: Full tissue name or an unambiguous part of it
+
+        Returns:
+            Path to tissue file
+
+        Raises:
+            FileNotFoundError: If no tissue file matches
+            ValueError: If more than one tissue file matches
+        """
+        exact_file = self.data_dir / f"gtex_v8_data_{tissue}.pkl"
+        if exact_file.exists():
+            return exact_file
+
+        candidates = list(self.data_dir.glob(f"gtex_v8_data_*{tissue}*.pkl"))
+        if len(candidates) == 1:
+            return candidates[0]
+        if len(candidates) > 1:
+            matches = [f.stem for f in candidates]
+            raise ValueError(
+                f"Multiple tissue files match '{tissue}': {matches}. "
+                "Please be more specific."
+            )
+        # Build the listing first: a conditional expression placed directly after
+        # adjacent string literals would swallow the "No tissue file" prefix.
+        available = self.list_available_tissues()
+        listing = f"{available[:10]}..." if len(available) > 10 else f"{available}"
+        raise FileNotFoundError(f"No tissue file found for '{tissue}'. Available tissues: {listing}")
+
+    def show_tissue_genes(self, tissue: str, n_genes: int = 20) -> None:
+        """Display available genes and their symbols for a tissue.
+
+        Symbols are looked up by the exact Ensembl ID, then by the ID without
+        its version suffix, then by prefix; unresolved genes are shown as "N/A".
+
+        Args:
+            tissue: Tissue name
+            n_genes: Number of genes to display (default: 20)
+        """
+        tissue_file = self._find_tissue_file(tissue)
+        logging.info(f"Loading tissue data from: {tissue_file}")
+
+        tissue_data = pd.read_pickle(tissue_file)
+        logging.info(f"Tissue data shape: {tissue_data.shape}")
+        gene_ids = tissue_data.index.tolist()
+
+        # One symbol per Ensembl ID: dropping duplicate IDs guarantees a scalar
+        # from every lookup (a duplicated label would return a Series instead).
+        mapping = self.gene_mapping.drop_duplicates(subset='gene_ens_id')
+        symbol_by_id = mapping.set_index('gene_ens_id')['gene_symbol']
+
+        print(f"\n=== Tissue: {tissue} ===")
+        print(f"Total genes: {len(gene_ids):,}")
+        print(f"Total samples: {tissue_data.shape[1]:,}")
+        print(f"\nFirst {n_genes} genes:")
+        print("-" * 60)
+        print(f"{'#':<4} {'Gene Symbol':<15} {'Ensembl ID':<20}")
+        print("-" * 60)
+
+        for i, gene_id in enumerate(gene_ids[:n_genes], 1):
+            # Remove version from gene ID for mapping lookup
+            clean_gene_id = str(gene_id).split('.')[0]
+
+            if gene_id in symbol_by_id.index:
+                symbol = symbol_by_id.loc[gene_id]
+            elif clean_gene_id in symbol_by_id.index:
+                symbol = symbol_by_id.loc[clean_gene_id]
+            else:
+                # The mapping may carry a version that the tissue index lacks
+                prefix_hits = mapping['gene_ens_id'].str.startswith(clean_gene_id)
+                versioned = mapping.loc[prefix_hits, 'gene_symbol']
+                symbol = versioned.iloc[0] if len(versioned) > 0 else None
+
+            # str() keeps the column formatting valid for non-string symbols
+            label = "N/A" if symbol is None or pd.isna(symbol) else str(symbol)
+            print(f"{i:<4} {label:<15} {gene_id:<20}")
+
+        if len(gene_ids) > n_genes:
+            print(f"... and {len(gene_ids) - n_genes:,} more genes")
+        print()
+
+    def _resolve_gene(self, gene_input: str) -> Tuple[str, str]:
+        """Resolve gene input to Ensembl ID and symbol.
+
+        Inputs starting with "ENSG" are treated as Ensembl IDs (exact match,
+        then a prefix match on the ID without its version); anything else is a
+        gene symbol (exact match, then case-insensitive). When several mapping
+        rows match, the first one is used.
+
+        Args:
+            gene_input: Gene symbol or Ensembl ID
+
+        Returns:
+            Tuple of (ensembl_id, gene_symbol)
+
+        Raises:
+            ValueError: If gene cannot be resolved
+        """
+        mapping = self.gene_mapping
+
+        if gene_input.startswith('ENSG'):
+            # Ensembl ID: exact match first
+            matches = mapping[mapping['gene_ens_id'] == gene_input]
+            if len(matches) == 0:
+                # Try without version
+                base_id = gene_input.split('.')[0]
+                matches = mapping[mapping['gene_ens_id'].str.startswith(base_id)]
+            if len(matches) == 0:
+                raise ValueError(f"Ensembl ID '{gene_input}' not found in mapping")
+        else:
+            # Gene symbol: exact match first
+            matches = mapping[mapping['gene_symbol'] == gene_input]
+            if len(matches) == 0:
+                # Try case-insensitive search
+                upper_symbols = mapping['gene_symbol'].str.upper()
+                matches = mapping[upper_symbols == gene_input.upper()]
+            if len(matches) == 0:
+                raise ValueError(
+                    f"Gene symbol '{gene_input}' not found. "
+                    "Use --show-genes to see available genes."
+                )
+
+        # Several rows may match; the first one wins
+        first = matches.iloc[0]
+        return first['gene_ens_id'], first['gene_symbol']
+
+    def compute_gene_pair_correlations(
+        self,
+        gene1: str,
+        gene2: str,
+        tissue: str
+    ) -> Dict[str, Union[float, int, str, None]]:
+        """Compute correlations between two genes in a specific tissue.
+
+        Samples where either gene has a missing value are dropped, then the CCC,
+        Pearson and Spearman coefficients are computed independently: if one of
+        them raises, a warning is logged and its entry is set to None.
+
+        Args:
+            gene1: First gene (symbol or Ensembl ID)
+            gene2: Second gene (symbol or Ensembl ID)
+            tissue: Tissue name
+
+        Returns:
+            Dictionary with the keys gene1_symbol, gene1_ensembl_id,
+            gene2_symbol, gene2_ensembl_id, tissue, n_samples, ccc, pearson
+            and spearman
+
+        Raises:
+            ValueError: If a gene cannot be resolved or is absent from the
+                tissue, or fewer than 3 samples have values for both genes
+            FileNotFoundError: If no tissue file matches `tissue`
+        """
+        # Resolve genes
+        gene1_id, gene1_symbol = self._resolve_gene(gene1)
+        gene2_id, gene2_symbol = self._resolve_gene(gene2)
+
+        # Load tissue data
+        tissue_file = self._find_tissue_file(tissue)
+        logging.info(f"Loading tissue data from: {tissue_file}")
+
+        tissue_data = pd.read_pickle(tissue_file)
+        logging.info(f"Tissue data shape: {tissue_data.shape}")
+
+        # Extract gene expression data
+        gene1_expr = self._extract_gene_expression(tissue_data, gene1_id, gene1_symbol)
+        gene2_expr = self._extract_gene_expression(tissue_data, gene2_id, gene2_symbol)
+
+        # Ensure we have the same samples
+        common_samples = gene1_expr.index.intersection(gene2_expr.index)
+        if len(common_samples) == 0:
+            raise ValueError("No common samples between the two genes")
+
+        gene1_values = gene1_expr.loc[common_samples].values
+        gene2_values = gene2_expr.loc[common_samples].values
+
+        # Keep samples measured for both genes; pd.isna (unlike np.isnan) also
+        # accepts object-dtype arrays.
+        mask = ~(pd.isna(gene1_values) | pd.isna(gene2_values))
+        gene1_clean = gene1_values[mask]
+        gene2_clean = gene2_values[mask]
+
+        n_samples = len(gene1_clean)
+        if n_samples < 3:
+            raise ValueError("Insufficient valid data points for correlation analysis")
+
+        logging.info(f"Computing correlations for {n_samples} samples")
+
+        results = {
+            'gene1_symbol': gene1_symbol,
+            'gene1_ensembl_id': gene1_id,
+            'gene2_symbol': gene2_symbol,
+            'gene2_ensembl_id': gene2_id,
+            'tissue': tissue,
+            'n_samples': n_samples,
+        }
+
+        # Genes as rows, samples as columns: the layout passed to the ccc.corr
+        # functions. Built in one step instead of one column per sample.
+        data_df = pd.DataFrame(
+            np.vstack([gene1_clean, gene2_clean]).astype(float),
+            index=[gene1_symbol, gene2_symbol],
+            columns=[f'sample_{i}' for i in range(n_samples)],
+        )
+
+        # (result key, label used in the log messages, callable on data_df)
+        methods = (
+            ('ccc', 'CCC', lambda df: ccc_gpu(df, n_jobs=1)),  # single job for a pair
+            ('pearson', 'Pearson', lambda df: pearson(df)),
+            ('spearman', 'Spearman', lambda df: spearman(df)),
+        )
+        for key, label, method in methods:
+            try:
+                logging.info(f"Computing {label} correlation...")
+                results[key] = float(method(data_df).iloc[0, 1])  # off-diagonal element
+            except Exception as e:
+                logging.warning(f"{label} computation failed: {e}")
+                results[key] = None
+
+        return results
+
+    def _extract_gene_expression(self, tissue_data: pd.DataFrame, gene_id: str, gene_symbol: str) -> pd.Series:
+        """Extract expression data for a specific gene.
+
+        An exact ID match is preferred; else the first label starting with the version-less ID.
+
+        Args:
+            tissue_data: Tissue expression DataFrame indexed by Ensembl gene ID
+            gene_id: Ensembl gene ID
+            gene_symbol: Gene symbol (used in messages only)
+
+        Returns:
+            Series with gene expression values
+
+        Raises:
+            ValueError: If gene is not found in tissue data
+        """
+        # Try exact match first
+        if gene_id in tissue_data.index:
+            return tissue_data.loc[gene_id]
+
+        # Fall back to the ID without its version suffix
+        base_id = gene_id.split('.')[0]
+        matches = [idx for idx in tissue_data.index if idx.startswith(base_id)]
+        if not matches:
+            raise ValueError(f"Gene {gene_symbol} ({gene_id}) not found in tissue data")
+
+        if len(matches) > 1:
+            logging.warning(f"Multiple matches for {gene_symbol} ({gene_id}), using first match")
+        return tissue_data.loc[matches[0]]
+
+
+def save_results(results: Dict[str, Union[float, str]], output_dir: Path) -> Tuple[Path, Path]:
+    """Save correlation results to files.
+
+    Writes a JSON and a pickle file named
+    `{gene1}_{gene2}_{tissue}_{timestamp}_correlation_results.{json,pkl}`.
+    In the JSON file, missing coefficients (None) and non-finite floats are
+    written as null so that the output is valid JSON; any other value that is
+    not an int, float or str is stored as its `str()` form. The pickle file
+    holds `results` unchanged.
+
+    Args:
+        results: Dictionary containing correlation results
+        output_dir: Directory to save files (created if missing)
+
+    Returns:
+        Tuple of (json_file_path, pickle_file_path)
+    """
+    import json
+    import math
+    import pickle
+    from datetime import datetime
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    base_filename = (
+        f"{results['gene1_symbol']}_{results['gene2_symbol']}_{results['tissue']}_{timestamp}"
+    )
+    json_file = output_dir / f"{base_filename}_correlation_results.json"
+    pickle_file = output_dir / f"{base_filename}_correlation_results.pkl"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    json_data = {}
+    for key, value in results.items():
+        if value is None or (isinstance(value, float) and not math.isfinite(value)):
+            json_data[key] = None  # a failed/undefined coefficient becomes JSON null
+        elif isinstance(value, (int, float, str)):
+            json_data[key] = value
+        else:
+            json_data[key] = str(value)
+
+    with open(json_file, 'w') as f:
+        json.dump(json_data, f, indent=2, allow_nan=False)
+    with open(pickle_file, 'wb') as f:
+        pickle.dump(results, f)
+    logging.info(f"Results saved to: {json_file}")
+    logging.info(f"Results saved to: {pickle_file}")
+    return json_file, pickle_file
+
+
+def main():
+    """Run the single gene pair correlation command line tool.
+
+    Lists tissues, shows the genes of a tissue, or computes and prints the
+    correlations of a gene pair, depending on the parsed flags.
+
+    Returns the results dict of a correlation run, or None after a discovery
+    command. An exception raised while running is logged, then exit status 1.
+    """
+    parser = argparse.ArgumentParser(
+        description="Single Gene Pair Correlation Analysis Tool",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Show available tissues
+  python compute_single_gene_pair_correlations_cli.py --list-tissues
+  
+  # Show genes in whole blood tissue
+  python compute_single_gene_pair_correlations_cli.py --show-genes whole_blood
+  
+  # Compute correlations between TP53 and BRCA1 in whole blood
+  python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood
+  
+  # Save results and logs to output directory
+  python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \\
+    --output-dir ./results --debug
+  
+  # Use custom data directory and gene mapping
+  python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \\
+    --data-dir /custom/path/data \\
+    --gene-mapping /custom/path/mappings.pkl \\
+    --output-dir ./custom_results
+        """
+    )
+
+    # Positional arguments for gene pair analysis
+    parser.add_argument(
+        'genes',
+        nargs='*',
+        help='Two gene symbols or Ensembl IDs for correlation analysis (e.g., TP53 BRCA1)'
+    )
+
+    # Main options
+    parser.add_argument(
+        '--tissue',
+        type=str,
+        help='Tissue name for analysis (required for correlation analysis)'
+    )
+    parser.add_argument(
+        '--data-dir',
+        type=str,
+        default='/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue',
+        help='Directory containing tissue expression data files'
+    )
+    parser.add_argument(
+        '--gene-mapping',
+        type=str,
+        default='/mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl',
+        help='Path to gene ID to symbol mapping file'
+    )
+
+    # Discovery options
+    parser.add_argument(
+        '--list-tissues', action='store_true', help='List all available tissues and exit'
+    )
+    parser.add_argument(
+        '--show-genes',
+        type=str,
+        metavar='TISSUE',
+        help='Show available genes for specified tissue and exit'
+    )
+    parser.add_argument(
+        '--n-genes',
+        type=int,
+        default=20,
+        help='Number of genes to show (default: 20)'
+    )
+
+    # Output options
+    parser.add_argument(
+        '--output-dir',
+        type=str,
+        help='Directory to save output files and logs (optional)'
+    )
+
+    # Utility options
+    parser.add_argument(
+        '--debug', action='store_true', help='Enable debug logging'
+    )
+
+    args = parser.parse_args()
+
+    # Fail fast on usage errors: a correlation run (no discovery flag given)
+    # needs exactly two genes and a tissue, so check before any other work.
+    if not (args.list_tissues or args.show_genes):
+        if len(args.genes) != 2:
+            parser.error(
+                "Exactly two genes are required for correlation analysis. "
+                "Use --show-genes to see available genes, or --list-tissues to see available tissues."
+            )
+        if not args.tissue:
+            parser.error(
+                "Tissue is required for correlation analysis. "
+                "Use --list-tissues to see available tissues."
+            )
+
+    # Setup output directory
+    output_dir = Path(args.output_dir) if args.output_dir else None
+
+    # Setup logging
+    log_file = setup_logging(debug=args.debug, output_dir=output_dir)
+
+    try:
+        # Initialize analyzer
+        analyzer = GeneExpressionAnalyzer(args.data_dir, args.gene_mapping)
+
+        # Handle discovery commands
+        if args.list_tissues:
+            tissues = analyzer.list_available_tissues()
+            print(f"\n=== Available Tissues ({len(tissues)}) ===")
+            for i, tissue in enumerate(tissues, 1):
+                print(f"{i:2d}. {tissue}")
+            print()
+            return
+
+        if args.show_genes:
+            analyzer.show_tissue_genes(args.show_genes, args.n_genes)
+            return
+
+        # Handle correlation analysis (arguments were validated above)
+        gene1, gene2 = args.genes
+        results = analyzer.compute_gene_pair_correlations(gene1, gene2, args.tissue)
+
+        # Save results to files if output directory provided
+        saved_files = None
+        if output_dir:
+            try:
+                saved_files = save_results(results, output_dir)
+                logging.info(f"Results saved to output directory: {output_dir}")
+            except Exception as e:
+                logging.error(f"Failed to save results: {e}")
+
+        # Print results
+        print("\n" + "="*60)
+        print("GENE PAIR CORRELATION RESULTS")
+        print("="*60)
+        print(f"Gene 1: {results['gene1_symbol']} ({results['gene1_ensembl_id']})")
+        print(f"Gene 2: {results['gene2_symbol']} ({results['gene2_ensembl_id']})")
+        print(f"Tissue: {results['tissue']}")
+        print(f"Samples: {results['n_samples']:,}")
+        print("-" * 60)
+
+        for method in ['ccc', 'pearson', 'spearman']:
+            value = results.get(method)
+            if value is not None:
+                print(f"{method.upper():>12}: {value:.6f}")
+            else:
+                print(f"{method.upper():>12}: Failed to compute")
+
+        print("="*60)
+
+        # Show saved files info
+        if saved_files:
+            print("Results saved to:")
+            print(f"  JSON: {saved_files[0].name}")
+            print(f"  Pickle: {saved_files[1].name}")
+
+        if log_file:
+            print(f"Log file: {log_file.name}")
+
+        print()
+
+        # Also return as dict for programmatic use
+        return results
+    except Exception as e:
+        logging.error(f"Error: {e}")
+        if args.debug:
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()  # CLI entry point; main()'s return value is not used here
\ No newline at end of file
diff --git a/nbs/common/metadata_corr_cli.py b/nbs/common/metadata_corr_cli.py
new file mode 100755
index 00000000..669c9ccd
--- /dev/null
+++ b/nbs/common/metadata_corr_cli.py
@@ -0,0 +1,1191 @@
+#!/usr/bin/env python3
+"""
+CLI tool for exploring gene expression correlations with metadata.
+Converted from 00-data-exploration.ipynb
+"""
+
+import argparse
+import sys
+import warnings
+import re
+import time
+import logging
+from pathlib import Path
+import pandas as pd
+import numpy as np
+# from ccc.coef import ccc
+from ccc.coef.impl_gpu import ccc 
+
+# Silence the "invalid value encountered in cast" message and RuntimeWarnings attributed to numpy
+warnings.filterwarnings("ignore", message="invalid value encountered in cast")
+warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")
+
+# Default for log_and_print's quiet argument; main() overwrites it from --quiet
+QUIET_MODE = False
+
+
+def find_expression_files(expr_data_dir, include_patterns=None, exclude_patterns=None, quiet=False):
+    """Find GTEx expression pickle files, filtered by include/exclude patterns.
+
+    Files named ``gtex_v8_data_<tissue>.pkl`` directly inside ``expr_data_dir``
+    are collected in sorted path order. Each include/exclude pattern is a
+    regular expression searched case-insensitively in the tissue name and the
+    file name; it is compiled with ``re.IGNORECASE`` instead of being
+    lower-cased, which would corrupt case-sensitive regex escapes.
+
+    Args:
+        expr_data_dir: Directory to scan (str or Path).
+        include_patterns: If non-empty, keep only files matching any pattern.
+        exclude_patterns: If non-empty, drop files matching any pattern.
+        quiet: When True, do not print the list of selected files.
+
+    Returns:
+        List of ``(file_path, tissue_name)`` tuples; may be empty after filtering.
+
+    Raises:
+        FileNotFoundError: If the directory is missing or has no matching file.
+    """
+    expr_data_dir = Path(expr_data_dir)
+
+    if not expr_data_dir.exists():
+        raise FileNotFoundError(f"Expression data directory not found: {expr_data_dir}")
+
+    # Collect every file that follows the expected naming scheme; sorting makes
+    # the processing order independent of the directory listing order.
+    name_regex = re.compile(r"gtex_v8_data_(.+)\.pkl$")
+    all_files = []
+    for file_path in sorted(expr_data_dir.glob("*.pkl")):
+        match = name_regex.match(file_path.name)
+        if match:
+            all_files.append((file_path, match.group(1)))
+
+    if not all_files:
+        raise FileNotFoundError(f"No matching expression files found in {expr_data_dir}")
+
+    def matches_any(patterns, file_path, tissue_name):
+        """Return True if any pattern is found in the tissue or file name."""
+        regexes = [re.compile(raw, re.IGNORECASE) for raw in patterns]
+        return any(r.search(tissue_name) or r.search(file_path.name) for r in regexes)
+
+    # Apply include patterns first, then exclude patterns
+    if include_patterns:
+        all_files = [entry for entry in all_files if matches_any(include_patterns, *entry)]
+    if exclude_patterns:
+        all_files = [entry for entry in all_files if not matches_any(exclude_patterns, *entry)]
+
+    if not quiet:
+        print(f"Found {len(all_files)} expression files to process:")
+        for file_path, tissue_name in all_files:
+            print(f"  {tissue_name}: {file_path.name}")
+
+    return all_files
+
+
+def load_metadata_and_gene_map(data_dir, quiet=False):
+    """Read the GTEx sample metadata and gene ID/symbol mapping pickles.
+
+    Both files are looked up by fixed name inside ``data_dir``. Returns the
+    tuple ``(gtex_metadata, gene_map)`` and raises FileNotFoundError for the
+    first file that does not exist. Progress is printed unless ``quiet``.
+    """
+    base_dir = Path(data_dir)
+    metadata_path = base_dir / "gtex_v8-sample_metadata.pkl"
+    gene_map_path = base_dir / "gtex_gene_id_symbol_mappings.pkl"
+
+    # Verify both inputs up front so nothing is loaded if one is missing.
+    missing = [p for p in (metadata_path, gene_map_path) if not p.exists()]
+    if missing:
+        raise FileNotFoundError(f"Required file not found: {missing[0]}")
+
+    if not quiet:
+        print("Loading metadata and gene mapping files...")
+
+    gtex_metadata = pd.read_pickle(metadata_path)
+    gene_map = pd.read_pickle(gene_map_path)
+
+    if not quiet:
+        print(f"Loaded metadata: {gtex_metadata.shape}")
+        print(f"Loaded gene mapping: {gene_map.shape}")
+    return gtex_metadata, gene_map
+
+
+def setup_tissue_logger(gene_symbol, tissue_name, output_dir, no_individual_logs=False):
+    """Create the INFO-level logger for one gene/tissue combination.
+
+    Stale handlers from an earlier call are closed and removed first.
+
+    Args:
+        gene_symbol, tissue_name: Used to name the logger and the log file.
+        output_dir: Directory for the log file (str or Path).
+        no_individual_logs: When True, no file handler is attached.
+
+    Returns:
+        ``(logger, log_file)``; ``log_file`` is None when logs are disabled.
+    """
+    logger = logging.getLogger(f"tissue_{gene_symbol}_{tissue_name}")
+    for stale in list(logger.handlers):  # copy: removeHandler mutates the list
+        stale.close()
+        logger.removeHandler(stale)
+    logger.setLevel(logging.INFO)
+    if no_individual_logs:
+        return logger, None
+
+    log_file = Path(output_dir) / f"{gene_symbol}_{tissue_name}.log"
+    file_handler = logging.FileHandler(log_file, mode="w")
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(
+        logging.Formatter("%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+    )
+    logger.addHandler(file_handler)
+    return logger, log_file
+
+
+def setup_summary_logger(gene_symbols, output_dir):
+    """Create the INFO-level "summary" logger that writes the run summary log.
+
+    The log file ``_<joined genes>_summary_execution.log`` is opened (mode "w") in
+    ``output_dir``; stale handlers from an earlier call are closed and removed.
+
+    Args:
+        gene_symbols: Gene symbol strings, joined with "_" for the file name.
+        output_dir: Directory for the log file (str or Path).
+
+    Returns:
+        ``(logger, log_file)`` with ``log_file`` the path of the summary log.
+    """
+    logger = logging.getLogger("summary")
+    for stale in list(logger.handlers):  # copy: removeHandler mutates the list
+        stale.close()
+        logger.removeHandler(stale)
+    logger.setLevel(logging.INFO)
+    genes_connected = "_".join(gene_symbols)
+    log_file = Path(output_dir) / f"_{genes_connected}_summary_execution.log"
+    file_handler = logging.FileHandler(log_file, mode="w")
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(
+        logging.Formatter("%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+    )
+    logger.addHandler(file_handler)
+    return logger, log_file
+
+
+def log_and_print(message, logger=None, summary_file=None, quiet=None):
+    """Emit ``message`` to stdout, the logger and the summary file, if given.
+
+    A quiet call only skips the print; ``quiet=None`` uses ``QUIET_MODE``.
+    """
+    suppress_print = QUIET_MODE if quiet is None else quiet
+    if not suppress_print:
+        print(message)
+    if logger:
+        logger.info(message)
+    if summary_file:
+        summary_file.write(message + "\n")
+        summary_file.flush()  # Ensure immediate write to disk
+
+
+def get_gene_id(gene_symbol, gene_map):
+    """Look up the ``gene_ens_id`` recorded for ``gene_symbol`` in ``gene_map``.
+
+    Raises ValueError when the symbol is absent; when it occurs several times
+    a warning is printed and the first match is returned.
+    """
+    matches = gene_map.loc[gene_map["gene_symbol"] == gene_symbol, "gene_ens_id"]
+    if matches.empty:
+        raise ValueError(f"Gene symbol '{gene_symbol}' not found in gene mapping")
+    if len(matches) > 1:
+        print(f"Warning: Multiple matches found for '{gene_symbol}': {matches.tolist()}")
+        print(f"Using first match: {matches.iloc[0]}")
+    return matches.iloc[0]
+
+
+def compute_correlations_for_tissue(
+    gene_symbol,
+    tissue_name,
+    expr_file_path,
+    gtex_metadata,
+    gene_map,
+    output_dir,
+    pvalue_n_perms=1000000,
+    n_jobs=1,
+    no_individual_logs=False,
+):
+    """Compute CCC between one gene's expression and every metadata column.
+
+    The expression table is read from ``expr_file_path`` and restricted to the
+    samples that also appear in the index of ``gtex_metadata``. Each metadata
+    column is then compared with the gene's expression row via ``ccc``; columns
+    that are entirely NaN or have at most one distinct non-NaN value get a NaN
+    result with a skip status, and an exception raised for a column is
+    recorded in its status instead of aborting the tissue.
+
+    Args:
+        gene_symbol: Gene symbol, resolved to a gene ID with ``get_gene_id``.
+        tissue_name: Tissue label used in messages, log names and the result.
+        expr_file_path: Path of the pickled expression table; the code indexes
+            it by gene ID (rows) and sample ID (columns).
+        gtex_metadata: DataFrame of sample metadata indexed by sample ID.
+        gene_map: Gene mapping table passed on to ``get_gene_id``.
+        output_dir: Directory that receives the per-tissue log file.
+        pvalue_n_perms: Permutation count forwarded to ``ccc``.
+        n_jobs: Job count forwarded to ``ccc``.
+        no_individual_logs: When True, no per-tissue log file is written.
+
+    Returns:
+        ``(results_df, gene_id)``. ``results_df`` is indexed by metadata column
+        with the columns ``ccc_value``, ``p_value``, ``status``, ``tissue``,
+        ``gene_symbol``, ``gene_id`` and ``n_samples``; it is None when the
+        gene is missing from the tissue or no sample has metadata.
+
+    Raises:
+        ValueError: Propagated from ``get_gene_id`` for an unknown symbol.
+    """
+    # Set up logging for this tissue
+    logger, log_file = setup_tissue_logger(gene_symbol, tissue_name, output_dir, no_individual_logs)
+    try:
+        log_and_print(f"\n{'='*60}", logger)
+        log_and_print(f"Processing tissue: {tissue_name}", logger)
+        log_and_print(f"File: {expr_file_path.name}", logger)
+        log_and_print(f"Log file: {log_file}", logger)
+        log_and_print(f"{'='*60}", logger)
+
+        # Load expression data
+        log_and_print("Loading expression data...", logger)
+        expr_data = pd.read_pickle(expr_file_path)
+        log_and_print(f"Expression data shape: {expr_data.shape}", logger)
+
+        # Get gene ID
+        gene_id = get_gene_id(gene_symbol, gene_map)
+        log_and_print(f"Gene ID for {gene_symbol}: {gene_id}", logger)
+
+        # Check if gene exists in this tissue
+        if gene_id not in expr_data.index:
+            log_and_print(
+                f"Warning: Gene ID '{gene_id}' not found in {tissue_name} expression data",
+                logger,
+            )
+            return None, gene_id
+
+        # Get sample IDs from expression data
+        sample_ids = expr_data.columns
+        log_and_print(f"Number of samples: {len(sample_ids)}", logger)
+
+        # Keep only the samples present in both the expression data and metadata
+        common_samples = sample_ids.intersection(gtex_metadata.index)
+        if len(common_samples) == 0:
+            log_and_print(
+                f"Warning: No common samples found between {tissue_name} expression data and metadata",
+                logger,
+            )
+            return None, gene_id
+
+        log_and_print(f"Common samples: {len(common_samples)}", logger)
+
+        gene_expr_filtered = expr_data.loc[gene_id].loc[common_samples]
+        sample_metadata = gtex_metadata.loc[common_samples]
+        n_columns = len(sample_metadata.columns)
+
+        log_and_print(
+            f"Computing CCC between {gene_symbol} expression and all metadata columns...",
+            logger,
+        )
+        log_and_print(f"Using {pvalue_n_perms} permutations and {n_jobs} jobs", logger)
+        log_and_print(f"Processing {n_columns} metadata columns...", logger)
+
+        results = []
+
+        def record(column, status, ccc_value=np.nan, p_value=np.nan):
+            """Append the outcome for one metadata column to ``results``."""
+            results.append(
+                {
+                    "metadata_column": column,
+                    "ccc_value": ccc_value,
+                    "p_value": p_value,
+                    "status": status,
+                }
+            )
+
+        for i, column in enumerate(sample_metadata.columns, 1):
+            log_and_print(f"Processing column {i}/{n_columns}: {column}", logger)
+            try:
+                metadata_vector = sample_metadata[column]
+
+                # Skip columns with all NaN values
+                if metadata_vector.isna().all():
+                    log_and_print(f"  Skipping {column}: all values are NaN", logger)
+                    record(column, "all_nan")
+                    continue
+
+                # Skip columns with only one unique value (after removing NaN)
+                unique_values = metadata_vector.dropna().nunique()
+                if unique_values <= 1:
+                    log_and_print(
+                        f"  Skipping {column}: only {unique_values} unique value(s)", logger
+                    )
+                    record(column, "insufficient_variation")
+                    continue
+
+                # Compute CCC (suppress numpy warnings during computation)
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore", RuntimeWarning)
+                    ccc_val, ccc_pval = ccc(
+                        gene_expr_filtered,
+                        metadata_vector,
+                        pvalue_n_perms=pvalue_n_perms,
+                        n_jobs=n_jobs,
+                    )
+
+                record(column, "success", ccc_val, ccc_pval)
+                log_and_print(f"  CCC: {ccc_val:.6f}, p-value: {ccc_pval:.2e}", logger)
+            except Exception as e:
+                log_and_print(f"  Error processing {column}: {e}", logger)
+                record(column, f"error: {str(e)}")
+
+        # Naming the columns keeps set_index working when there are no rows
+        results_df = pd.DataFrame(
+            results, columns=["metadata_column", "ccc_value", "p_value", "status"]
+        )
+        results_df.set_index("metadata_column", inplace=True)
+
+        # Add tissue information
+        results_df["tissue"] = tissue_name
+        results_df["gene_symbol"] = gene_symbol
+        results_df["gene_id"] = gene_id
+        results_df["n_samples"] = len(common_samples)
+
+        # Log completion
+        n_success = int((results_df["status"] == "success").sum())
+        log_and_print(f"\nCompleted processing {tissue_name}:", logger)
+        log_and_print(f"  Total metadata columns: {len(results_df)}", logger)
+        log_and_print(f"  Successful analyses: {n_success}", logger)
+        log_and_print(f"  Skipped/Failed: {len(results_df) - n_success}", logger)
+
+        return results_df, gene_id
+    finally:
+        # Release the log file on every exit path (early return or exception).
+        # Iterate over a copy because removeHandler mutates logger.handlers.
+        for handler in list(logger.handlers):
+            handler.close()
+            logger.removeHandler(handler)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Analyze gene expression correlations with metadata using CCC across multiple tissues",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument(
+        "gene_symbols",
+        nargs="+",
+        help="Gene symbol(s) to analyze (e.g., RASSF2 TP53 BRCA1)",
+    )
+
+    parser.add_argument(
+        "--expr-data-dir",
+        default="/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue",
+        help="Directory containing expression data files",
+    )
+
+    parser.add_argument(
+        "--include",
+        nargs="*",
+        help="Include only tissues matching these patterns (fuzzy match on tissue name)",
+    )
+
+    parser.add_argument(
+        "--exclude",
+        nargs="*",
+        help="Exclude tissues matching these patterns (fuzzy match on tissue name)",
+    )
+
+    parser.add_argument(
+        "--permutations",
+        type=int,
+        # default=1000000,
+        default=100000,
+        help="Number of permutations for p-value calculation",
+    )
+
+    parser.add_argument(
+        "--n-jobs", type=int, default=4, help="Number of parallel jobs for computation"
+    )
+
+    parser.add_argument(
+        "--list-metadata-columns",
+        action="store_true",
+        help="List available metadata columns and exit",
+    )
+
+    parser.add_argument(
+        "--list-tissues",
+        action="store_true",
+        help="List available tissue files and exit",
+    )
+
+    parser.add_argument(
+        "--output-dir",
+        default=".",
+        help="Directory to save output files (default: current directory)",
+    )
+    
+    parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="Reduce output verbosity for batch processing",
+    )
+    
+    parser.add_argument(
+        "--no-csv-output",
+        action="store_true",
+        help="Skip CSV file generation (only create pickle files)",
+    )
+    
+    parser.add_argument(
+        "--no-individual-logs",
+        action="store_true",
+        help="Skip individual tissue log files (only keep summary logs)",
+    )
+    
+    parser.add_argument(
+        "--data-dir",
+        default="/mnt/data/proj_data/ccc-gpu/data/tutorial",
+        help="Directory containing GTEx data files (metadata and gene mappings)",
+    )
+
+    args = parser.parse_args()
+
+    # Set global quiet mode
+    global QUIET_MODE
+    QUIET_MODE = args.quiet
+
+    try:
+        # Create output directory if it doesn't exist
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Set up summary logger
+        summary_logger, summary_log_file = setup_summary_logger(
+            args.gene_symbols, output_dir
+        )
+
+        # Set up summary tables file
+        genes_connected = "_".join(args.gene_symbols)
+        summary_tables_file_path = output_dir / f"_{genes_connected}_summary_tables.log"
+        summary_tables_file = open(summary_tables_file_path, "w")
+
+        log_and_print(f"Output directory: {output_dir.absolute()}", summary_logger)
+        log_and_print(f"Summary log file: {summary_log_file}", summary_logger)
+        log_and_print(
+            f"Summary tables file: {summary_tables_file_path}", summary_logger
+        )
+        log_and_print(
+            f"Gene symbols to analyze: {', '.join(args.gene_symbols)}", summary_logger
+        )
+
+        # Find expression files
+        expression_files = find_expression_files(
+            args.expr_data_dir,
+            include_patterns=args.include,
+            exclude_patterns=args.exclude,
+            quiet=args.quiet,
+        )
+
+        # If user wants to list tissues
+        if args.list_tissues:
+            log_and_print(
+                f"Available expression files in {args.expr_data_dir}:", summary_logger
+            )
+            for file_path, tissue_name in expression_files:
+                log_and_print(f"  {tissue_name}: {file_path.name}", summary_logger)
+            summary_tables_file.close()
+            return
+
+        # Load metadata and gene mapping
+        gtex_metadata, gene_map = load_metadata_and_gene_map(args.data_dir, quiet=args.quiet)
+
+        # If user wants to list metadata columns
+        if args.list_metadata_columns:
+            log_and_print("Available metadata columns:", summary_logger)
+            for col in sorted(gtex_metadata.columns):
+                log_and_print(f"  {col}", summary_logger)
+            summary_tables_file.close()
+            return
+
+        # Process each gene symbol
+        all_genes_results = {}
+        total_start_time = time.time()
+
+        for gene_idx, gene_symbol in enumerate(args.gene_symbols, 1):
+            log_and_print(f"\n{'='*100}", summary_logger)
+            log_and_print(
+                f"PROCESSING GENE {gene_idx}/{len(args.gene_symbols)}: {gene_symbol}",
+                summary_logger,
+            )
+            log_and_print(f"{'='*100}", summary_logger)
+
+            # Process each tissue for this gene
+            all_results = {}
+            gene_id = None
+            tissue_runtimes = {}
+            gene_start_time = time.time()
+
+            for i, (expr_file_path, tissue_name) in enumerate(expression_files, 1):
+                log_and_print(
+                    f"\n[{i}/{len(expression_files)}] Starting processing for {gene_symbol} in {tissue_name}...",
+                    summary_logger,
+                )
+                tissue_start_time = time.time()
+
+                try:
+                    results_df, current_gene_id = compute_correlations_for_tissue(
+                        gene_symbol,
+                        tissue_name,
+                        expr_file_path,
+                        gtex_metadata,
+                        gene_map,
+                        output_dir,
+                        args.permutations,
+                        args.n_jobs,
+                        args.no_individual_logs,
+                    )
+
+                    tissue_end_time = time.time()
+                    tissue_runtime = tissue_end_time - tissue_start_time
+                    tissue_runtimes[tissue_name] = tissue_runtime
+
+                    if results_df is not None:
+                        all_results[tissue_name] = results_df
+                        gene_id = current_gene_id
+
+                        # Save individual tissue results
+                        output_file = (
+                            output_dir
+                            / f"{gene_symbol}_{tissue_name}_correlation_results.pkl"
+                        )
+                        log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+                        results_df.to_pickle(output_file)
+                        log_and_print(
+                            f"Results for {gene_symbol} in {tissue_name} saved to: {output_file}",
+                            summary_logger,
+                        )
+                        if not args.no_individual_logs and log_file:
+                            log_and_print(
+                                f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}",
+                                summary_logger,
+                            )
+                        log_and_print(
+                            f"Runtime for {gene_symbol} in {tissue_name}: {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)",
+                            summary_logger,
+                        )
+                    else:
+                        log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+                        log_and_print(
+                            f"No results generated for {gene_symbol} in {tissue_name}",
+                            summary_logger,
+                        )
+                        if not args.no_individual_logs and log_file:
+                            log_and_print(
+                                f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}",
+                                summary_logger,
+                            )
+                        log_and_print(
+                            f"Runtime for {gene_symbol} in {tissue_name}: {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)",
+                            summary_logger,
+                        )
+
+                except Exception as e:
+                    tissue_end_time = time.time()
+                    tissue_runtime = tissue_end_time - tissue_start_time
+                    tissue_runtimes[tissue_name] = tissue_runtime
+                    log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+                    log_and_print(
+                        f"Error processing {gene_symbol} in {tissue_name}: {e}",
+                        summary_logger,
+                    )
+                    if not args.no_individual_logs and log_file:
+                        log_and_print(
+                            f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}",
+                            summary_logger,
+                        )
+                    log_and_print(
+                        f"Runtime for {gene_symbol} in {tissue_name} (failed): {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)",
+                        summary_logger,
+                    )
+                    continue
+
+            # Gene-level summary
+            gene_end_time = time.time()
+            gene_runtime = gene_end_time - gene_start_time
+
+            if not all_results:
+                log_and_print(
+                    f"No successful analyses completed for {gene_symbol}.",
+                    summary_logger,
+                )
+                log_and_print(
+                    f"Runtime for {gene_symbol}: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)",
+                    summary_logger,
+                )
+                continue
+
+            # Store results for this gene
+            all_genes_results[gene_symbol] = {
+                "results": all_results,
+                "gene_id": gene_id,
+                "tissue_runtimes": tissue_runtimes,
+                "gene_runtime": gene_runtime,
+            }
+
+            # Save combined results for this gene
+            combined_results = pd.concat(all_results.values(), ignore_index=False)
+            combined_output_file = (
+                output_dir / f"{gene_symbol}_all_tissues_correlation_results.pkl"
+            )
+            combined_results.to_pickle(combined_output_file)
+            if not args.no_csv_output:
+                combined_csv_file = (
+                    output_dir / f"{gene_symbol}_all_tissues_correlation_results.csv"
+                )
+                combined_results.to_csv(combined_csv_file)
+
+            # Gene-specific summary
+            log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+            log_and_print(
+                "COMBINED RESULTS SUMMARY", summary_logger, summary_tables_file
+            )
+            log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+            log_and_print(
+                f"Gene Symbol: {gene_symbol}", summary_logger, summary_tables_file
+            )
+            log_and_print(f"Gene ID: {gene_id}", summary_logger, summary_tables_file)
+            log_and_print(
+                f"Permutations: {args.permutations:,}",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(
+                f"Tissues processed: {len(all_results)}",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(
+                f"Combined results saved to: {combined_output_file}",
+                summary_logger,
+                summary_tables_file,
+            )
+            if not args.no_csv_output:
+                log_and_print(
+                    f"Combined results (CSV) saved to: {combined_csv_file}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+            # Show summary statistics for this gene
+            successful_analyses = combined_results[
+                combined_results["status"] == "success"
+            ]
+            if len(successful_analyses) > 0:
+                log_and_print(
+                    f"\nTotal successful analyses across all tissues: {len(successful_analyses)}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+                log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+                log_and_print(
+                    "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)",
+                    summary_logger,
+                    summary_tables_file,
+                )
+                log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+
+                # Sort by absolute CCC value (descending) - simplified approach
+                successful_analyses_copy = successful_analyses.copy()
+                successful_analyses_copy["abs_ccc"] = successful_analyses_copy[
+                    "ccc_value"
+                ].abs()
+                top_results = successful_analyses_copy.sort_values(
+                    "abs_ccc", ascending=False
+                )
+
+                # Display top results
+                log_and_print(
+                    f"{'Tissue':<20} {'Metadata Column':<25} {'CCC Value':<12} {'P-value':<12} {'Significance':<15}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+                log_and_print("-" * 90, summary_logger, summary_tables_file)
+
+                for idx, row in top_results.head(20).iterrows():
+                    tissue = row["tissue"]
+                    ccc_val = row["ccc_value"]
+                    p_val = row["p_value"]
+
+                    # Determine significance
+                    if p_val < 0.001:
+                        significance = "***"
+                    elif p_val < 0.01:
+                        significance = "**"
+                    elif p_val < 0.05:
+                        significance = "*"
+                    else:
+                        significance = "ns"
+
+                    log_and_print(
+                        f"{tissue:<20} {idx:<25} {ccc_val:>10.6f}  {p_val:>10.2e}  {significance:<15}",
+                        summary_logger,
+                        summary_tables_file,
+                    )
+
+                # Summary by tissue for this gene
+                log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+                log_and_print("SUMMARY BY TISSUE", summary_logger, summary_tables_file)
+                log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+
+                log_and_print(
+                    f"{'Tissue':<20} {'N Samples':<10} {'Successful':<12} {'Mean |CCC|':<12} {'Max |CCC|':<12}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+                log_and_print("-" * 70, summary_logger, summary_tables_file)
+
+                for tissue_name in sorted(all_results.keys()):
+                    tissue_results = all_results[tissue_name]
+                    tissue_successful = tissue_results[
+                        tissue_results["status"] == "success"
+                    ]
+                    n_samples = (
+                        tissue_results["n_samples"].iloc[0]
+                        if len(tissue_results) > 0
+                        else 0
+                    )
+
+                    if len(tissue_successful) > 0:
+                        mean_ccc = tissue_successful["ccc_value"].abs().mean()
+                        max_ccc = tissue_successful["ccc_value"].abs().max()
+                        log_and_print(
+                            f"{tissue_name:<20} {n_samples:<10} {len(tissue_successful):<12} {mean_ccc:<12.6f} {max_ccc:<12.6f}",
+                            summary_logger,
+                            summary_tables_file,
+                        )
+                    else:
+                        log_and_print(
+                            f"{tissue_name:<20} {n_samples:<10} {'0':<12} {'N/A':<12} {'N/A':<12}",
+                            summary_logger,
+                            summary_tables_file,
+                        )
+
+            # Runtime summary for this gene
+            log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+            log_and_print("RUNTIME SUMMARY", summary_logger, summary_tables_file)
+            log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+            log_and_print(
+                f"Total runtime: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(
+                f"Average runtime per tissue: {gene_runtime/len(expression_files):.2f} seconds",
+                summary_logger,
+                summary_tables_file,
+            )
+
+            log_and_print("\nRuntime by tissue:", summary_logger, summary_tables_file)
+            log_and_print(
+                f"{'Tissue':<25} {'Runtime (sec)':<15} {'Runtime (min)':<15} {'Status':<10}",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print("-" * 70, summary_logger, summary_tables_file)
+
+            for tissue_name in sorted(tissue_runtimes.keys()):
+                runtime = tissue_runtimes[tissue_name]
+                status = "Success" if tissue_name in all_results else "Failed"
+                log_and_print(
+                    f"{tissue_name:<25} {runtime:<15.2f} {runtime/60:<15.2f} {status:<10}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+            if tissue_runtimes:
+                # Find fastest and slowest tissues
+                fastest_tissue = min(tissue_runtimes.items(), key=lambda x: x[1])
+                slowest_tissue = max(tissue_runtimes.items(), key=lambda x: x[1])
+
+                log_and_print(
+                    f"\nFastest: {fastest_tissue[0]} ({fastest_tissue[1]:.2f} seconds)",
+                    summary_logger,
+                    summary_tables_file,
+                )
+                log_and_print(
+                    f"Slowest: {slowest_tissue[0]} ({slowest_tissue[1]:.2f} seconds)",
+                    summary_logger,
+                    summary_tables_file,
+                )
+                log_and_print(
+                    f"Speed ratio: {slowest_tissue[1]/fastest_tissue[1]:.1f}x",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+            log_and_print(
+                f"Runtime for {gene_symbol}: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)",
+                summary_logger,
+            )
+
+        total_end_time = time.time()
+        total_runtime = total_end_time - total_start_time
+
+        if not all_genes_results:
+            log_and_print(
+                "No successful analyses completed for any gene.", summary_logger
+            )
+            summary_tables_file.close()
+            return
+
+        # Create overall summary
+        log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+        log_and_print("OVERALL RESULTS SUMMARY", summary_logger, summary_tables_file)
+        log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+        log_and_print(
+            f"Gene symbols processed: {', '.join(all_genes_results.keys())}",
+            summary_logger,
+            summary_tables_file,
+        )
+        log_and_print(
+            f"Total genes: {len(all_genes_results)}",
+            summary_logger,
+            summary_tables_file,
+        )
+        log_and_print(
+            f"Permutations: {args.permutations:,}", summary_logger, summary_tables_file
+        )
+        log_and_print(
+            f"Tissues per gene: {len(expression_files)}",
+            summary_logger,
+            summary_tables_file,
+        )
+
+        # Combine all results across genes
+        all_combined_results = []
+        for gene_symbol, gene_data in all_genes_results.items():
+            gene_combined = pd.concat(gene_data["results"].values(), ignore_index=False)
+            all_combined_results.append(gene_combined)
+
+        mega_combined_results = pd.concat(all_combined_results, ignore_index=False)
+
+        # Save mega combined results
+        mega_output_file = output_dir / "_all_genes_all_tissues_correlation_results.pkl"
+        mega_combined_results.to_pickle(mega_output_file)
+        log_and_print(
+            f"All genes combined results saved to: {mega_output_file}",
+            summary_logger,
+            summary_tables_file,
+        )
+
+        # Also save as CSV for easy viewing (if not disabled)
+        if not args.no_csv_output:
+            mega_csv_file = output_dir / "_all_genes_all_tissues_correlation_results.csv"
+            mega_combined_results.to_csv(mega_csv_file)
+            log_and_print(
+                f"All genes combined results (CSV) saved to: {mega_csv_file}",
+                summary_logger,
+                summary_tables_file,
+            )
+
+        # List all log files created (if individual logs are enabled)
+        if not args.no_individual_logs:
+            log_and_print("\nLog files created:", summary_logger)
+            for gene_symbol in all_genes_results.keys():
+                for tissue_name in [name for _, name in expression_files]:
+                    log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+                    if log_file.exists():
+                        log_and_print(
+                            f"  {gene_symbol} - {tissue_name}: {log_file}", summary_logger
+                        )
+
+        # Show summary statistics across all genes and tissues
+        successful_analyses = mega_combined_results[
+            mega_combined_results["status"] == "success"
+        ]
+        if len(successful_analyses) > 0:
+            log_and_print(
+                f"\nTotal successful analyses across all genes and tissues: {len(successful_analyses)}",
+                summary_logger,
+                summary_tables_file,
+            )
+
+            log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+            log_and_print(
+                "TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+
+            # Sort by absolute CCC value (descending) - simplified approach
+            successful_analyses_copy = successful_analyses.copy()
+            successful_analyses_copy["abs_ccc"] = successful_analyses_copy[
+                "ccc_value"
+            ].abs()
+            top_results = successful_analyses_copy.sort_values(
+                "abs_ccc", ascending=False
+            )
+
+            # Display top results
+            log_and_print(
+                f"{'Gene':<12} {'Tissue':<20} {'Metadata Column':<25} {'CCC Value':<12} {'P-value':<12} {'Significance':<15}",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print("-" * 110, summary_logger, summary_tables_file)
+
+            for idx, row in top_results.head(30).iterrows():
+                gene = row["gene_symbol"]
+                tissue = row["tissue"]
+                ccc_val = row["ccc_value"]
+                p_val = row["p_value"]
+
+                # Determine significance
+                if p_val < 0.001:
+                    significance = "***"
+                elif p_val < 0.01:
+                    significance = "**"
+                elif p_val < 0.05:
+                    significance = "*"
+                else:
+                    significance = "ns"
+
+                log_and_print(
+                    f"{gene:<12} {tissue:<20} {idx:<25} {ccc_val:>10.6f}  {p_val:>10.2e}  {significance:<15}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+            # Summary by gene
+            log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+            log_and_print("SUMMARY BY GENE", summary_logger, summary_tables_file)
+            log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+
+            for gene_symbol, gene_data in all_genes_results.items():
+                gene_combined = pd.concat(
+                    gene_data["results"].values(), ignore_index=False
+                )
+                gene_successful = gene_combined[gene_combined["status"] == "success"]
+
+                log_and_print(
+                    f"\nGene: {gene_symbol} (ID: {gene_data['gene_id']})",
+                    summary_logger,
+                    summary_tables_file,
+                )
+                log_and_print(
+                    f"  Tissues processed: {len(gene_data['results'])}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+                log_and_print(
+                    f"  Successful analyses: {len(gene_successful)}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+                if len(gene_successful) > 0:
+                    mean_ccc = gene_successful["ccc_value"].abs().mean()
+                    max_ccc = gene_successful["ccc_value"].abs().max()
+                    log_and_print(
+                        f"  Mean |CCC|: {mean_ccc:.6f}",
+                        summary_logger,
+                        summary_tables_file,
+                    )
+                    log_and_print(
+                        f"  Max |CCC|: {max_ccc:.6f}",
+                        summary_logger,
+                        summary_tables_file,
+                    )
+
+                    # Top correlation for this gene
+                    gene_successful_copy = gene_successful.copy()
+                    gene_successful_copy["abs_ccc"] = gene_successful_copy[
+                        "ccc_value"
+                    ].abs()
+                    top_corr = gene_successful_copy.sort_values(
+                        "abs_ccc", ascending=False
+                    ).iloc[0]
+                    log_and_print(
+                        f"  Top correlation: {top_corr.name} in {top_corr['tissue']} (CCC: {top_corr['ccc_value']:.6f}, p: {top_corr['p_value']:.2e})",
+                        summary_logger,
+                        summary_tables_file,
+                    )
+
+                log_and_print(
+                    f"  Runtime: {gene_data['gene_runtime']:.2f} seconds ({gene_data['gene_runtime']/60:.2f} minutes)",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+            # Summary by tissue across all genes
+            log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+            log_and_print(
+                "SUMMARY BY TISSUE (across all genes)",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+
+            tissue_summary = {}
+            for gene_symbol, gene_data in all_genes_results.items():
+                for tissue_name, tissue_results in gene_data["results"].items():
+                    if tissue_name not in tissue_summary:
+                        tissue_summary[tissue_name] = []
+                    tissue_summary[tissue_name].append(tissue_results)
+
+            log_and_print(
+                f"{'Tissue':<25} {'N Genes':<10} {'Successful':<12} {'Mean |CCC|':<12} {'Max |CCC|':<12}",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print("-" * 75, summary_logger, summary_tables_file)
+
+            for tissue_name in sorted(tissue_summary.keys()):
+                tissue_all_genes = pd.concat(
+                    tissue_summary[tissue_name], ignore_index=False
+                )
+                tissue_successful = tissue_all_genes[
+                    tissue_all_genes["status"] == "success"
+                ]
+
+                if len(tissue_successful) > 0:
+                    mean_ccc = tissue_successful["ccc_value"].abs().mean()
+                    max_ccc = tissue_successful["ccc_value"].abs().max()
+                    log_and_print(
+                        f"{tissue_name:<25} {len(tissue_summary[tissue_name]):<10} {len(tissue_successful):<12} {mean_ccc:<12.6f} {max_ccc:<12.6f}",
+                        summary_logger,
+                        summary_tables_file,
+                    )
+                else:
+                    log_and_print(
+                        f"{tissue_name:<25} {len(tissue_summary[tissue_name]):<10} {'0':<12} {'N/A':<12} {'N/A':<12}",
+                        summary_logger,
+                        summary_tables_file,
+                    )
+
+        # Runtime summary
+        log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+        log_and_print("RUNTIME SUMMARY", summary_logger, summary_tables_file)
+        log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+        log_and_print(
+            f"Total runtime: {total_runtime:.2f} seconds ({total_runtime/60:.2f} minutes)",
+            summary_logger,
+            summary_tables_file,
+        )
+        log_and_print(
+            f"Average runtime per gene: {total_runtime/len(args.gene_symbols):.2f} seconds",
+            summary_logger,
+            summary_tables_file,
+        )
+        log_and_print(
+            f"Total gene-tissue combinations: {len(args.gene_symbols) * len(expression_files)}",
+            summary_logger,
+            summary_tables_file,
+        )
+
+        # Runtime by gene
+        log_and_print("\nRuntime by gene:", summary_logger, summary_tables_file)
+        log_and_print(
+            f"{'Gene':<15} {'Runtime (sec)':<15} {'Runtime (min)':<15} {'Tissues':<10} {'Successful':<12}",
+            summary_logger,
+            summary_tables_file,
+        )
+        log_and_print("-" * 75, summary_logger, summary_tables_file)
+
+        for gene_symbol, gene_data in all_genes_results.items():
+            successful_tissues = len(gene_data["results"])
+            log_and_print(
+                f"{gene_symbol:<15} {gene_data['gene_runtime']:<15.2f} {gene_data['gene_runtime']/60:<15.2f} {len(expression_files):<10} {successful_tissues:<12}",
+                summary_logger,
+                summary_tables_file,
+            )
+
+        # Aggregate tissue runtime statistics across all genes
+        all_tissue_runtimes = {}
+        for gene_symbol, gene_data in all_genes_results.items():
+            for tissue_name, runtime in gene_data["tissue_runtimes"].items():
+                if tissue_name not in all_tissue_runtimes:
+                    all_tissue_runtimes[tissue_name] = []
+                all_tissue_runtimes[tissue_name].append(runtime)
+
+        if all_tissue_runtimes:
+            log_and_print(
+                "\nAverage runtime by tissue (across all genes):",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(
+                f"{'Tissue':<25} {'Avg Runtime (sec)':<18} {'Avg Runtime (min)':<18} {'N Runs':<8} {'Min':<10} {'Max':<10}",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print("-" * 95, summary_logger, summary_tables_file)
+
+            tissue_avg_runtimes = []
+            for tissue_name in sorted(all_tissue_runtimes.keys()):
+                runtimes = all_tissue_runtimes[tissue_name]
+                avg_runtime = np.mean(runtimes)
+                min_runtime = np.min(runtimes)
+                max_runtime = np.max(runtimes)
+                tissue_avg_runtimes.append((tissue_name, avg_runtime))
+
+                log_and_print(
+                    f"{tissue_name:<25} {avg_runtime:<18.2f} {avg_runtime/60:<18.2f} {len(runtimes):<8} {min_runtime:<10.2f} {max_runtime:<10.2f}",
+                    summary_logger,
+                    summary_tables_file,
+                )
+
+            # Find fastest and slowest tissues (by average)
+            tissue_avg_runtimes.sort(key=lambda x: x[1])
+            fastest_tissue = tissue_avg_runtimes[0]
+            slowest_tissue = tissue_avg_runtimes[-1]
+
+            log_and_print(
+                f"\nFastest tissue (avg): {fastest_tissue[0]} ({fastest_tissue[1]:.2f} seconds)",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(
+                f"Slowest tissue (avg): {slowest_tissue[0]} ({slowest_tissue[1]:.2f} seconds)",
+                summary_logger,
+                summary_tables_file,
+            )
+            log_and_print(
+                f"Speed ratio: {slowest_tissue[1]/fastest_tissue[1]:.1f}x",
+                summary_logger,
+                summary_tables_file,
+            )
+
+        # Final message about summary log
+        log_and_print(f"\nSummary log saved to: {summary_log_file}", summary_logger)
+        log_and_print(
+            f"Summary tables saved to: {summary_tables_file_path}", summary_logger
+        )
+
+        # Close the summary tables file
+        summary_tables_file.close()
+
+        # Close the summary logger
+        for handler in summary_logger.handlers:
+            handler.close()
+            summary_logger.removeHandler(handler)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        # Try to close the summary tables file if it was opened
+        try:
+            summary_tables_file.close()
+        except:
+            pass
+        sys.exit(1)
+
+
+if __name__ == "__main__":  # call main() only when run as a script, not when imported
+    main()