diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
deleted file mode 100644
index 8c462bbd..00000000
--- a/.github/workflows/lint.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-name: lint
-on:
- push:
- pull_request:
- types: [opened, reopened]
-jobs:
- run-linters:
- name: Run linters
- runs-on: ubuntu-latest
-
- steps:
- - name: Check out Git repository
- uses: actions/checkout@v2
-
- - name: Set up Python
- uses: actions/setup-python@v1
- with:
- python-version: 3.9
-
- - name: Install Python dependencies
- run: pip install black flake8
-
- - name: Run linters
- uses: wearerequired/lint-action@v1
- with:
- github_token: ${{ secrets.github_token }}
- # Enable linters
- black: true
- flake8: true
- # Mark the following line true if you want linters to attempt to
- # autocorrect your code
- auto_fix: true
- git_name: "Greene Lab Linter"
- git_email: "miltondp@gmail.com"
- commit_message: "fix code style issues with ${linter}"
-
diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml
deleted file mode 100644
index 5abc2841..00000000
--- a/.github/workflows/pytest.yaml
+++ /dev/null
@@ -1,129 +0,0 @@
-name: tests
-on:
- push:
- pull_request:
- types: [opened, reopened]
-
-env:
- # Increase this value to reset cache if environment.yml has not changed.
- PY_CACHE_NUMBER: 2
- PY_ENV: ccc_gene_expr
-
-jobs:
- ccc_pytest:
- name: Python tests for CCC
- runs-on: ${{ matrix.os }}
- strategy:
- max-parallel: 4
- fail-fast: false
- matrix:
- python-version: ["3.10", "3.11"]
- os: [ubuntu-latest, macOS-latest, windows-latest]
- steps:
- - name: Checkout git repo
- uses: actions/checkout@v4
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install pytest "numpy<2.0" scipy numba pandas scikit-learn
- - name: Test CCC with pytest
- env:
- PYTHONPATH: libs/
- run: |
- pytest tests/test_coef.py tests/test_pytorch_core.py tests/test_scipy_stats.py tests/test_sklearn_metrics.py
-
-# pytest:
-# name: Python tests for analyses
-# runs-on: ${{ matrix.os }}
-# strategy:
-# max-parallel: 4
-# fail-fast: false
-# matrix:
-# python-version: ["3.9"]
-# os: [ubuntu-latest, macOS-latest, windows-latest]
-# steps:
-# - name: Checkout git repo
-# uses: actions/checkout@v3
-# - name: Cache conda
-# id: cache
-# uses: actions/cache@v3
-# with:
-# path: "${{ env.PY_ENV }}.tar.gz"
-# key: ${{ runner.os }}-${{ env.PY_CACHE_NUMBER }}-${{ hashFiles('environment/environment.yml', 'environment/scripts/install_r_packages.r', 'environment/scripts/install_other_packages.sh') }}
-# - name: Setup Miniconda
-# if: steps.cache.outputs.cache-hit != 'true'
-# uses: conda-incubator/setup-miniconda@v2
-# with:
-# activate-environment: ${{ env.PY_ENV }}
-# environment-file: environment/environment.yml
-# auto-activate-base: false
-# miniforge-variant: Mambaforge
-# miniforge-version: 'latest'
-# use-mamba: true
-# - name: Install other packages and Conda-Pack environment
-# if: steps.cache.outputs.cache-hit != 'true'
-# shell: bash -l {0}
-# run: |
-# # other packages (R packages mainly)
-# bash environment/scripts/install_other_packages.sh
-#
-# # install conda-pack, and pack environment
-# conda install --yes -c conda-forge conda-pack coverage
-# conda pack -f -n ${{ env.PY_ENV }} -o "${{ env.PY_ENV }}.tar.gz"
-# - name: Unpack environment
-# shell: bash -l {0}
-# run: |
-# mkdir -p "${{ env.PY_ENV }}"
-# tar -xzf "${{ env.PY_ENV }}.tar.gz" -C "${{ env.PY_ENV }}"
-# - name: Setup data and run pytest (Windows systems)
-# if: runner.os == 'Windows'
-# env:
-# PYTHONPATH: libs/
-# shell: cmd
-# run: |
-# echo on
-# cd ${{ env.PY_ENV }}
-# call .\Scripts\activate.bat
-# .\Scripts\conda-unpack.exe
-# cd ..
-# set R_HOME=%CONDA_PREFIX%\Lib\R
-# python environment\scripts\setup_data.py --mode testing
-# pytest -v -rs tests
-# - name: Setup data and run pytest (non-Windows systems)
-# if: runner.os != 'Windows'
-# shell: bash
-# env:
-# PYTHONPATH: libs/
-# run: |
-# source ${{ env.PY_ENV }}/bin/activate
-# conda-unpack
-#
-# python environment/scripts/setup_data.py --mode testing
-#
-# if [ "$RUNNER_OS" == "Linux" ]; then
-# # for linux/ubuntu, run the tests once: with numba jit activated
-# # (which is the expected implementation) and with the jit
-# # deactivated (otherwise coverage does not work).
-#
-# # numba jit activated
-# pytest -v -rs tests
-#
-# # numba jit deactivated + code coverage
-# export NUMBA_DISABLE_JIT=1
-# coverage run --source=libs/ -m pytest -v -rs tests
-# coverage xml -o coverage.xml
-# else
-# pytest -v -rs tests
-# fi
-# - name: Codecov upload
-# if: runner.os == 'Linux'
-# uses: codecov/codecov-action@v2
-# with:
-# files: ./coverage.xml
-# name: codecov-${{ matrix.os }}-python${{ matrix.python-version }}
-# fail_ci_if_error: true
-# verbose: true
diff --git a/README.md b/README.md
index 14285186..c5a12bb7 100644
--- a/README.md
+++ b/README.md
@@ -73,17 +73,13 @@ cd ccc-gpu
#### 2. Setup Environment with conda-lock
-This process uses a temporary environment to manage the conda-lock installation, keeping your base environment clean:
+This process uses [pipx](https://pipx.pypa.io/stable/) to install conda-lock in an isolated environment, keeping your base environment clean:
> **Why conda-lock?** We use conda-lock to ensure **reproducible installations** across different systems. Unlike regular `environment.yml` files, conda-lock provides exact version pins for all packages and their dependencies, preventing version conflicts and ensuring you get the same environment that was tested during development.
```bash
-# Create temporary environment for conda-lock
-conda create -n ccc-gpu-setup python=3.10 -y # or: mamba create -n ccc-gpu-setup python=3.10 -y
-conda activate ccc-gpu-setup
-
-# Install conda-lock in temporary environment
-conda install --channel=conda-forge conda-lock -y # or: mamba install --channel=conda-forge conda-lock -y
+# Install conda-lock using pipx (installs in isolated environment)
+pipx install conda-lock
# Create the main ccc-gpu environment from lock file
conda-lock install --name ccc-gpu conda-lock.yml # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba
@@ -95,32 +91,6 @@ conda activate ccc-gpu
pip install .
```
-#### 3. Optional: Clean up temporary environment
-
-Once installation is complete, you can optionally remove the temporary setup environment:
-
-```bash
-# Remove temporary environment (optional)
-conda deactivate # Make sure you're not in ccc-gpu-setup
-conda remove -n ccc-gpu-setup --all -y # or: mamba remove -n ccc-gpu-setup --all -y
-```
-
-#### Alternative: Install conda-lock in base environment
-
-If you prefer to install conda-lock directly in your base environment:
-
-```bash
-# Option 1: Using pip
-pip install conda-lock
-
-# Option 2: Using conda
-conda install --channel=conda-forge conda-lock -y # or: mamba install --channel=conda-forge conda-lock -y
-
-# Then create environment directly
-conda-lock install --name ccc-gpu conda-lock.yml # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba
-conda activate ccc-gpu
-pip install .
-```
> **Note**: If you prefer to use Mamba for faster package resolution, you can install MiniForge which includes Mamba:
> ```bash
@@ -139,6 +109,10 @@ bash ./scripts/run_tests.sh python
```
## Usage
+### End-to-End Tutorial
+
+This [notebook](nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb) provides an end-to-end tutorial that walks through a simplified version of the analysis steps used in our paper, using the GTEx v8 data.
+
### Basic Usage
@@ -161,35 +135,16 @@ correlation = ccc(x, y)
print(f"CCC coefficient: {correlation:.3f}")
```
-### Controlling Debug Logging
-
-By default, CCC-GPU runs silently without debug output. You can enable detailed logging (including CUDA device information, memory usage, and processing details) using the `CCC_GPU_LOGGING` environment variable:
-
-```bash
-# Run with default behavior (no debug output)
-python your_script.py
-
-# Enable debug logging for troubleshooting
-CCC_GPU_LOGGING=1 python your_script.py
-
-# Or set it for the session
-export CCC_GPU_LOGGING=1
-python your_script.py
-```
-
-This is particularly useful for:
-- Debugging GPU memory issues
-- Understanding CUDA device utilization
-- Monitoring batch processing performance
-- Troubleshooting installation problems
-
### Working with Gene Expression Data
CCC-GPU is particularly useful for genomics applications:
```python
import pandas as pd
-from ccc.coef import ccc
+# New CCC-GPU implementation import
+from ccc.coef.impl_gpu import ccc
+# Original CCC implementation import
+# from ccc.coef.impl import ccc
# Load gene expression data
# Assume genes are in columns, samples in rows
@@ -217,6 +172,28 @@ for i, j in zip(top_indices[0], top_indices[1]):
Refer to the original CCC Repository for more usage examples: [https://github.com/greenelab/ccc](https://github.com/greenelab/ccc)
+### Controlling Debug Logging
+
+By default, CCC-GPU runs silently without debug output. You can enable detailed logging (including CUDA device information, memory usage, and processing details) using the `CCC_GPU_LOGGING` environment variable:
+
+```bash
+# Run with default behavior (no debug output)
+python your_script.py
+
+# Enable debug logging for troubleshooting
+CCC_GPU_LOGGING=1 python your_script.py
+
+# Or set it for the session
+export CCC_GPU_LOGGING=1
+python your_script.py
+```
+
+This is particularly useful for:
+- Debugging GPU memory issues
+- Understanding CUDA device utilization
+- Monitoring batch processing performance
+- Troubleshooting installation problems
+
## Performance Benchmarks
CCC-GPU provides significant performance improvements over CPU-only implementations:
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 68596b54..70b1065c 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -65,19 +65,15 @@ Install from source using the provided conda-lock environment:
2. Setup Environment with conda-lock
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-This process uses a temporary environment to manage the conda-lock installation, keeping your base environment clean:
+This process uses pipx to install conda-lock in an isolated environment, keeping your base environment clean:
.. note::
**Why conda-lock?** We use conda-lock to ensure **reproducible installations** across different systems. Unlike regular ``environment.yml`` files, conda-lock provides exact version pins for all packages and their dependencies, preventing version conflicts and ensuring you get the same environment that was tested during development.
.. code-block:: bash
- # Create temporary environment for conda-lock
- conda create -n ccc-gpu-setup python=3.10 -y # or: mamba create -n ccc-gpu-setup python=3.10 -y
- conda activate ccc-gpu-setup
-
- # Install conda-lock in temporary environment
- conda install --channel=conda-forge conda-lock -y # or: mamba install --channel=conda-forge conda-lock -y
+ # Install conda-lock using pipx (installs in isolated environment)
+ pipx install conda-lock
# Create the main ccc-gpu environment from lock file
conda-lock install --name ccc-gpu conda-lock.yml # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba
@@ -88,21 +84,23 @@ This process uses a temporary environment to manage the conda-lock installation,
# Install the package from source
pip install .
-3. Optional: Clean up temporary environment
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. note::
+   If you don't have pipx installed, you can install it with ``pip install pipx`` or follow the `pipx installation guide <https://pipx.pypa.io/stable/installation/>`_.
+
+3. Optional: Remove conda-lock
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Once installation is complete, you can optionally remove the temporary setup environment:
+If you no longer need conda-lock after installation, you can remove it:
.. code-block:: bash
- # Remove temporary environment (optional)
- conda deactivate # Make sure you're not in ccc-gpu-setup
- conda remove -n ccc-gpu-setup --all -y # or: mamba remove -n ccc-gpu-setup --all -y
+ # Remove conda-lock (optional)
+ pipx uninstall conda-lock
Alternative: Install conda-lock in base environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-If you prefer to install conda-lock directly in your base environment:
+If you prefer to install conda-lock directly in your base environment instead of using pipx:
.. code-block:: bash
diff --git a/nbs/03-manuscript/40_prepare_supp_data/README.md b/nbs/03-manuscript/40_prepare_supp_data/README.md
index f972dcbe..6c733d8f 100644
--- a/nbs/03-manuscript/40_prepare_supp_data/README.md
+++ b/nbs/03-manuscript/40_prepare_supp_data/README.md
@@ -1,82 +1,188 @@
-# CCC Data Processing Script
+# CCC Data Processing Scripts
-This directory contains a script to process GTEx similarity matrices (.pkl files) and extract only the CCC (Clustered Correlation Coefficient) data.
+This directory contains scripts to process GTEx similarity matrices (.pkl files) and extract CCC (Clustermatch Correlation Coefficient) data in optimized formats for efficient storage and fast queries.
-## Script: `process_ccc_data.py`
+## Available Scripts
-### Description
-Processes all .pkl files in the source directory, extracts only the 'ccc' column with multi-indices, and saves individual .parquet files with snappy compression for each input. This significantly reduces file sizes compared to .pkl format.
+### 1. `process_ccc_to_duckdb.py` (Recommended)
-### Usage
+**Description:** Converts pickle files to DuckDB format for ultra-fast queries and efficient storage.
+
+#### Key Features
+- **Sub-millisecond query performance** for individual gene pairs
+- **4-5x better compression** than parquet format
+- **Minimal memory usage** (queries use <1GB RAM vs 13GB+ for parquet)
+- **SQL query capabilities** for complex analyses
+- **Support for both individual and consolidated databases**
+
+#### Usage
-#### Dry Run (recommended first step)
```bash
# Activate the conda environment
conda activate ccc-gpu
-# Run dry run to see what files would be processed
-python process_ccc_data.py --dry-run
+# Install DuckDB (if not already installed)
+pip install duckdb
+
+# Process all tissues into individual databases
+python process_ccc_to_duckdb.py \
+ --source-dir /mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all \
+ --output-dir /mnt/data/proj_data/ccc-gpu/manuscript_data/supplementary_data/ccc_duckdb
+
+# Create a single consolidated database (all tissues)
+python process_ccc_to_duckdb.py --single-db
+
+# Process specific tissues only
+python process_ccc_to_duckdb.py --tissues bladder brain_cortex
+
+# Dry run to see what would be processed
+python process_ccc_to_duckdb.py --dry-run
```
-#### Full Processing
+#### Arguments
+- `--source-dir`: Source directory with .pkl files
+- `--output-dir`: Output directory for DuckDB files
+- `--single-db`: Create one consolidated database instead of individual ones
+- `--tissues`: Process specific tissues only
+- `--dry-run`: Show what would be processed without doing it
+- `--debug`: Enable debug logging
+
+### 2. `ccc_duckdb_query.py` - Query Interface
+
+**Description:** Python wrapper for fast queries on DuckDB databases.
+
+#### Usage as Module
+
+```python
+from ccc_duckdb_query import CCCDatabase
+
+# Open database
+db = CCCDatabase("/path/to/bladder_ccc.duckdb")
+
+# Query single gene pair
+ccc = db.get_correlation("ENSG00000141510.16", "ENSG00000133703.11")
+
+# Get all correlations for a gene
+correlations = db.get_gene_correlations("ENSG00000141510.16", min_ccc=0.5)
+
+# Get top correlations
+top_pairs = db.get_top_correlations(threshold=0.9, limit=100)
+
+# Batch query multiple pairs
+pairs = [("gene1", "gene2"), ("gene3", "gene4")]
+results = db.get_batch_correlations(pairs)
+
+# Custom SQL query
+df = db.query("SELECT * FROM ccc_data WHERE ccc > 0.95 LIMIT 10")
+
+# Get database statistics
+stats = db.get_statistics()
+
+db.close()
+```
+
+#### Usage as CLI
+
+```bash
+# Get database statistics
+python ccc_duckdb_query.py /path/to/database.duckdb --stats
+
+# Query specific gene pair
+python ccc_duckdb_query.py /path/to/database.duckdb \
+ --gene1 ENSG00000141510.16 --gene2 ENSG00000133703.11
+
+# Get correlations for a gene
+python ccc_duckdb_query.py /path/to/database.duckdb \
+ --gene ENSG00000141510.16 --limit 50
+
+# Get top correlations
+python ccc_duckdb_query.py /path/to/database.duckdb \
+ --top 0.9 --limit 100
+```
+
+### 3. `process_ccc_data.py` (Legacy - Parquet Output)
+
+**Description:** Original script that creates parquet files. Kept for compatibility but DuckDB format is recommended.
+
```bash
# Run with default paths
python process_ccc_data.py
-# Run with custom paths
+# Custom paths
python process_ccc_data.py --source-dir /path/to/source --output-dir /path/to/output
```
-### Arguments
-- `--source-dir`: Source directory containing .pkl files (default: `/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all`)
-- `--output-dir`: Output directory for processed parquet files (default: `/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet`)
-- `--dry-run`: Show what would be processed without actually doing it
-- `--debug`: Enable debug logging (shows detailed processing information)
-
-### Requirements
-- `pandas`: For reading .pkl files and writing .parquet files
-- `pyarrow`: Required for parquet format support with snappy compression
-- `tqdm`: For progress bars (optional, script will work without it)
-
-### Logging
-The script automatically creates detailed logs in the `logs/` directory with timestamps:
-- **Log location**: `logs/process_ccc_data_YYYYMMDD_HHMMSS.log`
-- **Log levels**: INFO (default) and DEBUG (with `--debug` flag)
-- **Log content**: Processing progress, file details, errors, timing information, and archive sizes
-- **Console output**: Key information is also printed to console for real-time monitoring
-
-Example log entries:
+## Performance Comparison
+
+| Metric | Parquet | DuckDB | Improvement |
+|--------|---------|---------|-------------|
+| Storage Size | 302 GB | ~60-80 GB | 4-5x smaller |
+| Load Time | >60s timeout | 0s (no loading) | Instant access |
+| Single Query | >100ms | <1ms | 100x+ faster |
+| Memory Usage | 13+ GB | <1 GB | 13x+ less |
+| Random Access | Very slow | Sub-millisecond | Orders of magnitude |
+
+## Requirements
+
+```bash
+# Core requirements
+conda activate ccc-gpu
+pip install pandas duckdb numpy tqdm
+
+# Optional for parquet support
+pip install pyarrow
+```
+
+## Output Structure
+
+### DuckDB Format (Recommended)
```
-2024-01-15 10:30:15,123 - INFO - Starting CCC data processing
-2024-01-15 10:30:15,124 - INFO - Found 54 .pkl files to process
-2024-01-15 10:30:16,200 - INFO - Processing file: gtex_v8_data_whole_blood-var_pc_log2-all.pkl
-2024-01-15 10:35:22,456 - INFO - Successfully processed gtex_v8_data_whole_blood-var_pc_log2-all.pkl
+/output_directory/
+├── bladder_ccc.duckdb # Individual tissue databases
+├── brain_cortex_ccc.duckdb
+├── whole_blood_ccc.duckdb
+└── all_tissues_ccc.duckdb # Optional consolidated database
```
-### Output
-The script will create individual `.parquet` files for each source file containing only CCC data with multi-indices preserved. Parquet format with snappy compression provides significant space savings compared to .pkl files while maintaining fast read/write performance.
+### Database Schema
+```sql
+-- Individual tissue table
+CREATE TABLE ccc_data (
+ gene1 VARCHAR NOT NULL,
+ gene2 VARCHAR NOT NULL,
+ ccc REAL NOT NULL,
+ PRIMARY KEY (gene1, gene2)
+);
+
+-- Indexes for fast lookups
+CREATE INDEX idx_gene2 ON ccc_data(gene2);
+CREATE INDEX idx_ccc ON ccc_data(ccc);
+```
-### File Naming Convention
-Input: `gtex_v8_data_whole_blood-var_pc_log2-all.pkl`
-Output: `gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet`
+## Example Workflow
-### Example
```bash
-# Activate environment and run
+# 1. Convert all pickle files to DuckDB
conda activate ccc-gpu
-python process_ccc_data.py
+python process_ccc_to_duckdb.py
+
+# 2. Test query performance
+python ccc_duckdb_query.py /path/to/bladder_ccc.duckdb --stats
+
+```
+Then use the database from your Python scripts (step 3):
+```python
+from ccc_duckdb_query import CCCDatabase
+with CCCDatabase("bladder_ccc.duckdb") as db:
+    ccc = db.get_correlation("gene1", "gene2")  # fast queries for your analysis
+```
+
+## Advantages of DuckDB Format
-# Run with debug logging for more detailed information
-python process_ccc_data.py --debug
-
-# Expected output structure:
-# nbs/03-manuscript/40_prepare_supp_data/
-# └── logs/
-# └── process_ccc_data_YYYYMMDD_HHMMSS.log
-#
-# /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/
-# ├── gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet
-# ├── gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet
-# ├── gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet
-# └── ... (54 individual .parquet files total)
-```
\ No newline at end of file
+1. **No Loading Required**: Direct queries without loading entire dataset
+2. **Memory Efficient**: Uses memory-mapped IO, minimal RAM footprint
+3. **Fast Random Access**: Indexed lookups in microseconds
+4. **SQL Support**: Complex queries and aggregations
+5. **Better Compression**: Columnar storage with efficient encoding
+6. **Concurrent Access**: Multiple readers can query simultaneously
+7. **ACID Compliance**: Data integrity guarantees
\ No newline at end of file
diff --git a/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py b/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py
new file mode 100755
index 00000000..38938650
--- /dev/null
+++ b/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py
@@ -0,0 +1,414 @@
+#!/usr/bin/env python3
+"""
+Python wrapper for fast CCC correlation queries from DuckDB databases.
+
+This module provides a simple interface for querying gene pair correlations
+from the DuckDB databases created by process_ccc_to_duckdb.py.
+
+Example usage:
+ from ccc_duckdb_query import CCCDatabase
+
+ # Single tissue database
+ db = CCCDatabase("/path/to/bladder_ccc.duckdb")
+
+ # Query single gene pair
+ ccc_value = db.get_correlation("ENSG00000141510.16", "ENSG00000133703.11")
+
+ # Get all correlations for a gene
+ correlations = db.get_gene_correlations("ENSG00000141510.16")
+
+ # Get top correlations
+ top_pairs = db.get_top_correlations(threshold=0.9, limit=100)
+
+ # Batch query multiple pairs
+ pairs = [("gene1", "gene2"), ("gene3", "gene4")]
+ results = db.get_batch_correlations(pairs)
+"""
+
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional, Union
+import pandas as pd
+import duckdb
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class CCCDatabase:
+    """Read-only wrapper for querying CCC correlation data from DuckDB databases."""
+
+    def __init__(self, db_path: Union[str, Path], tissue: Optional[str] = None):
+        """
+        Open a read-only connection and detect the database layout.
+
+        Args:
+            db_path: Path to DuckDB database file
+            tissue: Tissue name (for consolidated database)
+
+        Raises:
+            FileNotFoundError: If db_path does not exist
+            ValueError: If tissue is not present in a consolidated database
+        """
+        self.db_path = Path(db_path)
+        if not self.db_path.exists():
+            raise FileNotFoundError(f"Database not found: {db_path}")
+
+        self.con = duckdb.connect(str(self.db_path), read_only=True)
+        self.tissue = tissue
+
+        # A "tissues" table marks a consolidated database; otherwise single tissue
+        table_names = [t[0] for t in self.con.execute("SHOW TABLES").fetchall()]
+        if "tissues" in table_names:
+            self.db_type = "consolidated"
+            self.tissues = self._get_available_tissues()
+            if tissue and tissue not in self.tissues:
+                self.con.close()  # do not leak the connection on a bad tissue
+                raise ValueError(f"Tissue '{tissue}' not found. Available: {self.tissues}")
+            # table_name stays None until a tissue is chosen (query methods reject that)
+            self.table_name = f"ccc_{tissue}" if tissue else None
+            if not tissue:
+                logger.info(f"Consolidated database with {len(self.tissues)} tissues")
+                logger.info(f"Available tissues: {', '.join(self.tissues[:5])}...")
+        else:
+            self.db_type = "single"
+            self.table_name = "ccc_data"
+            self.tissues = None
+
+    def _get_available_tissues(self) -> List[str]:
+        """Return the tissue names listed in the `tissues` table, sorted by name."""
+        result = self.con.execute("SELECT tissue_name FROM tissues ORDER BY tissue_name").fetchall()
+        return [r[0] for r in result]
+
+    def get_correlation(self, gene1: str, gene2: str) -> Optional[float]:
+        """
+        Get CCC correlation for a specific gene pair (either stored orientation).
+
+        Args:
+            gene1: First gene ID
+            gene2: Second gene ID
+
+        Returns:
+            CCC correlation value or None if not found
+        """
+        if self.tissue is None and self.db_type == "consolidated":
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        # Try both orientations since correlation is symmetric
+        query = f"""
+            SELECT ccc FROM {self.table_name}
+            WHERE (gene1 = ? AND gene2 = ?)
+               OR (gene1 = ? AND gene2 = ?)
+            LIMIT 1
+        """
+
+        result = self.con.execute(query, [gene1, gene2, gene2, gene1]).fetchone()
+        return result[0] if result else None
+
+    def get_gene_correlations(
+        self,
+        gene: str,
+        min_ccc: Optional[float] = None,
+        limit: Optional[int] = None
+    ) -> pd.DataFrame:
+        """
+        Get all correlations for a specific gene, highest CCC first.
+
+        Args:
+            gene: Gene ID
+            min_ccc: Minimum CCC threshold (optional)
+            limit: Maximum number of results (optional)
+
+        Returns:
+            DataFrame with columns: gene_pair (the partner gene of `gene`), ccc
+        """
+        if self.tissue is None and self.db_type == "consolidated":
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        params = [gene, gene, gene]  # placeholders are bound in SQL text order
+        where_clause = limit_clause = ""
+        if min_ccc is not None:
+            where_clause = "AND ccc >= ?"
+            params.append(min_ccc)
+        if limit is not None:
+            limit_clause = "LIMIT ?"
+            params.append(limit)
+        query = f"""
+            SELECT
+                CASE
+                    WHEN gene1 = ? THEN gene2
+                    ELSE gene1
+                END as gene_pair,
+                ccc
+            FROM {self.table_name}
+            WHERE (gene1 = ? OR gene2 = ?)
+            {where_clause}
+            ORDER BY ccc DESC
+            {limit_clause}
+        """
+
+        result = self.con.execute(query, params).df()
+        return result
+
+    def get_top_correlations(
+        self,
+        threshold: float = 0.9,
+        limit: int = 100
+    ) -> pd.DataFrame:
+        """
+        Get top correlations at or above a threshold, highest CCC first.
+
+        Args:
+            threshold: Minimum CCC value
+            limit: Maximum number of results
+
+        Returns:
+            DataFrame with columns: gene1, gene2, ccc
+        """
+        if self.tissue is None and self.db_type == "consolidated":
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        query = f"""
+            SELECT gene1, gene2, ccc
+            FROM {self.table_name}
+            WHERE ccc >= ?
+            ORDER BY ccc DESC
+            LIMIT ?
+        """
+
+        result = self.con.execute(query, [threshold, limit]).df()
+        return result
+
+    def get_batch_correlations(
+        self,
+        pairs: List[Tuple[str, str]]
+    ) -> Dict[Tuple[str, str], Optional[float]]:
+        """
+        Get correlations for multiple gene pairs in one set-based lookup.
+
+        Args:
+            pairs: List of (gene1, gene2) tuples
+
+        Returns:
+            Dictionary mapping each requested (gene1, gene2) to its CCC value, or None if absent
+        """
+        if self.tissue is None and self.db_type == "consolidated":
+            raise ValueError("Tissue must be specified for consolidated database")
+
+        if not pairs:
+            return {}
+
+        # Stage the requested pairs; OR REPLACE recovers from a leftover table
+        self.con.execute("CREATE OR REPLACE TEMPORARY TABLE query_pairs (gene1 VARCHAR, gene2 VARCHAR)")
+
+        try:
+            # Insert pairs
+            self.con.executemany("INSERT INTO query_pairs VALUES (?, ?)", [list(p) for p in pairs])
+
+            # Match each requested pair against both stored orientations, using
+            # two equi-joins combined with UNION ALL instead of a single OR-join
+            query = f"""
+                SELECT qp.gene1, qp.gene2, c.ccc
+                FROM query_pairs qp
+                JOIN {self.table_name} c
+                    ON qp.gene1 = c.gene1 AND qp.gene2 = c.gene2
+                UNION ALL
+                SELECT qp.gene1, qp.gene2, c.ccc
+                FROM query_pairs qp
+                JOIN {self.table_name} c
+                    ON qp.gene1 = c.gene2 AND qp.gene2 = c.gene1
+            """
+
+            results = self.con.execute(query).fetchall()
+        finally:
+            # Always drop the temporary table so the next call can recreate it
+            self.con.execute("DROP TABLE IF EXISTS query_pairs")
+
+        # Convert to dictionary keyed by the pair exactly as it was requested
+        result_dict = {}
+        for row in results:
+            result_dict[(row[0], row[1])] = row[2]
+
+        # Add None for missing pairs
+        for pair in pairs:
+            if tuple(pair) not in result_dict:
+                result_dict[tuple(pair)] = None
+
+        return result_dict
+
+    def get_cross_tissue_correlation(
+        self,
+        gene1: str,
+        gene2: str
+    ) -> pd.DataFrame:
+        """
+        Get correlation values across all tissues (consolidated database only).
+
+        Args:
+            gene1: First gene ID
+            gene2: Second gene ID
+
+        Returns:
+            DataFrame with columns: tissue, ccc
+        """
+        if self.db_type != "consolidated":
+            raise ValueError("Cross-tissue query requires consolidated database")
+
+        query = """
+            SELECT tissue, ccc
+            FROM all_correlations
+            WHERE (gene1 = ? AND gene2 = ?)
+               OR (gene1 = ? AND gene2 = ?)
+            ORDER BY ccc DESC
+        """
+
+        result = self.con.execute(query, [gene1, gene2, gene2, gene1]).df()
+        return result
+
+    def query(self, sql: str, parameters: Optional[List] = None) -> pd.DataFrame:
+        """
+        Execute custom SQL query on the database.
+
+        Args:
+            sql: SQL query string
+            parameters: Query parameters (optional)
+
+        Returns:
+            Query results as DataFrame
+        """
+        if parameters:
+            return self.con.execute(sql, parameters).df()
+        else:
+            return self.con.execute(sql).df()
+
+    def get_statistics(self) -> Dict:
+        """
+        Get database statistics.
+
+        Returns:
+            Dictionary with database statistics
+        """
+        stats = {}
+
+        if self.db_type == "consolidated":
+            # Aggregate the per-tissue summary rows stored in the tissues table
+            tissue_stats = self.con.execute("""
+                SELECT
+                    COUNT(*) as num_tissues,
+                    SUM(num_pairs) as total_pairs,
+                    MIN(min_ccc) as global_min_ccc,
+                    MAX(max_ccc) as global_max_ccc,
+                    AVG(mean_ccc) as avg_mean_ccc
+                FROM tissues
+            """).fetchone()
+
+            stats['type'] = 'consolidated'
+            stats['num_tissues'] = tissue_stats[0]
+            stats['total_pairs'] = tissue_stats[1]
+            stats['global_min_ccc'] = tissue_stats[2]
+            stats['global_max_ccc'] = tissue_stats[3]
+            stats['avg_mean_ccc'] = tissue_stats[4]
+
+            if self.tissue:
+                # Get specific tissue stats
+                tissue_info = self.con.execute("""
+                    SELECT num_pairs, min_ccc, max_ccc, mean_ccc
+                    FROM tissues
+                    WHERE tissue_name = ?
+                """, [self.tissue]).fetchone()
+
+                if tissue_info:
+                    stats['tissue'] = self.tissue
+                    stats['tissue_pairs'] = tissue_info[0]
+                    stats['tissue_min_ccc'] = tissue_info[1]
+                    stats['tissue_max_ccc'] = tissue_info[2]
+                    stats['tissue_mean_ccc'] = tissue_info[3]
+
+        else:
+            # Single tissue database: compute the summary from the data table
+            result = self.con.execute(f"""
+                SELECT
+                    COUNT(*) as num_pairs,
+                    MIN(ccc) as min_ccc,
+                    MAX(ccc) as max_ccc,
+                    AVG(ccc) as mean_ccc
+                FROM {self.table_name}
+            """).fetchone()
+
+            stats['type'] = 'single'
+            stats['num_pairs'] = result[0]
+            stats['min_ccc'] = result[1]
+            stats['max_ccc'] = result[2]
+            stats['mean_ccc'] = result[3]
+
+        # Database file size
+        stats['database_size_gb'] = self.db_path.stat().st_size / (1024**3)
+
+        return stats
+
+    def close(self):
+        """Close database connection."""
+        self.con.close()
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.close()
+
+
+def main():
+    """Simple CLI: run one query (--stats, --gene1/--gene2, --gene or --top) and print it."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Query CCC correlation database")
+    parser.add_argument("database", help="Path to DuckDB database")
+    parser.add_argument("--tissue", help="Tissue name (for consolidated database)")
+    parser.add_argument("--gene1", help="First gene ID")
+    parser.add_argument("--gene2", help="Second gene ID")
+    parser.add_argument("--gene", help="Get all correlations for this gene")
+    parser.add_argument("--top", type=float, help="Get top correlations above threshold")
+    parser.add_argument("--limit", type=int, default=100, help="Limit number of results")
+    parser.add_argument("--stats", action="store_true", help="Show database statistics")
+
+    args = parser.parse_args()
+
+    # Initialize database
+    with CCCDatabase(args.database, tissue=args.tissue) as db:
+        # Exactly one query option is handled, in the priority order below
+        if args.stats:
+            stats = db.get_statistics()
+            print("\nDatabase Statistics:")
+            for key, value in stats.items():
+                if isinstance(value, float):
+                    print(f"  {key}: {value:.4f}")
+                else:
+                    print(f"  {key}: {value}")
+
+        elif args.gene1 and args.gene2:
+            # Query specific pair
+            ccc = db.get_correlation(args.gene1, args.gene2)
+            if ccc is not None:
+                print(f"CCC({args.gene1}, {args.gene2}) = {ccc:.6f}")
+            else:
+                print(f"No correlation found for pair ({args.gene1}, {args.gene2})")
+
+        elif args.gene:
+            # Get all correlations for gene
+            results = db.get_gene_correlations(args.gene, limit=args.limit)
+            print(f"\nTop {len(results)} correlations for {args.gene}:")
+            print(results.to_string())
+
+        elif args.top is not None:
+            # Compare against None so that a threshold of 0 (falsy) is still honoured
+            results = db.get_top_correlations(threshold=args.top, limit=args.limit)
+            print(f"\nTop {len(results)} correlations above {args.top}:")
+            print(results.to_string())
+
+        else:
+            print("Please specify a query option (--gene1/--gene2, --gene, --top, or --stats)")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log b/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
deleted file mode 100644
index 0508e68b..00000000
--- a/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
+++ /dev/null
@@ -1,392 +0,0 @@
-2025-09-11 22:50:04,101 - INFO - Starting CCC data processing
-2025-09-11 22:50:04,101 - INFO - Log file: /home/haoyu/_database/projs/ccc-gpu/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
-2025-09-11 22:50:04,101 - DEBUG - Debug logging enabled
-2025-09-11 22:50:04,101 - INFO - Script arguments: source_dir=/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all, output_dir=/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet, dry_run=False
-2025-09-11 22:50:04,101 - INFO - Scanning directory for .pkl files: /mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all
-2025-09-11 22:50:04,102 - INFO - Found 54 .pkl files to process
-2025-09-11 22:50:04,102 - DEBUG - First few files: ['gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl', 'gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl', 'gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl', 'gtex_v8_data_artery_aorta-var_pc_log2-all.pkl', 'gtex_v8_data_artery_coronary-var_pc_log2-all.pkl']
-2025-09-11 22:50:04,102 - INFO - Output directory created/verified: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet
-2025-09-11 22:50:04,102 - INFO - Starting processing of 54 files
-2025-09-11 22:50:04,104 - DEBUG - Processing file 1/54: gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl
-2025-09-11 22:50:04,104 - INFO - Processing file: gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl
-2025-09-11 22:50:38,091 - DEBUG - Loaded data shape: (1460025703, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 22:50:40,159 - DEBUG - Extracted CCC data shape: (1460025703, 1)
-2025-09-11 22:50:40,159 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:54:34,827 - INFO - Successfully processed gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl -> gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:54:34,827 - INFO - Size reduction: 19.04 GB -> 6.55 GB (65.6% smaller)
-2025-09-11 22:54:36,104 - DEBUG - Processing file 2/54: gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl
-2025-09-11 22:54:36,105 - INFO - Processing file: gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl
-2025-09-11 22:55:12,030 - DEBUG - Loaded data shape: (1440046611, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 22:55:14,077 - DEBUG - Extracted CCC data shape: (1440046611, 1)
-2025-09-11 22:55:14,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:59:03,657 - INFO - Successfully processed gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl -> gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 22:59:03,657 - INFO - Size reduction: 18.78 GB -> 6.35 GB (66.2% smaller)
-2025-09-11 22:59:04,942 - DEBUG - Processing file 3/54: gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl
-2025-09-11 22:59:04,942 - INFO - Processing file: gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl
-2025-09-11 22:59:38,946 - DEBUG - Loaded data shape: (1358012670, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 22:59:40,880 - DEBUG - Extracted CCC data shape: (1358012670, 1)
-2025-09-11 22:59:40,880 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:03:24,150 - INFO - Successfully processed gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl -> gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:03:24,150 - INFO - Size reduction: 17.71 GB -> 5.59 GB (68.4% smaller)
-2025-09-11 23:03:25,363 - DEBUG - Processing file 4/54: gtex_v8_data_artery_aorta-var_pc_log2-all.pkl
-2025-09-11 23:03:25,363 - INFO - Processing file: gtex_v8_data_artery_aorta-var_pc_log2-all.pkl
-2025-09-11 23:04:00,118 - DEBUG - Loaded data shape: (1419832116, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:04:02,111 - DEBUG - Extracted CCC data shape: (1419832116, 1)
-2025-09-11 23:04:02,112 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_aorta-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:07:50,792 - INFO - Successfully processed gtex_v8_data_artery_aorta-var_pc_log2-all.pkl -> gtex_v8_data_artery_aorta-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:07:50,793 - INFO - Size reduction: 18.51 GB -> 6.18 GB (66.6% smaller)
-2025-09-11 23:07:52,114 - DEBUG - Processing file 5/54: gtex_v8_data_artery_coronary-var_pc_log2-all.pkl
-2025-09-11 23:07:52,114 - INFO - Processing file: gtex_v8_data_artery_coronary-var_pc_log2-all.pkl
-2025-09-11 23:08:31,245 - DEBUG - Loaded data shape: (1373430255, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:08:33,214 - DEBUG - Extracted CCC data shape: (1373430255, 1)
-2025-09-11 23:08:33,214 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_coronary-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:12:11,613 - INFO - Successfully processed gtex_v8_data_artery_coronary-var_pc_log2-all.pkl -> gtex_v8_data_artery_coronary-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:12:11,613 - INFO - Size reduction: 17.91 GB -> 5.74 GB (67.9% smaller)
-2025-09-11 23:12:12,970 - DEBUG - Processing file 6/54: gtex_v8_data_artery_tibial-var_pc_log2-all.pkl
-2025-09-11 23:12:12,970 - INFO - Processing file: gtex_v8_data_artery_tibial-var_pc_log2-all.pkl
-2025-09-11 23:12:48,104 - DEBUG - Loaded data shape: (1454033701, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:12:50,137 - DEBUG - Extracted CCC data shape: (1454033701, 1)
-2025-09-11 23:12:50,137 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:16:47,353 - INFO - Successfully processed gtex_v8_data_artery_tibial-var_pc_log2-all.pkl -> gtex_v8_data_artery_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:16:47,353 - INFO - Size reduction: 18.96 GB -> 6.49 GB (65.8% smaller)
-2025-09-11 23:16:48,739 - DEBUG - Processing file 7/54: gtex_v8_data_bladder-var_pc_log2-all.pkl
-2025-09-11 23:16:48,739 - INFO - Processing file: gtex_v8_data_bladder-var_pc_log2-all.pkl
-2025-09-11 23:17:16,930 - DEBUG - Loaded data shape: (995271420, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:17:18,340 - DEBUG - Extracted CCC data shape: (995271420, 1)
-2025-09-11 23:17:18,340 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_bladder-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:19:53,633 - INFO - Successfully processed gtex_v8_data_bladder-var_pc_log2-all.pkl -> gtex_v8_data_bladder-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:19:53,633 - INFO - Size reduction: 12.98 GB -> 3.37 GB (74.0% smaller)
-2025-09-11 23:19:54,610 - DEBUG - Processing file 8/54: gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl
-2025-09-11 23:19:54,611 - INFO - Processing file: gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl
-2025-09-11 23:20:26,794 - DEBUG - Loaded data shape: (1313153128, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:20:28,666 - DEBUG - Extracted CCC data shape: (1313153128, 1)
-2025-09-11 23:20:28,666 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_amygdala-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:24:01,118 - INFO - Successfully processed gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl -> gtex_v8_data_brain_amygdala-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:24:01,118 - INFO - Size reduction: 17.12 GB -> 5.14 GB (70.0% smaller)
-2025-09-11 23:24:02,286 - DEBUG - Processing file 9/54: gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl
-2025-09-11 23:24:02,286 - INFO - Processing file: gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl
-2025-09-11 23:24:40,076 - DEBUG - Loaded data shape: (1345637503, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:24:41,958 - DEBUG - Extracted CCC data shape: (1345637503, 1)
-2025-09-11 23:24:41,958 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:28:18,530 - INFO - Successfully processed gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl -> gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:28:18,530 - INFO - Size reduction: 17.55 GB -> 5.44 GB (69.0% smaller)
-2025-09-11 23:28:19,802 - DEBUG - Processing file 10/54: gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl
-2025-09-11 23:28:19,802 - INFO - Processing file: gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl
-2025-09-11 23:28:52,599 - DEBUG - Loaded data shape: (1377836265, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:28:54,575 - DEBUG - Extracted CCC data shape: (1377836265, 1)
-2025-09-11 23:28:54,576 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:32:34,520 - INFO - Successfully processed gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:32:34,520 - INFO - Size reduction: 17.97 GB -> 5.74 GB (68.0% smaller)
-2025-09-11 23:32:35,848 - DEBUG - Processing file 11/54: gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl
-2025-09-11 23:32:35,848 - INFO - Processing file: gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl
-2025-09-11 23:33:09,984 - DEBUG - Loaded data shape: (1357283151, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:33:11,931 - DEBUG - Extracted CCC data shape: (1357283151, 1)
-2025-09-11 23:33:11,931 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:36:52,280 - INFO - Successfully processed gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl -> gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:36:52,280 - INFO - Size reduction: 17.70 GB -> 5.58 GB (68.4% smaller)
-2025-09-11 23:36:53,574 - DEBUG - Processing file 12/54: gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl
-2025-09-11 23:36:53,574 - INFO - Processing file: gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl
-2025-09-11 23:37:32,215 - DEBUG - Loaded data shape: (1373692320, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:37:34,080 - DEBUG - Extracted CCC data shape: (1373692320, 1)
-2025-09-11 23:37:34,080 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cerebellum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:41:32,142 - INFO - Successfully processed gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl -> gtex_v8_data_brain_cerebellum-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:41:32,142 - INFO - Size reduction: 17.91 GB -> 5.75 GB (67.9% smaller)
-2025-09-11 23:41:33,450 - DEBUG - Processing file 13/54: gtex_v8_data_brain_cortex-var_pc_log2-all.pkl
-2025-09-11 23:41:33,450 - INFO - Processing file: gtex_v8_data_brain_cortex-var_pc_log2-all.pkl
-2025-09-11 23:42:24,751 - DEBUG - Loaded data shape: (1428371076, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:42:26,765 - DEBUG - Extracted CCC data shape: (1428371076, 1)
-2025-09-11 23:42:26,765 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:46:22,020 - INFO - Successfully processed gtex_v8_data_brain_cortex-var_pc_log2-all.pkl -> gtex_v8_data_brain_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:46:22,020 - INFO - Size reduction: 18.63 GB -> 6.26 GB (66.4% smaller)
-2025-09-11 23:46:23,372 - DEBUG - Processing file 14/54: gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl
-2025-09-11 23:46:23,372 - INFO - Processing file: gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl
-2025-09-11 23:47:00,136 - DEBUG - Loaded data shape: (1359576585, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:47:01,992 - DEBUG - Extracted CCC data shape: (1359576585, 1)
-2025-09-11 23:47:01,992 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:50:37,217 - INFO - Successfully processed gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl -> gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:50:37,217 - INFO - Size reduction: 17.73 GB -> 5.59 GB (68.5% smaller)
-2025-09-11 23:50:38,531 - DEBUG - Processing file 15/54: gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl
-2025-09-11 23:50:38,531 - INFO - Processing file: gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl
-2025-09-11 23:52:04,330 - DEBUG - Loaded data shape: (1381565895, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:52:06,311 - DEBUG - Extracted CCC data shape: (1381565895, 1)
-2025-09-11 23:52:06,311 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_hippocampus-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:55:49,825 - INFO - Successfully processed gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl -> gtex_v8_data_brain_hippocampus-var_pc_log2-all_ccc_only.parquet
-2025-09-11 23:55:49,825 - INFO - Size reduction: 18.01 GB -> 5.78 GB (67.9% smaller)
-2025-09-11 23:55:51,065 - DEBUG - Processing file 16/54: gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl
-2025-09-11 23:55:51,065 - INFO - Processing file: gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl
-2025-09-11 23:56:27,935 - DEBUG - Loaded data shape: (1371020430, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-11 23:56:29,884 - DEBUG - Extracted CCC data shape: (1371020430, 1)
-2025-09-11 23:56:29,884 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_hypothalamus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:00:09,533 - INFO - Successfully processed gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl -> gtex_v8_data_brain_hypothalamus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:00:09,533 - INFO - Size reduction: 17.88 GB -> 5.67 GB (68.3% smaller)
-2025-09-12 00:00:10,797 - DEBUG - Processing file 17/54: gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:00:10,797 - INFO - Processing file: gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:00:44,756 - DEBUG - Loaded data shape: (1389198405, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:00:46,719 - DEBUG - Extracted CCC data shape: (1389198405, 1)
-2025-09-12 00:00:46,719 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:05:05,403 - INFO - Successfully processed gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:05:05,403 - INFO - Size reduction: 18.11 GB -> 5.84 GB (67.7% smaller)
-2025-09-12 00:05:06,693 - DEBUG - Processing file 18/54: gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:05:06,693 - INFO - Processing file: gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl
-2025-09-12 00:05:39,317 - DEBUG - Loaded data shape: (1336936195, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:05:41,239 - DEBUG - Extracted CCC data shape: (1336936195, 1)
-2025-09-12 00:05:41,239 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:09:17,133 - INFO - Successfully processed gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:09:17,134 - INFO - Size reduction: 17.43 GB -> 5.35 GB (69.3% smaller)
-2025-09-12 00:09:18,419 - DEBUG - Processing file 19/54: gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl
-2025-09-12 00:09:18,419 - INFO - Processing file: gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl
-2025-09-12 00:09:55,385 - DEBUG - Loaded data shape: (1305886065, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:09:57,241 - DEBUG - Extracted CCC data shape: (1305886065, 1)
-2025-09-12 00:09:57,241 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:13:24,327 - INFO - Successfully processed gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl -> gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:13:24,328 - INFO - Size reduction: 17.03 GB -> 5.07 GB (70.2% smaller)
-2025-09-12 00:13:25,591 - DEBUG - Processing file 20/54: gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl
-2025-09-12 00:13:25,591 - INFO - Processing file: gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl
-2025-09-12 00:13:56,027 - DEBUG - Loaded data shape: (1278940600, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:13:57,877 - DEBUG - Extracted CCC data shape: (1278940600, 1)
-2025-09-12 00:13:57,877 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_substantia_nigra-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:17:25,692 - INFO - Successfully processed gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl -> gtex_v8_data_brain_substantia_nigra-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:17:25,692 - INFO - Size reduction: 16.68 GB -> 4.82 GB (71.1% smaller)
-2025-09-12 00:17:26,928 - DEBUG - Processing file 21/54: gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl
-2025-09-12 00:17:26,928 - INFO - Processing file: gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl
-2025-09-12 00:18:07,581 - DEBUG - Loaded data shape: (1452847560, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:18:09,592 - DEBUG - Extracted CCC data shape: (1452847560, 1)
-2025-09-12 00:18:09,593 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_breast_mammary_tissue-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:22:35,756 - INFO - Successfully processed gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl -> gtex_v8_data_breast_mammary_tissue-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:22:35,756 - INFO - Size reduction: 18.94 GB -> 6.50 GB (65.7% smaller)
-2025-09-12 00:22:37,092 - DEBUG - Processing file 22/54: gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl
-2025-09-12 00:22:37,092 - INFO - Processing file: gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl
-2025-09-12 00:23:16,518 - DEBUG - Loaded data shape: (1401877725, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:23:18,482 - DEBUG - Extracted CCC data shape: (1401877725, 1)
-2025-09-12 00:23:18,482 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:27:42,509 - INFO - Successfully processed gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl -> gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:27:42,510 - INFO - Size reduction: 18.28 GB -> 6.01 GB (67.1% smaller)
-2025-09-12 00:27:43,822 - DEBUG - Processing file 23/54: gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl
-2025-09-12 00:27:43,822 - INFO - Processing file: gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl
-2025-09-12 00:28:16,439 - DEBUG - Loaded data shape: (1338539670, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:28:18,322 - DEBUG - Extracted CCC data shape: (1338539670, 1)
-2025-09-12 00:28:18,322 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:31:50,501 - INFO - Successfully processed gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl -> gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:31:50,501 - INFO - Size reduction: 17.45 GB -> 5.43 GB (68.9% smaller)
-2025-09-12 00:31:51,756 - DEBUG - Processing file 24/54: gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl
-2025-09-12 00:31:51,756 - INFO - Processing file: gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl
-2025-09-12 00:32:16,338 - DEBUG - Loaded data shape: (871468626, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:32:17,565 - DEBUG - Extracted CCC data shape: (871468626, 1)
-2025-09-12 00:32:17,565 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cervix_ectocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:34:32,270 - INFO - Successfully processed gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl -> gtex_v8_data_cervix_ectocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:34:32,270 - INFO - Size reduction: 11.36 GB -> 2.56 GB (77.4% smaller)
-2025-09-12 00:34:33,139 - DEBUG - Processing file 25/54: gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl
-2025-09-12 00:34:33,139 - INFO - Processing file: gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl
-2025-09-12 00:34:57,887 - DEBUG - Loaded data shape: (883533666, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:34:59,115 - DEBUG - Extracted CCC data shape: (883533666, 1)
-2025-09-12 00:34:59,116 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cervix_endocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:37:14,667 - INFO - Successfully processed gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl -> gtex_v8_data_cervix_endocervix-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:37:14,667 - INFO - Size reduction: 11.52 GB -> 2.70 GB (76.6% smaller)
-2025-09-12 00:37:15,482 - DEBUG - Processing file 26/54: gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl
-2025-09-12 00:37:15,482 - INFO - Processing file: gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl
-2025-09-12 00:38:40,776 - DEBUG - Loaded data shape: (1414189153, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:38:42,695 - DEBUG - Extracted CCC data shape: (1414189153, 1)
-2025-09-12 00:38:42,695 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_colon_sigmoid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:42:32,231 - INFO - Successfully processed gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl -> gtex_v8_data_colon_sigmoid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:42:32,231 - INFO - Size reduction: 18.44 GB -> 6.12 GB (66.8% smaller)
-2025-09-12 00:42:33,416 - DEBUG - Processing file 27/54: gtex_v8_data_colon_transverse-var_pc_log2-all.pkl
-2025-09-12 00:42:33,416 - INFO - Processing file: gtex_v8_data_colon_transverse-var_pc_log2-all.pkl
-2025-09-12 00:43:13,557 - DEBUG - Loaded data shape: (1425646503, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:43:15,585 - DEBUG - Extracted CCC data shape: (1425646503, 1)
-2025-09-12 00:43:15,585 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_colon_transverse-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:47:04,970 - INFO - Successfully processed gtex_v8_data_colon_transverse-var_pc_log2-all.pkl -> gtex_v8_data_colon_transverse-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:47:04,971 - INFO - Size reduction: 18.59 GB -> 6.22 GB (66.6% smaller)
-2025-09-12 00:47:06,372 - DEBUG - Processing file 28/54: gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl
-2025-09-12 00:47:06,372 - INFO - Processing file: gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl
-2025-09-12 00:47:40,754 - DEBUG - Loaded data shape: (1407708330, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:47:42,712 - DEBUG - Extracted CCC data shape: (1407708330, 1)
-2025-09-12 00:47:42,712 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:51:30,851 - INFO - Successfully processed gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:51:30,852 - INFO - Size reduction: 18.36 GB -> 6.05 GB (67.0% smaller)
-2025-09-12 00:51:32,216 - DEBUG - Processing file 29/54: gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl
-2025-09-12 00:51:32,217 - INFO - Processing file: gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl
-2025-09-12 00:52:12,189 - DEBUG - Loaded data shape: (1429226380, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:52:14,202 - DEBUG - Extracted CCC data shape: (1429226380, 1)
-2025-09-12 00:52:14,202 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_mucosa-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:56:04,136 - INFO - Successfully processed gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_mucosa-var_pc_log2-all_ccc_only.parquet
-2025-09-12 00:56:04,136 - INFO - Size reduction: 18.64 GB -> 6.26 GB (66.4% smaller)
-2025-09-12 00:56:05,503 - DEBUG - Processing file 30/54: gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl
-2025-09-12 00:56:05,504 - INFO - Processing file: gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl
-2025-09-12 00:56:40,714 - DEBUG - Loaded data shape: (1438705261, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 00:56:42,738 - DEBUG - Extracted CCC data shape: (1438705261, 1)
-2025-09-12 00:56:42,738 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_muscularis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:01:04,618 - INFO - Successfully processed gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_muscularis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:01:04,618 - INFO - Size reduction: 18.76 GB -> 6.34 GB (66.2% smaller)
-2025-09-12 01:01:05,944 - DEBUG - Processing file 31/54: gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl
-2025-09-12 01:01:05,944 - INFO - Processing file: gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl
-2025-09-12 01:01:30,707 - DEBUG - Loaded data shape: (869799486, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:01:31,984 - DEBUG - Extracted CCC data shape: (869799486, 1)
-2025-09-12 01:01:31,984 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_fallopian_tube-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:03:48,060 - INFO - Successfully processed gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl -> gtex_v8_data_fallopian_tube-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:03:48,061 - INFO - Size reduction: 11.34 GB -> 2.57 GB (77.4% smaller)
-2025-09-12 01:03:48,926 - DEBUG - Processing file 32/54: gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl
-2025-09-12 01:03:48,926 - INFO - Processing file: gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl
-2025-09-12 01:04:24,279 - DEBUG - Loaded data shape: (1416051153, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:04:26,222 - DEBUG - Extracted CCC data shape: (1416051153, 1)
-2025-09-12 01:04:26,222 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_heart_atrial_appendage-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:08:16,477 - INFO - Successfully processed gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl -> gtex_v8_data_heart_atrial_appendage-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:08:16,477 - INFO - Size reduction: 18.46 GB -> 6.11 GB (66.9% smaller)
-2025-09-12 01:08:17,711 - DEBUG - Processing file 33/54: gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl
-2025-09-12 01:08:17,711 - INFO - Processing file: gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl
-2025-09-12 01:08:56,782 - DEBUG - Loaded data shape: (1389303828, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:08:58,759 - DEBUG - Extracted CCC data shape: (1389303828, 1)
-2025-09-12 01:08:58,759 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_heart_left_ventricle-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:12:40,469 - INFO - Successfully processed gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl -> gtex_v8_data_heart_left_ventricle-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:12:40,469 - INFO - Size reduction: 18.12 GB -> 5.84 GB (67.8% smaller)
-2025-09-12 01:12:41,779 - DEBUG - Processing file 34/54: gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl
-2025-09-12 01:12:41,779 - INFO - Processing file: gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl
-2025-09-12 01:13:12,636 - DEBUG - Loaded data shape: (1231692528, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:13:14,333 - DEBUG - Extracted CCC data shape: (1231692528, 1)
-2025-09-12 01:13:14,333 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_kidney_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:16:29,445 - INFO - Successfully processed gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl -> gtex_v8_data_kidney_cortex-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:16:29,445 - INFO - Size reduction: 16.06 GB -> 4.38 GB (72.7% smaller)
-2025-09-12 01:16:30,622 - DEBUG - Processing file 35/54: gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl
-2025-09-12 01:16:30,622 - INFO - Processing file: gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl
-2025-09-12 01:16:48,096 - DEBUG - Loaded data shape: (692459505, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:16:49,091 - DEBUG - Extracted CCC data shape: (692459505, 1)
-2025-09-12 01:16:49,091 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_kidney_medulla-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:18:36,495 - INFO - Successfully processed gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl -> gtex_v8_data_kidney_medulla-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:18:36,495 - INFO - Size reduction: 9.03 GB -> 1.43 GB (84.1% smaller)
-2025-09-12 01:18:37,172 - DEBUG - Processing file 36/54: gtex_v8_data_liver-var_pc_log2-all.pkl
-2025-09-12 01:18:37,172 - INFO - Processing file: gtex_v8_data_liver-var_pc_log2-all.pkl
-2025-09-12 01:19:14,213 - DEBUG - Loaded data shape: (1313153128, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:19:16,077 - DEBUG - Extracted CCC data shape: (1313153128, 1)
-2025-09-12 01:19:16,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_liver-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:22:44,977 - INFO - Successfully processed gtex_v8_data_liver-var_pc_log2-all.pkl -> gtex_v8_data_liver-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:22:44,977 - INFO - Size reduction: 17.12 GB -> 5.16 GB (69.9% smaller)
-2025-09-12 01:22:46,127 - DEBUG - Processing file 37/54: gtex_v8_data_lung-var_pc_log2-all.pkl
-2025-09-12 01:22:46,127 - INFO - Processing file: gtex_v8_data_lung-var_pc_log2-all.pkl
-2025-09-12 01:23:21,938 - DEBUG - Loaded data shape: (1461917628, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:23:24,049 - DEBUG - Extracted CCC data shape: (1461917628, 1)
-2025-09-12 01:23:24,049 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_lung-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:27:20,003 - INFO - Successfully processed gtex_v8_data_lung-var_pc_log2-all.pkl -> gtex_v8_data_lung-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:27:20,003 - INFO - Size reduction: 19.06 GB -> 6.58 GB (65.5% smaller)
-2025-09-12 01:27:21,442 - DEBUG - Processing file 38/54: gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl
-2025-09-12 01:27:21,442 - INFO - Processing file: gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl
-2025-09-12 01:27:54,922 - DEBUG - Loaded data shape: (1331409003, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:27:56,838 - DEBUG - Extracted CCC data shape: (1331409003, 1)
-2025-09-12 01:27:56,838 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_minor_salivary_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:31:28,578 - INFO - Successfully processed gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl -> gtex_v8_data_minor_salivary_gland-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:31:28,578 - INFO - Size reduction: 17.36 GB -> 5.35 GB (69.2% smaller)
-2025-09-12 01:31:29,896 - DEBUG - Processing file 39/54: gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl
-2025-09-12 01:31:29,896 - INFO - Processing file: gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl
-2025-09-12 01:32:06,348 - DEBUG - Loaded data shape: (1460025703, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:32:08,350 - DEBUG - Extracted CCC data shape: (1460025703, 1)
-2025-09-12 01:32:08,350 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_muscle_skeletal-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:36:04,226 - INFO - Successfully processed gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl -> gtex_v8_data_muscle_skeletal-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:36:04,226 - INFO - Size reduction: 19.04 GB -> 6.54 GB (65.7% smaller)
-2025-09-12 01:36:05,600 - DEBUG - Processing file 40/54: gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl
-2025-09-12 01:36:05,601 - INFO - Processing file: gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl
-2025-09-12 01:36:42,425 - DEBUG - Loaded data shape: (1472643585, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:36:44,497 - DEBUG - Extracted CCC data shape: (1472643585, 1)
-2025-09-12 01:36:44,497 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_nerve_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:40:42,722 - INFO - Successfully processed gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl -> gtex_v8_data_nerve_tibial-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:40:42,722 - INFO - Size reduction: 19.20 GB -> 6.69 GB (65.2% smaller)
-2025-09-12 01:40:44,106 - DEBUG - Processing file 41/54: gtex_v8_data_ovary-var_pc_log2-all.pkl
-2025-09-12 01:40:44,106 - INFO - Processing file: gtex_v8_data_ovary-var_pc_log2-all.pkl
-2025-09-12 01:41:18,148 - DEBUG - Loaded data shape: (1353222276, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:41:20,048 - DEBUG - Extracted CCC data shape: (1353222276, 1)
-2025-09-12 01:41:20,049 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_ovary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:45:04,730 - INFO - Successfully processed gtex_v8_data_ovary-var_pc_log2-all.pkl -> gtex_v8_data_ovary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:45:04,730 - INFO - Size reduction: 17.65 GB -> 5.56 GB (68.5% smaller)
-2025-09-12 01:45:06,016 - DEBUG - Processing file 42/54: gtex_v8_data_pancreas-var_pc_log2-all.pkl
-2025-09-12 01:45:06,016 - INFO - Processing file: gtex_v8_data_pancreas-var_pc_log2-all.pkl
-2025-09-12 01:46:30,520 - DEBUG - Loaded data shape: (1369711630, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:46:32,480 - DEBUG - Extracted CCC data shape: (1369711630, 1)
-2025-09-12 01:46:32,480 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_pancreas-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:50:13,619 - INFO - Successfully processed gtex_v8_data_pancreas-var_pc_log2-all.pkl -> gtex_v8_data_pancreas-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:50:13,619 - INFO - Size reduction: 17.86 GB -> 5.68 GB (68.2% smaller)
-2025-09-12 01:50:14,880 - DEBUG - Processing file 43/54: gtex_v8_data_pituitary-var_pc_log2-all.pkl
-2025-09-12 01:50:14,880 - INFO - Processing file: gtex_v8_data_pituitary-var_pc_log2-all.pkl
-2025-09-12 01:50:55,006 - DEBUG - Loaded data shape: (1418660011, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:50:57,042 - DEBUG - Extracted CCC data shape: (1418660011, 1)
-2025-09-12 01:50:57,042 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_pituitary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:54:43,653 - INFO - Successfully processed gtex_v8_data_pituitary-var_pc_log2-all.pkl -> gtex_v8_data_pituitary-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:54:43,653 - INFO - Size reduction: 18.50 GB -> 6.17 GB (66.6% smaller)
-2025-09-12 01:54:44,998 - DEBUG - Processing file 44/54: gtex_v8_data_prostate-var_pc_log2-all.pkl
-2025-09-12 01:54:44,998 - INFO - Processing file: gtex_v8_data_prostate-var_pc_log2-all.pkl
-2025-09-12 01:55:19,883 - DEBUG - Loaded data shape: (1395161076, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:55:21,840 - DEBUG - Extracted CCC data shape: (1395161076, 1)
-2025-09-12 01:55:21,840 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_prostate-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:59:09,359 - INFO - Successfully processed gtex_v8_data_prostate-var_pc_log2-all.pkl -> gtex_v8_data_prostate-var_pc_log2-all_ccc_only.parquet
-2025-09-12 01:59:09,359 - INFO - Size reduction: 18.19 GB -> 5.96 GB (67.2% smaller)
-2025-09-12 01:59:10,703 - DEBUG - Processing file 45/54: gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl
-2025-09-12 01:59:10,703 - INFO - Processing file: gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl
-2025-09-12 01:59:51,917 - DEBUG - Loaded data shape: (1458621066, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 01:59:53,937 - DEBUG - Extracted CCC data shape: (1458621066, 1)
-2025-09-12 01:59:53,937 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:03:48,454 - INFO - Successfully processed gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl -> gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:03:48,454 - INFO - Size reduction: 19.02 GB -> 6.55 GB (65.6% smaller)
-2025-09-12 02:03:49,884 - DEBUG - Processing file 46/54: gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl
-2025-09-12 02:03:49,884 - INFO - Processing file: gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl
-2025-09-12 02:05:18,157 - DEBUG - Loaded data shape: (1473566328, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:05:20,326 - DEBUG - Extracted CCC data shape: (1473566328, 1)
-2025-09-12 02:05:20,327 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:09:19,847 - INFO - Successfully processed gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl -> gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:09:19,847 - INFO - Size reduction: 19.21 GB -> 6.69 GB (65.2% smaller)
-2025-09-12 02:09:21,270 - DEBUG - Processing file 47/54: gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl
-2025-09-12 02:09:21,270 - INFO - Processing file: gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl
-2025-09-12 02:09:59,324 - DEBUG - Loaded data shape: (1353014190, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:10:01,233 - DEBUG - Extracted CCC data shape: (1353014190, 1)
-2025-09-12 02:10:01,233 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:13:38,031 - INFO - Successfully processed gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl -> gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:13:38,032 - INFO - Size reduction: 17.64 GB -> 5.54 GB (68.6% smaller)
-2025-09-12 02:13:39,310 - DEBUG - Processing file 48/54: gtex_v8_data_spleen-var_pc_log2-all.pkl
-2025-09-12 02:13:39,310 - INFO - Processing file: gtex_v8_data_spleen-var_pc_log2-all.pkl
-2025-09-12 02:14:13,490 - DEBUG - Loaded data shape: (1367095905, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:14:15,383 - DEBUG - Extracted CCC data shape: (1367095905, 1)
-2025-09-12 02:14:15,383 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_spleen-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:17:52,891 - INFO - Successfully processed gtex_v8_data_spleen-var_pc_log2-all.pkl -> gtex_v8_data_spleen-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:17:52,891 - INFO - Size reduction: 17.83 GB -> 5.68 GB (68.1% smaller)
-2025-09-12 02:17:54,199 - DEBUG - Processing file 49/54: gtex_v8_data_stomach-var_pc_log2-all.pkl
-2025-09-12 02:17:54,199 - INFO - Processing file: gtex_v8_data_stomach-var_pc_log2-all.pkl
-2025-09-12 02:18:29,137 - DEBUG - Loaded data shape: (1402248403, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:18:31,077 - DEBUG - Extracted CCC data shape: (1402248403, 1)
-2025-09-12 02:18:31,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_stomach-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:23:07,195 - INFO - Successfully processed gtex_v8_data_stomach-var_pc_log2-all.pkl -> gtex_v8_data_stomach-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:23:07,195 - INFO - Size reduction: 18.28 GB -> 5.99 GB (67.2% smaller)
-2025-09-12 02:23:08,518 - DEBUG - Processing file 50/54: gtex_v8_data_testis-var_pc_log2-all.pkl
-2025-09-12 02:23:08,519 - INFO - Processing file: gtex_v8_data_testis-var_pc_log2-all.pkl
-2025-09-12 02:23:47,944 - DEBUG - Loaded data shape: (1502917725, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:23:50,000 - DEBUG - Extracted CCC data shape: (1502917725, 1)
-2025-09-12 02:23:50,000 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_testis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:27:48,966 - INFO - Successfully processed gtex_v8_data_testis-var_pc_log2-all.pkl -> gtex_v8_data_testis-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:27:48,966 - INFO - Size reduction: 19.60 GB -> 7.02 GB (64.2% smaller)
-2025-09-12 02:27:50,385 - DEBUG - Processing file 51/54: gtex_v8_data_thyroid-var_pc_log2-all.pkl
-2025-09-12 02:27:50,385 - INFO - Processing file: gtex_v8_data_thyroid-var_pc_log2-all.pkl
-2025-09-12 02:28:27,204 - DEBUG - Loaded data shape: (1472317980, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:28:29,203 - DEBUG - Extracted CCC data shape: (1472317980, 1)
-2025-09-12 02:28:29,204 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_thyroid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:32:22,715 - INFO - Successfully processed gtex_v8_data_thyroid-var_pc_log2-all.pkl -> gtex_v8_data_thyroid-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:32:22,715 - INFO - Size reduction: 19.20 GB -> 6.68 GB (65.2% smaller)
-2025-09-12 02:32:24,123 - DEBUG - Processing file 52/54: gtex_v8_data_uterus-var_pc_log2-all.pkl
-2025-09-12 02:32:24,123 - INFO - Processing file: gtex_v8_data_uterus-var_pc_log2-all.pkl
-2025-09-12 02:32:56,654 - DEBUG - Loaded data shape: (1308289128, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:32:58,515 - DEBUG - Extracted CCC data shape: (1308289128, 1)
-2025-09-12 02:32:58,515 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_uterus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:36:26,336 - INFO - Successfully processed gtex_v8_data_uterus-var_pc_log2-all.pkl -> gtex_v8_data_uterus-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:36:26,338 - INFO - Size reduction: 17.06 GB -> 5.16 GB (69.8% smaller)
-2025-09-12 02:36:27,576 - DEBUG - Processing file 53/54: gtex_v8_data_vagina-var_pc_log2-all.pkl
-2025-09-12 02:36:27,576 - INFO - Processing file: gtex_v8_data_vagina-var_pc_log2-all.pkl
-2025-09-12 02:37:09,453 - DEBUG - Loaded data shape: (1328623926, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:37:11,361 - DEBUG - Extracted CCC data shape: (1328623926, 1)
-2025-09-12 02:37:11,361 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_vagina-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:40:41,189 - INFO - Successfully processed gtex_v8_data_vagina-var_pc_log2-all.pkl -> gtex_v8_data_vagina-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:40:41,189 - INFO - Size reduction: 17.32 GB -> 5.33 GB (69.2% smaller)
-2025-09-12 02:40:42,400 - DEBUG - Processing file 54/54: gtex_v8_data_whole_blood-var_pc_log2-all.pkl
-2025-09-12 02:40:42,400 - INFO - Processing file: gtex_v8_data_whole_blood-var_pc_log2-all.pkl
-2025-09-12 02:41:17,012 - DEBUG - Loaded data shape: (1420258456, 3), columns: ['ccc', 'pearson', 'spearman']
-2025-09-12 02:41:19,005 - DEBUG - Extracted CCC data shape: (1420258456, 1)
-2025-09-12 02:41:19,005 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:46:00,216 - INFO - Successfully processed gtex_v8_data_whole_blood-var_pc_log2-all.pkl -> gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet
-2025-09-12 02:46:00,216 - INFO - Size reduction: 18.52 GB -> 6.12 GB (66.9% smaller)
-2025-09-12 02:46:01,553 - INFO - File processing completed in 3:55:57.451227
-2025-09-12 02:46:01,553 - INFO - Processing complete! Successfully processed: 54/54 files
-2025-09-12 02:46:01,553 - INFO - Total execution time: 3:55:57.451335
-2025-09-12 02:46:01,553 - INFO - Output directory: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet
-2025-09-12 02:46:01,553 - INFO - Log file: /home/haoyu/_database/projs/ccc-gpu/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log
diff --git a/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py b/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py
new file mode 100755
index 00000000..bb729d39
--- /dev/null
+++ b/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py
@@ -0,0 +1,459 @@
+#!/usr/bin/env python3
+"""
+Convert GTEx CCC correlation data from pickle format to DuckDB format for efficient storage and fast queries.
+
+This script processes all .pkl files containing gene correlation data and creates optimized
+DuckDB databases that provide:
+- Fast random access to gene pairs (sub-millisecond queries)
+- Significantly reduced storage size
+- SQL query capabilities
+- Minimal memory usage for queries
+"""
+
+import argparse
+import logging
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import gc
+
+import pandas as pd
+import duckdb
+import numpy as np
+from tqdm import tqdm
+
+
+def setup_logging(debug: bool = False) -> str:
+    """Configure root logging to a timestamped file plus stdout.
+
+    The log file is created in a ``logs`` directory next to this script and
+    its path is returned as a string. ``debug`` selects DEBUG over INFO.
+    """
+    log_dir = Path(__file__).parent / "logs"
+    log_dir.mkdir(exist_ok=True)
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_filename = log_dir / f"process_ccc_to_duckdb_{stamp}.log"
+
+    # One handler per sink: the log file and the console.
+    sinks = [logging.FileHandler(log_filename), logging.StreamHandler(sys.stdout)]
+    logging.basicConfig(
+        level=logging.DEBUG if debug else logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        handlers=sinks,
+    )
+
+    log = logging.getLogger(__name__)
+    log.info("Starting CCC to DuckDB conversion")
+    log.info(f"Log file: {log_filename}")
+    return str(log_filename)
+
+
+def convert_pickle_to_duckdb(
+    pkl_file: Path,
+    output_dir: Path,
+    single_db: bool = False,
+    db_con: Optional[duckdb.DuckDBPyConnection] = None,
+    chunk_size: int = 10_000_000
+) -> Dict:
+    """
+    Convert a single pickle file to DuckDB format.
+
+    Only the ``ccc`` column is kept; the pickled frame's index levels become
+    ``gene1``/``gene2``. Failures are logged and returned under ``'error'``.
+
+    Args:
+        pkl_file: Path to input pickle file (unpickled, so it must be trusted)
+        output_dir: Directory for output database files
+        single_db: If True, append to single database (db_con must be provided)
+        db_con: Existing DuckDB connection (if single_db is True); left open
+        chunk_size: Currently unused; all rows are inserted in one statement
+
+    Returns:
+        Dictionary with conversion statistics
+    """
+    logger = logging.getLogger(__name__)
+    stats = {}
+    start_time = datetime.now()
+    own_con = None  # connection opened here (per-tissue mode only)
+
+    # Get tissue name from filename
+    tissue_name = pkl_file.stem.replace('gtex_v8_data_', '').replace('-var_pc_log2-all', '')
+    stats['tissue'] = tissue_name
+    stats['input_file'] = pkl_file.name
+
+    logger.info(f"Processing: {pkl_file.name}")
+    try:
+        # Fail before the expensive load rather than on the first execute()
+        if single_db and db_con is None:
+            raise ValueError("db_con must be provided when single_db is True")
+
+        # Load pickle file
+        logger.info("Loading pickle file...")
+        load_start = datetime.now()
+        df = pd.read_pickle(pkl_file)
+        load_time = (datetime.now() - load_start).total_seconds()
+        stats['input_rows'] = len(df)
+        stats['input_size_gb'] = pkl_file.stat().st_size / (1024**3)
+        logger.info(f"Loaded {len(df):,} rows in {load_time:.1f}s ({stats['input_size_gb']:.2f} GB)")
+
+        # Extract only CCC column and reset index
+        logger.info("Preparing data...")
+        df_ccc = df[['ccc']].reset_index()
+        df_ccc.columns = ['gene1', 'gene2', 'ccc']
+        # Convert to appropriate types
+        df_ccc['ccc'] = df_ccc['ccc'].astype('float32')
+        df_ccc['gene1'] = df_ccc['gene1'].astype(str)
+        df_ccc['gene2'] = df_ccc['gene2'].astype(str)
+
+        # Clean up original dataframe to free memory
+        del df
+        gc.collect()
+
+        # Create or connect to database
+        if single_db:
+            con = db_con
+            table_name = f"ccc_{tissue_name}"
+        else:
+            db_file = output_dir / f"{tissue_name}_ccc.duckdb"
+            own_con = con = duckdb.connect(str(db_file))
+            table_name = "ccc_data"
+
+        # Names derive from the file name, so quote them as SQL identifiers
+        def quote_ident(name: str) -> str:
+            return '"' + name.replace('"', '""') + '"'
+        table_sql = quote_ident(table_name)
+
+        # Create table; only the shared database may already contain it
+        logger.info(f"Creating table {table_name}...")
+        if_not_exists = "IF NOT EXISTS " if single_db else ""
+        con.execute(f"""
+            CREATE TABLE {if_not_exists}{table_sql} (
+                gene1 VARCHAR NOT NULL,
+                gene2 VARCHAR NOT NULL,
+                ccc REAL NOT NULL,
+                PRIMARY KEY (gene1, gene2)
+            )
+        """)
+
+        # Insert data efficiently
+        logger.info(f"Inserting {len(df_ccc):,} rows...")
+        insert_start = datetime.now()
+        # Use DuckDB's register for bulk insert
+        con.register('df_temp', df_ccc)
+        try:
+            con.execute(f"INSERT INTO {table_sql} SELECT * FROM df_temp")
+        finally:
+            # Always drop the view: on a shared connection it would otherwise
+            # keep this DataFrame registered while the next file is loaded
+            con.unregister('df_temp')
+
+        insert_time = (datetime.now() - insert_start).total_seconds()
+        stats['insert_time'] = insert_time
+        logger.info(f"Data inserted in {insert_time:.1f}s")
+
+        # Clean up dataframe
+        del df_ccc
+        gc.collect()
+
+        # Create indexes for faster lookups
+        logger.info("Creating indexes...")
+        index_start = datetime.now()
+        # gene2 index for reverse lookups, ccc index for range queries
+        for column in ("gene2", "ccc"):
+            index_sql = quote_ident(f"idx_{table_name}_{column}")
+            con.execute(f"CREATE INDEX {index_sql} ON {table_sql}({column})")
+        # Analyze table for query optimization
+        con.execute(f"ANALYZE {table_sql}")
+
+        index_time = (datetime.now() - index_start).total_seconds()
+        stats['index_time'] = index_time
+
+        # Get final statistics
+        result = con.execute(f"SELECT COUNT(*) FROM {table_sql}").fetchone()
+        stats['output_rows'] = result[0]
+
+        if own_con is not None:
+            # Close connection and get file size
+            own_con.close()
+            own_con = None
+            stats['output_size_gb'] = db_file.stat().st_size / (1024**3)
+
+        stats['total_time'] = (datetime.now() - start_time).total_seconds()
+
+        if 'output_size_gb' in stats:
+            stats['compression_ratio'] = stats['input_size_gb'] / stats['output_size_gb']
+            logger.info(f"Completed: {stats['input_size_gb']:.2f} GB -> {stats['output_size_gb']:.2f} GB "
+                        f"(compression: {stats['compression_ratio']:.1f}x)")
+        logger.info(f"Total time: {stats['total_time']:.1f}s")
+
+    except Exception as e:
+        logger.error(f"Error processing {pkl_file.name}: {e}")
+        stats['error'] = str(e)
+    finally:
+        if own_con is not None:
+            own_con.close()
+
+    return stats
+
+
+def create_consolidated_database(
+    pkl_files: List[Path],
+    output_dir: Path
+) -> Dict:
+    """
+    Create a single consolidated DuckDB database with all tissues.
+
+    Each pickle file becomes a ``ccc_<tissue>`` table, summarised by one row in
+    ``tissues``; the ``all_correlations`` view unions all converted tissues.
+    A failed file is only counted; any other error is logged and re-raised.
+
+    Args:
+        pkl_files: List of pickle files to process
+        output_dir: Directory for output database
+
+    Returns:
+        Dictionary with overall statistics
+    """
+    logger = logging.getLogger(__name__)
+
+    db_file = output_dir / "all_tissues_ccc.duckdb"
+    logger.info(f"Creating consolidated database: {db_file}")
+
+    con = duckdb.connect(str(db_file))
+    all_stats = []
+
+    try:
+        # Create master table for tissue metadata
+        con.execute("""
+            CREATE TABLE tissues (
+                tissue_id INTEGER PRIMARY KEY,
+                tissue_name VARCHAR UNIQUE NOT NULL,
+                num_pairs BIGINT,
+                min_ccc REAL,
+                max_ccc REAL,
+                mean_ccc REAL
+            )
+        """)
+
+        tissue_id = 1
+        for pkl_file in tqdm(pkl_files, desc="Processing tissues"):
+            stats = convert_pickle_to_duckdb(
+                pkl_file=pkl_file,
+                output_dir=output_dir,
+                single_db=True,
+                db_con=con
+            )
+            all_stats.append(stats)
+            if 'error' in stats:
+                continue
+
+            # Add tissue metadata
+            tissue_name = stats['tissue']
+            table_sql = '"' + f"ccc_{tissue_name}".replace('"', '""') + '"'
+            tissue_stats = con.execute(f"""
+                SELECT
+                    COUNT(*) as num_pairs,
+                    MIN(ccc) as min_ccc,
+                    MAX(ccc) as max_ccc,
+                    AVG(ccc) as mean_ccc
+                FROM {table_sql}
+            """).fetchone()
+
+            con.execute("""
+                INSERT INTO tissues (tissue_id, tissue_name, num_pairs, min_ccc, max_ccc, mean_ccc)
+                VALUES (?, ?, ?, ?, ?, ?)
+            """, [tissue_id, tissue_name, *tissue_stats])
+
+            tissue_id += 1
+
+        # Create a view for easy cross-tissue queries
+        logger.info("Creating cross-tissue query views...")
+        # One SELECT per converted tissue, in insertion order. The name is
+        # escaped both as a string literal and as part of the identifier.
+        union_parts = []
+        for (tissue_name,) in con.execute(
+            "SELECT tissue_name FROM tissues ORDER BY tissue_id"
+        ).fetchall():
+            literal = tissue_name.replace("'", "''")
+            table_sql = '"' + f"ccc_{tissue_name}".replace('"', '""') + '"'
+            union_parts.append(f"""
+                SELECT '{literal}' as tissue, gene1, gene2, ccc
+                FROM {table_sql}
+            """)
+
+        if union_parts:
+            union_query = " UNION ALL ".join(union_parts)
+            con.execute(f"""
+                CREATE VIEW all_correlations AS
+                {union_query}
+            """)
+
+            logger.info("Created all_correlations view for cross-tissue queries")
+
+        # CHECKPOINT is DuckDB's statement for syncing the write-ahead log to
+        # the database file; "PRAGMA optimize" is SQLite syntax, not DuckDB's.
+        logger.info("Optimizing database...")
+        con.execute("CHECKPOINT")
+
+    except Exception as e:
+        logger.error(f"Error creating consolidated database: {e}")
+        raise
+    finally:
+        # Closed on success and failure alike, before the file is measured
+        con.close()
+
+    db_size = db_file.stat().st_size / (1024**3)
+    logger.info(f"Consolidated database size: {db_size:.2f} GB")
+
+    return {
+        'database': str(db_file),
+        'tissues_processed': len([s for s in all_stats if 'error' not in s]),
+        'tissues_failed': len([s for s in all_stats if 'error' in s]),
+        'total_size_gb': db_size,
+        'stats': all_stats
+    }
+
+
+def main():
+    """Parse the command line, select the pickle files and convert them.
+
+    Exits with status 1 when the source directory or the input files are missing.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert GTEx CCC data from pickle to DuckDB format"
+    )
+    parser.add_argument(
+        "--source-dir",
+        type=str,
+        default="/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all",
+        help="Source directory containing .pkl files"
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="/mnt/data/proj_data/ccc-gpu/manuscript_data/supplementary_data/ccc_duckdb",
+        help="Output directory for DuckDB files"
+    )
+    parser.add_argument(
+        "--single-db",
+        action="store_true",
+        help="Create a single consolidated database instead of one per tissue"
+    )
+    parser.add_argument(
+        "--tissues",
+        nargs="+",
+        help="Specific tissues to process (default: all)"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be processed without doing it"
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable debug logging"
+    )
+
+    args = parser.parse_args()
+
+    # Setup logging
+    log_file = setup_logging(debug=args.debug)
+    logger = logging.getLogger(__name__)
+
+    # Convert paths
+    source_dir = Path(args.source_dir)
+    output_dir = Path(args.output_dir)
+
+    logger.info("Configuration:")
+    logger.info(f" Source: {source_dir}")
+    logger.info(f" Output: {output_dir}")
+    logger.info(f" Single DB: {args.single_db}")
+
+    # Check source directory
+    if not source_dir.exists():
+        logger.error(f"Source directory not found: {source_dir}")
+        sys.exit(1)
+
+    # Get list of pickle files
+    pkl_files = sorted(source_dir.glob("*.pkl"))
+
+    # Filter by specific tissues if requested. Each file is kept at most once,
+    # even when several requested names match it (e.g. "skin" and "skin_sun").
+    if args.tissues:
+        pkl_files = [f for f in pkl_files if any(t in f.name for t in args.tissues)]
+
+    if not pkl_files:
+        logger.error("No pickle files found to process")
+        sys.exit(1)
+
+    logger.info(f"Found {len(pkl_files)} files to process")
+
+    if args.dry_run:
+        print("\nFiles that would be processed:")
+        for f in pkl_files:
+            size_gb = f.stat().st_size / (1024**3)
+            print(f" {f.name} ({size_gb:.2f} GB)")
+        print(f"\nOutput would be written to: {output_dir}")
+        return
+
+    # Create output directory
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Process files
+    start_time = datetime.now()
+
+    if args.single_db:
+        # Create single consolidated database
+        results = create_consolidated_database(pkl_files, output_dir)
+
+        print(f"\n{'='*60}")
+        print("PROCESSING COMPLETE")
+        print(f"{'='*60}")
+        print(f"Database: {results['database']}")
+        print(f"Tissues processed: {results['tissues_processed']}")
+        print(f"Tissues failed: {results['tissues_failed']}")
+        print(f"Total size: {results['total_size_gb']:.2f} GB")
+        failed = [s for s in results['stats'] if 'error' in s]
+    else:
+        # Create individual databases
+        all_stats = []
+        for pkl_file in tqdm(pkl_files, desc="Processing files"):
+            stats = convert_pickle_to_duckdb(
+                pkl_file=pkl_file,
+                output_dir=output_dir,
+                single_db=False
+            )
+            all_stats.append(stats)
+
+        # Summary
+        successful = [s for s in all_stats if 'error' not in s]
+        failed = [s for s in all_stats if 'error' in s]
+
+        print(f"\n{'='*60}")
+        print("PROCESSING COMPLETE")
+        print(f"{'='*60}")
+        print(f"Files processed: {len(successful)}/{len(pkl_files)}")
+
+        if successful:
+            total_input = sum(s['input_size_gb'] for s in successful)
+            total_output = sum(s.get('output_size_gb', 0) for s in successful)
+            avg_compression = total_input / total_output if total_output > 0 else 0
+            print(f"Total input size: {total_input:.2f} GB")
+            print(f"Total output size: {total_output:.2f} GB")
+            print(f"Average compression: {avg_compression:.1f}x")
+
+    # Failed files are reported in both modes
+    if failed:
+        print(f"\nFailed files ({len(failed)}):")
+        for s in failed:
+            print(f" {s['input_file']}: {s['error']}")
+
+    total_time = (datetime.now() - start_time).total_seconds()
+    print(f"\nTotal processing time: {total_time/60:.1f} minutes")
+    print(f"Log file: {log_file}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb b/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb
new file mode 100644
index 00000000..68f2b129
--- /dev/null
+++ b/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb
@@ -0,0 +1,4056 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Description\n",
+ "This notebook demonstrates:\n",
+ "\n",
+ "1. how to compute coefficient values\n",
+ "2. how to correlate gene expression data with categorical metadata\n",
+ "\n",
+ "using CCC-GPU with public data from GTEx v8."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Please follow the instructions in the [README](../../README.md), section \"Quick Install with pip\" to install CCC-GPU with a conda environment `ccc-gpu-env`.\n",
+ "\n",
+ "Then activate the environment and start the jupyter notebook server in order to run this notebook.\n",
+ "\n",
+ "```bash\n",
+ "conda activate ccc-gpu-env\n",
+ "pip install notebook\n",
+ "jupyter notebook\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import re\n",
+ "import pandas as pd\n",
+ "import urllib.request\n",
+ "from tqdm import tqdm\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from ccc.utils import simplify_string\n",
+ "from ccc import conf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set this path to the directory where you want to save the intermediate data and results\n",
+ "ANALYSIS_DIR = Path(\"/mnt/data/proj_data/ccc-gpu/data/tutorial\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data Fetching and Preprocessing\n",
+ "This section downloads:\n",
+ "1. the public GTEx v8 gene TPMs data (https://www.gtexportal.org/home/downloads/adult-gtex/bulk_tissue_expression)\n",
+ "2. the GTEx sample attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)\n",
+ "3. the GTEx subject attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)\n",
+ "\n",
+ "and performs preprocessing to prepare the data for the analysis."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Download GTEx v8 gene expression data and split by tissue"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "gtex_all_sample_ids_with_expr_data already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz\n",
+ "gtex_sample_attrs already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt\n",
+ "Downloading gtex_subject_attrs to /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt\n",
+ "Download completed!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create analysis directory if it doesn't exist\n",
+ "os.makedirs(ANALYSIS_DIR, exist_ok=True)\n",
+ "\n",
+ "# Define files to download\n",
+ "files_to_download = {\n",
+ " \"gtex_all_sample_ids_with_expr_data\": \"https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz\",\n",
+ " \"gtex_sample_attrs\": \"https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt\",\n",
+ " \"gtex_subject_attrs\": \"https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt\"\n",
+ "}\n",
+ "\n",
+ "# Dictionary to store file paths\n",
+ "file_paths = {}\n",
+ "\n",
+ "# Download files\n",
+ "for var_name, url in files_to_download.items():\n",
+ " filename = Path(url).name\n",
+ " file_path = Path(ANALYSIS_DIR) / filename\n",
+ " file_paths[var_name] = file_path\n",
+ " \n",
+ " if not file_path.exists():\n",
+ " print(f\"Downloading {var_name} to {file_path}\")\n",
+ " urllib.request.urlretrieve(url, file_path)\n",
+ " print(\"Download completed!\")\n",
+ " else:\n",
+ " print(f\"{var_name} already exists at {file_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GTEx sample attributes shape: (22951, 63)\n",
+ "GTEx sample attributes columns: Index(['SAMPID', 'SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'SMRIN', 'SMTS', 'SMTSD',\n",
+ " 'SMUBRID', 'SMTSISCH', 'SMTSPAX', 'SMNABTCH', 'SMNABTCHT', 'SMNABTCHD',\n",
+ " 'SMGEBTCH', 'SMGEBTCHD', 'SMGEBTCHT', 'SMAFRZE', 'SMGTC', 'SME2MPRT',\n",
+ " 'SMCHMPRS', 'SMNTRART', 'SMNUMGPS', 'SMMAPRT', 'SMEXNCRT', 'SM550NRM',\n",
+ " 'SMGNSDTC', 'SMUNMPRT', 'SM350NRM', 'SMRDLGTH', 'SMMNCPB', 'SME1MMRT',\n",
+ " 'SMSFLGTH', 'SMESTLBS', 'SMMPPD', 'SMNTERRT', 'SMRRNANM', 'SMRDTTL',\n",
+ " 'SMVQCFL', 'SMMNCV', 'SMTRSCPT', 'SMMPPDPR', 'SMCGLGTH', 'SMGAPPCT',\n",
+ " 'SMUNPDRD', 'SMNTRNRT', 'SMMPUNRT', 'SMEXPEFF', 'SMMPPDUN', 'SME2MMRT',\n",
+ " 'SME2ANTI', 'SMALTALG', 'SME2SNSE', 'SMMFLGTH', 'SME1ANTI', 'SMSPLTRD',\n",
+ " 'SMBSMMRT', 'SME1SNSE', 'SME1PCTS', 'SMRRNART', 'SME1MPRT', 'SMNUM5CD',\n",
+ " 'SMDPMPRT', 'SME2PCTS'],\n",
+ " dtype='object')\n"
+ ]
+ }
+ ],
+ "source": [
+ "gtex_sample_attrs = pd.read_csv(file_paths[\"gtex_sample_attrs\"], sep=\"\\t\")\n",
+ "print(f\"GTEx sample attributes shape: {gtex_sample_attrs.shape}\")\n",
+ "print(f\"GTEx sample attributes columns: {gtex_sample_attrs.columns}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "55\n",
+ "['Whole Blood' 'Brain - Frontal Cortex (BA9)' 'Adipose - Subcutaneous'\n",
+ " 'Muscle - Skeletal' 'Artery - Tibial' 'Artery - Coronary'\n",
+ " 'Heart - Atrial Appendage' 'Adipose - Visceral (Omentum)' 'Ovary'\n",
+ " 'Uterus' 'Vagina' 'Breast - Mammary Tissue'\n",
+ " 'Skin - Not Sun Exposed (Suprapubic)' 'Minor Salivary Gland'\n",
+ " 'Brain - Cortex' 'Adrenal Gland' 'Thyroid' 'Lung' 'Spleen' 'Pancreas'\n",
+ " 'Esophagus - Muscularis' 'Esophagus - Mucosa'\n",
+ " 'Esophagus - Gastroesophageal Junction' 'Stomach' 'Colon - Sigmoid'\n",
+ " 'Small Intestine - Terminal Ileum' 'Colon - Transverse' 'Prostate'\n",
+ " 'Testis' 'Skin - Sun Exposed (Lower leg)' 'Nerve - Tibial'\n",
+ " 'Heart - Left Ventricle' 'Pituitary' 'Brain - Cerebellum'\n",
+ " 'Cells - Cultured fibroblasts' 'Artery - Aorta'\n",
+ " 'Cells - EBV-transformed lymphocytes' 'Brain - Cerebellar Hemisphere'\n",
+ " 'Brain - Caudate (basal ganglia)'\n",
+ " 'Brain - Nucleus accumbens (basal ganglia)'\n",
+ " 'Brain - Putamen (basal ganglia)' 'Brain - Hypothalamus'\n",
+ " 'Brain - Spinal cord (cervical c-1)' 'Liver' 'Brain - Hippocampus'\n",
+ " 'Brain - Anterior cingulate cortex (BA24)' 'Brain - Substantia nigra'\n",
+ " 'Kidney - Cortex' 'Brain - Amygdala' 'Cervix - Ectocervix'\n",
+ " 'Fallopian Tube' 'Cervix - Endocervix' 'Bladder' 'Kidney - Medulla'\n",
+ " 'Cells - Leukemia cell line (CML)']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get tissue names\n",
+ "gtex_tissues = gtex_sample_attrs[\"SMTSD\"].unique()\n",
+ "print(len(gtex_tissues))\n",
+ "print(gtex_tissues)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Get sample IDs for each tissue"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of samples with expression data: 17382\n",
+ "Sample IDs with expression data: ['GTEX-1HFI7-2426-SM-B2LXV', 'GTEX-11TTK-0226-SM-5N9EC', 'GTEX-11UD2-1226-SM-5EQMI', 'GTEX-X4EO-0006-SM-3P5ZF', 'GTEX-13O21-0326-SM-5J1N9', 'GTEX-XBED-1526-SM-4AT5W', 'GTEX-13NZ8-0011-R8b-SM-5KM48', 'GTEX-1H3O1-0005-SM-ACKV8', 'GTEX-13JVG-0011-R5a-SM-5MR4O', 'GTEX-1F88F-1126-SM-7MKHL']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# first, get all sample IDs with expression data\n",
+ "gtex_all_sample_ids_with_expr_data = set(\n",
+ " pd.read_csv(\n",
+ " file_paths[\"gtex_all_sample_ids_with_expr_data\"],\n",
+ " sep=\"\\t\",\n",
+ " skiprows=2,\n",
+ " nrows=1,\n",
+ " usecols=lambda x: x not in (\"Name\", \"Description\"),\n",
+ " ).columns\n",
+ ")\n",
+ "\n",
+ "print(f\"Number of samples with expression data: {len(gtex_all_sample_ids_with_expr_data)}\")\n",
+ "print(f\"Sample IDs with expression data: {list(gtex_all_sample_ids_with_expr_data)[:10]}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get sample IDs by tissue\n",
+ "sample_ids_by_tissue = {\n",
+ " tissue_name: sorted(\n",
+ " list(\n",
+ " gtex_all_sample_ids_with_expr_data.intersection(\n",
+ " set(\n",
+ " gtex_sample_attrs[gtex_sample_attrs[\"SMTSD\"] == tissue_name][\n",
+ " \"SAMPID\"\n",
+ " ].tolist()\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ " for tissue_name in gtex_tissues\n",
+ "}\n",
+ "\n",
+ "assert len(gtex_tissues) == len(sample_ids_by_tissue)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['GTEX-111YS-0006-SM-5NQBE',\n",
+ " 'GTEX-1122O-0005-SM-5O99J',\n",
+ " 'GTEX-1128S-0005-SM-5P9HI',\n",
+ " 'GTEX-113IC-0006-SM-5NQ9C',\n",
+ " 'GTEX-113JC-0006-SM-5O997',\n",
+ " 'GTEX-117XS-0005-SM-5PNU6',\n",
+ " 'GTEX-117YW-0005-SM-5NQ8Z',\n",
+ " 'GTEX-1192W-0005-SM-5NQBQ',\n",
+ " 'GTEX-1192X-0005-SM-5NQC3',\n",
+ " 'GTEX-11DXW-0006-SM-5NQ7Y']"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_ids_by_tissue[\"Whole Blood\"][:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Ensure all IDs are unique\n",
+ "assert all(\n",
+ " [\n",
+ " len(sample_ids_by_tissue[tissue_name])\n",
+ " == len(set(sample_ids_by_tissue[tissue_name]))\n",
+ " for tissue_name in sample_ids_by_tissue.keys()\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Show sample size by tissue"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tissue | \n",
+ " sample_size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 3 | \n",
+ " Muscle - Skeletal | \n",
+ " 803 | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Whole Blood | \n",
+ " 755 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " Skin - Sun Exposed (Lower leg) | \n",
+ " 701 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Artery - Tibial | \n",
+ " 663 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Adipose - Subcutaneous | \n",
+ " 663 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " Thyroid | \n",
+ " 653 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " Nerve - Tibial | \n",
+ " 619 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " Skin - Not Sun Exposed (Suprapubic) | \n",
+ " 604 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " Lung | \n",
+ " 578 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " Esophagus - Mucosa | \n",
+ " 555 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " Adipose - Visceral (Omentum) | \n",
+ " 541 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " Esophagus - Muscularis | \n",
+ " 515 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " Cells - Cultured fibroblasts | \n",
+ " 504 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " Breast - Mammary Tissue | \n",
+ " 459 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " Heart - Left Ventricle | \n",
+ " 432 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " Artery - Aorta | \n",
+ " 432 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Heart - Atrial Appendage | \n",
+ " 429 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " Colon - Transverse | \n",
+ " 406 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " Esophagus - Gastroesophageal Junction | \n",
+ " 375 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " Colon - Sigmoid | \n",
+ " 373 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " Testis | \n",
+ " 361 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " Stomach | \n",
+ " 359 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " Pancreas | \n",
+ " 328 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " Pituitary | \n",
+ " 283 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " Adrenal Gland | \n",
+ " 258 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " Brain - Cortex | \n",
+ " 255 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " Brain - Caudate (basal ganglia) | \n",
+ " 246 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " Brain - Nucleus accumbens (basal ganglia) | \n",
+ " 246 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " Prostate | \n",
+ " 245 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " Spleen | \n",
+ " 241 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " Brain - Cerebellum | \n",
+ " 241 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Artery - Coronary | \n",
+ " 240 | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " Liver | \n",
+ " 226 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " Brain - Cerebellar Hemisphere | \n",
+ " 215 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Brain - Frontal Cortex (BA9) | \n",
+ " 209 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " Brain - Putamen (basal ganglia) | \n",
+ " 205 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " Brain - Hypothalamus | \n",
+ " 202 | \n",
+ "
\n",
+ " \n",
+ " | 44 | \n",
+ " Brain - Hippocampus | \n",
+ " 197 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " Small Intestine - Terminal Ileum | \n",
+ " 187 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Ovary | \n",
+ " 180 | \n",
+ "
\n",
+ " \n",
+ " | 45 | \n",
+ " Brain - Anterior cingulate cortex (BA24) | \n",
+ " 176 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " Cells - EBV-transformed lymphocytes | \n",
+ " 174 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " Minor Salivary Gland | \n",
+ " 162 | \n",
+ "
\n",
+ " \n",
+ " | 42 | \n",
+ " Brain - Spinal cord (cervical c-1) | \n",
+ " 159 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " Vagina | \n",
+ " 156 | \n",
+ "
\n",
+ " \n",
+ " | 48 | \n",
+ " Brain - Amygdala | \n",
+ " 152 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " Uterus | \n",
+ " 142 | \n",
+ "
\n",
+ " \n",
+ " | 46 | \n",
+ " Brain - Substantia nigra | \n",
+ " 139 | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " Kidney - Cortex | \n",
+ " 85 | \n",
+ "
\n",
+ " \n",
+ " | 52 | \n",
+ " Bladder | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " | 51 | \n",
+ " Cervix - Endocervix | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 50 | \n",
+ " Fallopian Tube | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 49 | \n",
+ " Cervix - Ectocervix | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 53 | \n",
+ " Kidney - Medulla | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 54 | \n",
+ " Cells - Leukemia cell line (CML) | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tissue sample_size\n",
+ "3 Muscle - Skeletal 803\n",
+ "0 Whole Blood 755\n",
+ "29 Skin - Sun Exposed (Lower leg) 701\n",
+ "4 Artery - Tibial 663\n",
+ "2 Adipose - Subcutaneous 663\n",
+ "16 Thyroid 653\n",
+ "30 Nerve - Tibial 619\n",
+ "12 Skin - Not Sun Exposed (Suprapubic) 604\n",
+ "17 Lung 578\n",
+ "21 Esophagus - Mucosa 555\n",
+ "7 Adipose - Visceral (Omentum) 541\n",
+ "20 Esophagus - Muscularis 515\n",
+ "34 Cells - Cultured fibroblasts 504\n",
+ "11 Breast - Mammary Tissue 459\n",
+ "31 Heart - Left Ventricle 432\n",
+ "35 Artery - Aorta 432\n",
+ "6 Heart - Atrial Appendage 429\n",
+ "26 Colon - Transverse 406\n",
+ "22 Esophagus - Gastroesophageal Junction 375\n",
+ "24 Colon - Sigmoid 373\n",
+ "28 Testis 361\n",
+ "23 Stomach 359\n",
+ "19 Pancreas 328\n",
+ "32 Pituitary 283\n",
+ "15 Adrenal Gland 258\n",
+ "14 Brain - Cortex 255\n",
+ "38 Brain - Caudate (basal ganglia) 246\n",
+ "39 Brain - Nucleus accumbens (basal ganglia) 246\n",
+ "27 Prostate 245\n",
+ "18 Spleen 241\n",
+ "33 Brain - Cerebellum 241\n",
+ "5 Artery - Coronary 240\n",
+ "43 Liver 226\n",
+ "37 Brain - Cerebellar Hemisphere 215\n",
+ "1 Brain - Frontal Cortex (BA9) 209\n",
+ "40 Brain - Putamen (basal ganglia) 205\n",
+ "41 Brain - Hypothalamus 202\n",
+ "44 Brain - Hippocampus 197\n",
+ "25 Small Intestine - Terminal Ileum 187\n",
+ "8 Ovary 180\n",
+ "45 Brain - Anterior cingulate cortex (BA24) 176\n",
+ "36 Cells - EBV-transformed lymphocytes 174\n",
+ "13 Minor Salivary Gland 162\n",
+ "42 Brain - Spinal cord (cervical c-1) 159\n",
+ "10 Vagina 156\n",
+ "48 Brain - Amygdala 152\n",
+ "9 Uterus 142\n",
+ "46 Brain - Substantia nigra 139\n",
+ "47 Kidney - Cortex 85\n",
+ "52 Bladder 21\n",
+ "51 Cervix - Endocervix 10\n",
+ "50 Fallopian Tube 9\n",
+ "49 Cervix - Ectocervix 9\n",
+ "53 Kidney - Medulla 4\n",
+ "54 Cells - Leukemia cell line (CML) 0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "tissue_sample_size = pd.DataFrame(\n",
+ " [{\"tissue\": k, \"sample_size\": len(v)} for k, v in sample_ids_by_tissue.items()]\n",
+ ")\n",
+ "\n",
+ "tissue_sample_size = tissue_sample_size.sort_values(\"sample_size\", ascending=False)\n",
+ "display(tissue_sample_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Simple validations\n",
+ "_tmp = tissue_sample_size.set_index(\"tissue\").squeeze()\n",
+ "assert _tmp.loc[\"Muscle - Skeletal\"] == 803\n",
+ "assert _tmp.loc[\"Whole Blood\"] == 755\n",
+ "assert _tmp.loc[\"Skin - Not Sun Exposed (Suprapubic)\"] == 604\n",
+ "assert _tmp.loc[\"Kidney - Medulla\"] == 4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "These numbers match those you can find here: https://gtexportal.org/home/tissueSummaryPage#sampleCountsPerTissue"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Split expression data by tissue"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Cells - Leukemia cell line (CML): 100%|█████████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 4357.51it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Skipping Muscle - Skeletal - file already exists\n",
+ "Skipping Whole Blood - file already exists\n",
+ "Skipping Skin - Sun Exposed (Lower leg) - file already exists\n",
+ "Skipping Artery - Tibial - file already exists\n",
+ "Skipping Adipose - Subcutaneous - file already exists\n",
+ "Skipping Thyroid - file already exists\n",
+ "Skipping Nerve - Tibial - file already exists\n",
+ "Skipping Skin - Not Sun Exposed (Suprapubic) - file already exists\n",
+ "Skipping Lung - file already exists\n",
+ "Skipping Esophagus - Mucosa - file already exists\n",
+ "Skipping Adipose - Visceral (Omentum) - file already exists\n",
+ "Skipping Esophagus - Muscularis - file already exists\n",
+ "Skipping Cells - Cultured fibroblasts - file already exists\n",
+ "Skipping Breast - Mammary Tissue - file already exists\n",
+ "Skipping Heart - Left Ventricle - file already exists\n",
+ "Skipping Artery - Aorta - file already exists\n",
+ "Skipping Heart - Atrial Appendage - file already exists\n",
+ "Skipping Colon - Transverse - file already exists\n",
+ "Skipping Esophagus - Gastroesophageal Junction - file already exists\n",
+ "Skipping Colon - Sigmoid - file already exists\n",
+ "Skipping Testis - file already exists\n",
+ "Skipping Stomach - file already exists\n",
+ "Skipping Pancreas - file already exists\n",
+ "Skipping Pituitary - file already exists\n",
+ "Skipping Adrenal Gland - file already exists\n",
+ "Skipping Brain - Cortex - file already exists\n",
+ "Skipping Brain - Caudate (basal ganglia) - file already exists\n",
+ "Skipping Brain - Nucleus accumbens (basal ganglia) - file already exists\n",
+ "Skipping Prostate - file already exists\n",
+ "Skipping Spleen - file already exists\n",
+ "Skipping Brain - Cerebellum - file already exists\n",
+ "Skipping Artery - Coronary - file already exists\n",
+ "Skipping Liver - file already exists\n",
+ "Skipping Brain - Cerebellar Hemisphere - file already exists\n",
+ "Skipping Brain - Frontal Cortex (BA9) - file already exists\n",
+ "Skipping Brain - Putamen (basal ganglia) - file already exists\n",
+ "Skipping Brain - Hypothalamus - file already exists\n",
+ "Skipping Brain - Hippocampus - file already exists\n",
+ "Skipping Small Intestine - Terminal Ileum - file already exists\n",
+ "Skipping Ovary - file already exists\n",
+ "Skipping Brain - Anterior cingulate cortex (BA24) - file already exists\n",
+ "Skipping Cells - EBV-transformed lymphocytes - file already exists\n",
+ "Skipping Minor Salivary Gland - file already exists\n",
+ "Skipping Brain - Spinal cord (cervical c-1) - file already exists\n",
+ "Skipping Vagina - file already exists\n",
+ "Skipping Brain - Amygdala - file already exists\n",
+ "Skipping Uterus - file already exists\n",
+ "Skipping Brain - Substantia nigra - file already exists\n",
+ "Skipping Kidney - Cortex - file already exists\n",
+ "Skipping Bladder - file already exists\n",
+ "Skipping Cervix - Endocervix - file already exists\n",
+ "Skipping Fallopian Tube - file already exists\n",
+ "Skipping Cervix - Ectocervix - file already exists\n",
+ "Skipping Kidney - Medulla - file already exists\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "TISSUE_DATA_DIR = ANALYSIS_DIR / \"data_by_tissue\"\n",
+ "TISSUE_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "pbar = tqdm(tissue_sample_size[\"tissue\"])\n",
+ "\n",
+ "gene_id_symbol_map_tuples = set()\n",
+ "\n",
+ "for tissue_name in pbar:\n",
+ " pbar.set_description(tissue_name)\n",
+ "\n",
+ " tissue_ids = sample_ids_by_tissue[tissue_name]\n",
+ " if len(tissue_ids) == 0:\n",
+ " continue\n",
+ "\n",
+ " # Generate output filename\n",
+ " tissue_name_simple = simplify_string(simplify_string(tissue_name.lower()))\n",
+ " output_file = TISSUE_DATA_DIR / f\"gtex_v8_data_{tissue_name_simple}.pkl\"\n",
+ " output_gene_mappings = ANALYSIS_DIR / \"gtex_gene_id_symbol_mappings.pkl\"\n",
+ " \n",
+ " # Skip if file already exists\n",
+ " if output_file.exists() and output_gene_mappings.exists():\n",
+ " print(f\"Skipping {tissue_name} - file already exists\")\n",
+ " continue\n",
+ "\n",
+ " try:\n",
+ " tissue_data = pd.read_csv(\n",
+ " file_paths[\"gtex_all_sample_ids_with_expr_data\"],\n",
+ " sep=\"\\t\",\n",
+ " skiprows=2,\n",
+ " usecols=[\"Name\", \"Description\"] + tissue_ids,\n",
+ " )\n",
+ "\n",
+ " tissue_data = tissue_data.rename(\n",
+ " columns={\n",
+ " \"Name\": \"gene_ens_id\",\n",
+ " \"Description\": \"gene_symbol\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " # Validate data before processing\n",
+ " if tissue_data.empty:\n",
+ " print(f\"Warning: No data found for {tissue_name}\")\n",
+ " continue\n",
+ "\n",
+ " # add gene id / gene symbol to mapping variable\n",
+ " gene_id_symbol_map_tuples.update(\n",
+ " tissue_data[[\"gene_ens_id\", \"gene_symbol\"]].itertuples(index=False)\n",
+ " )\n",
+ "\n",
+ " tissue_data = tissue_data.drop(columns=[\"gene_symbol\"]).set_index(\"gene_ens_id\")\n",
+ "\n",
+ " # Data quality checks\n",
+ " assert not tissue_data.isna().any().any(), f\"NaN values found in {tissue_name}\"\n",
+ " assert tissue_data.index.is_unique, f\"Non-unique gene IDs in {tissue_name}\"\n",
+ " assert tissue_data.columns.is_unique, f\"Non-unique sample IDs in {tissue_name}\"\n",
+ "\n",
+ " # save\n",
+ " tissue_data.to_pickle(path=output_file)\n",
+ " \n",
+ " except Exception as e:\n",
+ " print(f\"Error processing {tissue_name}: {str(e)}\")\n",
+ " continue"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Simple validations\n",
+ "_tmp = pd.read_pickle(TISSUE_DATA_DIR / \"gtex_v8_data_brain_cerebellar_hemisphere.pkl\")\n",
+ "\n",
+ "assert \"GTEX-11DXY-0011-R11a-SM-DNZZN\" in _tmp.columns\n",
+ "assert \"GTEX-WL46-0011-R11A-SM-3MJFT\" in _tmp.columns\n",
+ "assert \"GTEX-ZF28-0011-R11a-SM-4WWEI\" in _tmp.columns\n",
+ "\n",
+ "_v = _tmp.loc[\"ENSG00000223972.5\", \"GTEX-11DXY-0011-R11a-SM-DNZZN\"]\n",
+ "assert _v == 0.04045, _v\n",
+ "_v = _tmp.loc[\"ENSG00000278267.1\", \"GTEX-11DXY-0011-R11a-SM-DNZZN\"]\n",
+ "assert _v == 0.0, _v\n",
+ "\n",
+ "_v = _tmp.loc[\"ENSG00000233327.10\", \"GTEX-WL46-0011-R11A-SM-3MJFT\"]\n",
+ "assert _v == 146.4000, _v\n",
+ "_v = _tmp.loc[\"ENSG00000237118.2\", \"GTEX-WL46-0011-R11A-SM-3MJFT\"]\n",
+ "assert _v == 0.3357, _v\n",
+ "\n",
+ "_v = _tmp.loc[\"ENSG00000233327.10\", \"GTEX-ZF28-0011-R11a-SM-4WWEI\"]\n",
+ "assert _v == 30.7200, _v\n",
+ "_v = _tmp.loc[\"ENSG00000186907.7\", \"GTEX-ZF28-0011-R11a-SM-4WWEI\"]\n",
+ "assert _v == 0.94720, _v"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Save gene mappings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded existing gene mappings from /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n",
+ "gene_mappings.shape: (56200, 2)\n",
+ " gene_ens_id gene_symbol\n",
+ "0 ENSG00000144278.14 GALNT13\n",
+ "1 ENSG00000260976.1 LINC01633\n",
+ "2 ENSG00000186660.14 ZFP91\n",
+ "3 ENSG00000123560.13 PLP1\n",
+ "4 ENSG00000227371.1 RP11-3L10.2\n"
+ ]
+ }
+ ],
+ "source": [
+ "output_gene_mappings = ANALYSIS_DIR / \"gtex_gene_id_symbol_mappings.pkl\"\n",
+ "\n",
+ "if output_gene_mappings.exists():\n",
+ " gene_mappings = pd.read_pickle(output_gene_mappings)\n",
+ " print(f\"Loaded existing gene mappings from {output_gene_mappings}\")\n",
+ "else:\n",
+ " gene_mappings = pd.DataFrame(gene_id_symbol_map_tuples)\n",
+ " gene_mappings.to_pickle(output_gene_mappings)\n",
+ " print(f\"Created and saved gene mappings to {output_gene_mappings}\")\n",
+ "\n",
+ "print(f\"gene_mappings.shape: {gene_mappings.shape}\")\n",
+ "print(gene_mappings.head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Simple validations\n",
+ "# no null\n",
+ "assert gene_mappings.dropna(how=\"any\").shape == gene_mappings.shape\n",
+ "# no duplicates\n",
+ "assert gene_mappings.drop_duplicates().shape == gene_mappings.shape\n",
+ "\n",
+ "_tmp = gene_mappings.set_index(\"gene_ens_id\").squeeze()\n",
+ "assert _tmp.loc[\"ENSG00000223972.5\"] == \"DDX11L1\"\n",
+ "assert _tmp.loc[\"ENSG00000243485.5\"] == \"MIR1302-2HG\"\n",
+ "assert _tmp.loc[\"ENSG00000274059.1\"] == \"5S_rRNA\" # repeated gene\n",
+ "assert _tmp.loc[\"ENSG00000275305.1\"] == \"5S_rRNA\" # repeated gene"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Compute correlation coefficients"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We provide a command-line tool for computing CCC, Spearman, and Pearson correlations between two genes in a given tissue.\n",
+ "\n",
+ "```bash\n",
+ "usage: compute_single_gene_pair_correlations_cli.py [-h] [--tissue TISSUE] [--data-dir DATA_DIR] [--gene-mapping GENE_MAPPING] [--list-tissues] [--show-genes TISSUE] [--n-genes N_GENES] [--debug] [genes ...]\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 11:33:38,498 - root] INFO: Loading tissue data from: /mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue/gtex_v8_data_whole_blood.pkl\n",
+ "[2025-09-25 11:33:38,644 - root] INFO: Tissue data shape: (56200, 755)\n",
+ "[2025-09-25 11:33:38,644 - root] INFO: Loading gene mapping from: /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n",
+ "[2025-09-25 11:33:38,649 - root] INFO: Loaded 56200 gene mappings\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "=== Tissue: whole_blood ===\n",
+ "Total genes: 56,200\n",
+ "Total samples: 755\n",
+ "\n",
+ "First 20 genes:\n",
+ "------------------------------------------------------------\n",
+ "# Gene Symbol Ensembl ID \n",
+ "------------------------------------------------------------\n",
+ "1 DDX11L1 ENSG00000223972.5 \n",
+ "2 WASH7P ENSG00000227232.5 \n",
+ "3 MIR6859-1 ENSG00000278267.1 \n",
+ "4 MIR1302-2HG ENSG00000243485.5 \n",
+ "5 FAM138A ENSG00000237613.2 \n",
+ "6 OR4G4P ENSG00000268020.3 \n",
+ "7 OR4G11P ENSG00000240361.1 \n",
+ "8 OR4F5 ENSG00000186092.4 \n",
+ "9 RP11-34P13.7 ENSG00000238009.6 \n",
+ "10 CICP27 ENSG00000233750.3 \n",
+ "11 RP11-34P13.15 ENSG00000268903.1 \n",
+ "12 RP11-34P13.16 ENSG00000269981.1 \n",
+ "13 RP11-34P13.14 ENSG00000239906.1 \n",
+ "14 RP11-34P13.13 ENSG00000241860.6 \n",
+ "15 RNU6-1100P ENSG00000222623.1 \n",
+ "16 RP11-34P13.9 ENSG00000241599.1 \n",
+ "17 ABC7-43046700E7.1 ENSG00000279928.2 \n",
+ "18 RP11-34P13.18 ENSG00000279457.4 \n",
+ "19 MIR6859-2 ENSG00000273874.1 \n",
+ "20 AP006222.2 ENSG00000228463.9 \n",
+ "... and 56,180 more genes\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Make sure you start the notebook from the ROOT directory of the project\n",
+ "\n",
+ "# Preview genes in a tissue\n",
+ "%run ./nbs/common/compute_single_gene_pair_correlations_cli.py --show-genes whole_blood --data-dir {TISSUE_DATA_DIR} --gene-mapping {ANALYSIS_DIR}/gtex_gene_id_symbol_mappings.pkl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 11:33:38,676 - root] INFO: Loading gene mapping from: /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n",
+ "[2025-09-25 11:33:38,681 - root] INFO: Loaded 56200 gene mappings\n",
+ "[2025-09-25 11:33:38,686 - root] INFO: Loading tissue data from: /mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue/gtex_v8_data_whole_blood.pkl\n",
+ "[2025-09-25 11:33:38,824 - root] INFO: Tissue data shape: (56200, 755)\n",
+ "[2025-09-25 11:33:38,827 - root] INFO: Computing correlations for 755 samples\n",
+ "[2025-09-25 11:33:38,832 - root] INFO: Computing CCC correlation...\n",
+ "[2025-09-25 11:33:38,857 - root] INFO: Computing Pearson correlation...\n",
+ "[2025-09-25 11:33:38,871 - root] INFO: Computing Spearman correlation...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "============================================================\n",
+ "GENE PAIR CORRELATION RESULTS\n",
+ "============================================================\n",
+ "Gene 1: DDX11L1 (ENSG00000223972.5)\n",
+ "Gene 2: WASH7P (ENSG00000227232.5)\n",
+ "Tissue: whole_blood\n",
+ "Samples: 755\n",
+ "------------------------------------------------------------\n",
+ " CCC: 0.005060\n",
+ " PEARSON: 0.063041\n",
+ " SPEARMAN: 0.040069\n",
+ "============================================================\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Compute CCC, Spearman, and Pearson correlations between two genes in a given tissue\n",
+ "%run ./nbs/common/compute_single_gene_pair_correlations_cli.py DDX11L1 WASH7P --tissue whole_blood --data-dir {TISSUE_DATA_DIR} --gene-mapping {ANALYSIS_DIR}/gtex_gene_id_symbol_mappings.pkl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Metadata Correlation\n",
+ "We will compute the correlation between the gene expression and the metadata for each tissue. Metadata is downloaded from: https://www.gtexportal.org/home/downloads/adult-gtex/metadata"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Preparation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(22951, 62)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load GTEx samples info\n",
+ "gtex_samples = pd.read_csv(file_paths[\"gtex_sample_attrs\"], sep=\"\\t\", index_col=\"SAMPID\")\n",
+ "print(gtex_samples.shape)\n",
+ "assert gtex_samples.index.is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(980, 4)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load GTEx subject attributes (one row per subject: SUBJID, SEX, AGE, DTHHRDY)\n",
+ "gtex_phenotypes = pd.read_csv(file_paths[\"gtex_subject_attrs\"], sep=\"\\t\")\n",
+ "print(gtex_phenotypes.shape)\n",
+ "# The table is read with the default RangeIndex, which is unique by construction,\n",
+ "# so check the subject ID column instead: duplicated SUBJID values would duplicate\n",
+ "# samples when this table is merged into the sample metadata below.\n",
+ "assert gtex_phenotypes[\"SUBJID\"].is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['GTEX-1117F-0003-SM-58Q7G', 'GTEX-1117F-0003-SM-5DWSB', 'GTEX-1117F-0003-SM-6WBT7', 'GTEX-1117F-0011-R10a-SM-AHZ7F', 'GTEX-1117F-0011-R10b-SM-CYKQ8']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Collect the sample IDs (the SAMPID index) as a plain Python list\n",
+ "gtex_samples_ids = gtex_samples.index.tolist()\n",
+ "print(gtex_samples_ids[:5])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 GTEX-1117F-0003-SM-58Q7G\n",
+ "1 GTEX-1117F-0003-SM-5DWSB\n",
+ "2 GTEX-1117F-0003-SM-6WBT7\n",
+ "3 GTEX-1117F-0011-R10a-SM-AHZ7F\n",
+ "4 GTEX-1117F-0011-R10b-SM-CYKQ8\n",
+ " ... \n",
+ "22946 K-562-SM-E9EZC\n",
+ "22947 K-562-SM-E9EZI\n",
+ "22948 K-562-SM-E9EZO\n",
+ "22949 K-562-SM-E9EZT\n",
+ "22950 K-562-SM-E9EZZ\n",
+ "Name: SAMPID, Length: 22951, dtype: object"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Wrap the sample IDs in a Series named after the sample ID column\n",
+ "gtex_samples_ids = pd.Series(gtex_samples_ids, name=\"SAMPID\")\n",
+ "gtex_samples_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 GTEX-1117F\n",
+ "1 GTEX-1117F\n",
+ "2 GTEX-1117F\n",
+ "3 GTEX-1117F\n",
+ "4 GTEX-1117F\n",
+ " ... \n",
+ "22946 K-562\n",
+ "22947 K-562\n",
+ "22948 K-562\n",
+ "22949 K-562\n",
+ "22950 K-562\n",
+ "Name: SUBJID, Length: 22951, dtype: object"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The subject ID is the first two dash-separated tokens of the sample ID\n",
+ "# (e.g. GTEX-1117F-0003-SM-58Q7G -> GTEX-1117F, K-562-SM-E9EZC -> K-562).\n",
+ "gtex_subjects_ids = gtex_samples_ids.str.extract(\n",
+ "    r\"(\\w+-\\w+)\", expand=False\n",
+ ").rename(\"SUBJID\")\n",
+ "\n",
+ "# extract() yields NaN for IDs that do not match the pattern; fail loudly here,\n",
+ "# otherwise the merge with the phenotype table below would silently drop them.\n",
+ "assert not gtex_subjects_ids.isna().any()\n",
+ "\n",
+ "gtex_subjects_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SAMPID | \n",
+ " SUBJID | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " GTEX-1117F-0003-SM-58Q7G | \n",
+ " GTEX-1117F | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " GTEX-1117F-0003-SM-5DWSB | \n",
+ " GTEX-1117F | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " GTEX-1117F-0003-SM-6WBT7 | \n",
+ " GTEX-1117F | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " GTEX-1117F-0011-R10a-SM-AHZ7F | \n",
+ " GTEX-1117F | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " GTEX-1117F-0011-R10b-SM-CYKQ8 | \n",
+ " GTEX-1117F | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 22946 | \n",
+ " K-562-SM-E9EZC | \n",
+ " K-562 | \n",
+ "
\n",
+ " \n",
+ " | 22947 | \n",
+ " K-562-SM-E9EZI | \n",
+ " K-562 | \n",
+ "
\n",
+ " \n",
+ " | 22948 | \n",
+ " K-562-SM-E9EZO | \n",
+ " K-562 | \n",
+ "
\n",
+ " \n",
+ " | 22949 | \n",
+ " K-562-SM-E9EZT | \n",
+ " K-562 | \n",
+ "
\n",
+ " \n",
+ " | 22950 | \n",
+ " K-562-SM-E9EZZ | \n",
+ " K-562 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
22951 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SAMPID SUBJID\n",
+ "0 GTEX-1117F-0003-SM-58Q7G GTEX-1117F\n",
+ "1 GTEX-1117F-0003-SM-5DWSB GTEX-1117F\n",
+ "2 GTEX-1117F-0003-SM-6WBT7 GTEX-1117F\n",
+ "3 GTEX-1117F-0011-R10a-SM-AHZ7F GTEX-1117F\n",
+ "4 GTEX-1117F-0011-R10b-SM-CYKQ8 GTEX-1117F\n",
+ "... ... ...\n",
+ "22946 K-562-SM-E9EZC K-562\n",
+ "22947 K-562-SM-E9EZI K-562\n",
+ "22948 K-562-SM-E9EZO K-562\n",
+ "22949 K-562-SM-E9EZT K-562\n",
+ "22950 K-562-SM-E9EZZ K-562\n",
+ "\n",
+ "[22951 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Build the sample -> subject lookup table (columns: SAMPID, SUBJID)\n",
+ "gtex_metadata = pd.concat(\n",
+ "    [gtex_samples_ids, gtex_subjects_ids],\n",
+ "    axis=\"columns\",\n",
+ ")\n",
+ "gtex_metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SUBJID | \n",
+ " SEX | \n",
+ " AGE | \n",
+ " DTHHRDY | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " GTEX-1117F | \n",
+ " 2 | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " GTEX-111CU | \n",
+ " 1 | \n",
+ " 50-59 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " GTEX-111FC | \n",
+ " 1 | \n",
+ " 60-69 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " GTEX-111VG | \n",
+ " 1 | \n",
+ " 60-69 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " GTEX-111YS | \n",
+ " 1 | \n",
+ " 60-69 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 975 | \n",
+ " GTEX-ZYY3 | \n",
+ " 2 | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | 976 | \n",
+ " GTEX-ZZ64 | \n",
+ " 1 | \n",
+ " 20-29 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 977 | \n",
+ " GTEX-ZZPT | \n",
+ " 1 | \n",
+ " 50-59 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | 978 | \n",
+ " GTEX-ZZPU | \n",
+ " 2 | \n",
+ " 50-59 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 979 | \n",
+ " K-562 | \n",
+ " 2 | \n",
+ " 50-59 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
980 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SUBJID SEX AGE DTHHRDY\n",
+ "0 GTEX-1117F 2 60-69 4.0\n",
+ "1 GTEX-111CU 1 50-59 0.0\n",
+ "2 GTEX-111FC 1 60-69 1.0\n",
+ "3 GTEX-111VG 1 60-69 3.0\n",
+ "4 GTEX-111YS 1 60-69 0.0\n",
+ ".. ... ... ... ...\n",
+ "975 GTEX-ZYY3 2 60-69 4.0\n",
+ "976 GTEX-ZZ64 1 20-29 0.0\n",
+ "977 GTEX-ZZPT 1 50-59 4.0\n",
+ "978 GTEX-ZZPU 2 50-59 0.0\n",
+ "979 K-562 2 50-59 NaN\n",
+ "\n",
+ "[980 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gtex_phenotypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SUBJID | \n",
+ " SEX | \n",
+ " AGE | \n",
+ " DTHHRDY | \n",
+ "
\n",
+ " \n",
+ " | SAMPID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | GTEX-1117F-0003-SM-58Q7G | \n",
+ " GTEX-1117F | \n",
+ " 2 | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0003-SM-5DWSB | \n",
+ " GTEX-1117F | \n",
+ " 2 | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0003-SM-6WBT7 | \n",
+ " GTEX-1117F | \n",
+ " 2 | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0011-R10a-SM-AHZ7F | \n",
+ " GTEX-1117F | \n",
+ " 2 | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0011-R10b-SM-CYKQ8 | \n",
+ " GTEX-1117F | \n",
+ " 2 | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | K-562-SM-E9EZC | \n",
+ " K-562 | \n",
+ " 2 | \n",
+ " 50-59 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | K-562-SM-E9EZI | \n",
+ " K-562 | \n",
+ " 2 | \n",
+ " 50-59 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | K-562-SM-E9EZO | \n",
+ " K-562 | \n",
+ " 2 | \n",
+ " 50-59 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | K-562-SM-E9EZT | \n",
+ " K-562 | \n",
+ " 2 | \n",
+ " 50-59 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | K-562-SM-E9EZZ | \n",
+ " K-562 | \n",
+ " 2 | \n",
+ " 50-59 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
22951 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SUBJID SEX AGE DTHHRDY\n",
+ "SAMPID \n",
+ "GTEX-1117F-0003-SM-58Q7G GTEX-1117F 2 60-69 4.0\n",
+ "GTEX-1117F-0003-SM-5DWSB GTEX-1117F 2 60-69 4.0\n",
+ "GTEX-1117F-0003-SM-6WBT7 GTEX-1117F 2 60-69 4.0\n",
+ "GTEX-1117F-0011-R10a-SM-AHZ7F GTEX-1117F 2 60-69 4.0\n",
+ "GTEX-1117F-0011-R10b-SM-CYKQ8 GTEX-1117F 2 60-69 4.0\n",
+ "... ... ... ... ...\n",
+ "K-562-SM-E9EZC K-562 2 50-59 NaN\n",
+ "K-562-SM-E9EZI K-562 2 50-59 NaN\n",
+ "K-562-SM-E9EZO K-562 2 50-59 NaN\n",
+ "K-562-SM-E9EZT K-562 2 50-59 NaN\n",
+ "K-562-SM-E9EZZ K-562 2 50-59 NaN\n",
+ "\n",
+ "[22951 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 94,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Attach subject-level phenotypes to every sample. The join key is spelled out so\n",
+ "# that an unexpected shared column cannot silently become part of the key, and\n",
+ "# validate=\"many_to_one\" makes pandas raise if a subject appears more than once\n",
+ "# in the phenotype table (which would duplicate samples).\n",
+ "gtex_metadata = pd.merge(\n",
+ "    gtex_metadata,\n",
+ "    gtex_phenotypes,\n",
+ "    on=\"SUBJID\",\n",
+ "    how=\"inner\",\n",
+ "    validate=\"many_to_one\",\n",
+ ").set_index(\"SAMPID\")\n",
+ "gtex_metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SUBJID | \n",
+ " SEX | \n",
+ " AGE | \n",
+ " DTHHRDY | \n",
+ " SMATSSCR | \n",
+ " SMCENTER | \n",
+ " SMPTHNTS | \n",
+ " SMRIN | \n",
+ " SMTS | \n",
+ " SMTSD | \n",
+ " ... | \n",
+ " SME1ANTI | \n",
+ " SMSPLTRD | \n",
+ " SMBSMMRT | \n",
+ " SME1SNSE | \n",
+ " SME1PCTS | \n",
+ " SMRRNART | \n",
+ " SME1MPRT | \n",
+ " SMNUM5CD | \n",
+ " SMDPMPRT | \n",
+ " SME2PCTS | \n",
+ "
\n",
+ " \n",
+ " | SAMPID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | GTEX-1117F-0003-SM-58Q7G | \n",
+ " GTEX-1117F | \n",
+ " Female | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " B1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Blood | \n",
+ " Whole Blood | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0003-SM-5DWSB | \n",
+ " GTEX-1117F | \n",
+ " Female | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " B1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Blood | \n",
+ " Whole Blood | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0003-SM-6WBT7 | \n",
+ " GTEX-1117F | \n",
+ " Female | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " B1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Blood | \n",
+ " Whole Blood | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0011-R10a-SM-AHZ7F | \n",
+ " GTEX-1117F | \n",
+ " Female | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " B1, A1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Brain | \n",
+ " Brain - Frontal Cortex (BA9) | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | GTEX-1117F-0011-R10b-SM-CYKQ8 | \n",
+ " GTEX-1117F | \n",
+ " Female | \n",
+ " 60-69 | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " B1, A1 | \n",
+ " NaN | \n",
+ " 7.2 | \n",
+ " Brain | \n",
+ " Brain - Frontal Cortex (BA9) | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 66 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SUBJID SEX AGE DTHHRDY SMATSSCR \\\n",
+ "SAMPID \n",
+ "GTEX-1117F-0003-SM-58Q7G GTEX-1117F Female 60-69 4.0 NaN \n",
+ "GTEX-1117F-0003-SM-5DWSB GTEX-1117F Female 60-69 4.0 NaN \n",
+ "GTEX-1117F-0003-SM-6WBT7 GTEX-1117F Female 60-69 4.0 NaN \n",
+ "GTEX-1117F-0011-R10a-SM-AHZ7F GTEX-1117F Female 60-69 4.0 NaN \n",
+ "GTEX-1117F-0011-R10b-SM-CYKQ8 GTEX-1117F Female 60-69 4.0 NaN \n",
+ "\n",
+ " SMCENTER SMPTHNTS SMRIN SMTS \\\n",
+ "SAMPID \n",
+ "GTEX-1117F-0003-SM-58Q7G B1 NaN NaN Blood \n",
+ "GTEX-1117F-0003-SM-5DWSB B1 NaN NaN Blood \n",
+ "GTEX-1117F-0003-SM-6WBT7 B1 NaN NaN Blood \n",
+ "GTEX-1117F-0011-R10a-SM-AHZ7F B1, A1 NaN NaN Brain \n",
+ "GTEX-1117F-0011-R10b-SM-CYKQ8 B1, A1 NaN 7.2 Brain \n",
+ "\n",
+ " SMTSD ... SME1ANTI \\\n",
+ "SAMPID ... \n",
+ "GTEX-1117F-0003-SM-58Q7G Whole Blood ... NaN \n",
+ "GTEX-1117F-0003-SM-5DWSB Whole Blood ... NaN \n",
+ "GTEX-1117F-0003-SM-6WBT7 Whole Blood ... NaN \n",
+ "GTEX-1117F-0011-R10a-SM-AHZ7F Brain - Frontal Cortex (BA9) ... NaN \n",
+ "GTEX-1117F-0011-R10b-SM-CYKQ8 Brain - Frontal Cortex (BA9) ... NaN \n",
+ "\n",
+ " SMSPLTRD SMBSMMRT SME1SNSE SME1PCTS SMRRNART \\\n",
+ "SAMPID \n",
+ "GTEX-1117F-0003-SM-58Q7G NaN NaN NaN NaN NaN \n",
+ "GTEX-1117F-0003-SM-5DWSB NaN NaN NaN NaN NaN \n",
+ "GTEX-1117F-0003-SM-6WBT7 NaN NaN NaN NaN NaN \n",
+ "GTEX-1117F-0011-R10a-SM-AHZ7F NaN NaN NaN NaN NaN \n",
+ "GTEX-1117F-0011-R10b-SM-CYKQ8 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " SME1MPRT SMNUM5CD SMDPMPRT SME2PCTS \n",
+ "SAMPID \n",
+ "GTEX-1117F-0003-SM-58Q7G NaN NaN NaN NaN \n",
+ "GTEX-1117F-0003-SM-5DWSB NaN NaN NaN NaN \n",
+ "GTEX-1117F-0003-SM-6WBT7 NaN NaN NaN NaN \n",
+ "GTEX-1117F-0011-R10a-SM-AHZ7F NaN NaN NaN NaN \n",
+ "GTEX-1117F-0011-R10b-SM-CYKQ8 NaN NaN NaN NaN \n",
+ "\n",
+ "[5 rows x 66 columns]"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Add the per-sample attributes (joined on the SAMPID index), recode SEX from its\n",
+ "# numeric codes (1/2) to labels, and order the rows by sample ID\n",
+ "gtex_metadata = (\n",
+ "    pd.merge(gtex_metadata, gtex_samples, left_index=True, right_index=True)\n",
+ "    .replace({\"SEX\": {1: \"Male\", 2: \"Female\"}})\n",
+ "    .sort_index()\n",
+ ")\n",
+ "\n",
+ "gtex_metadata.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity checks on the merged metadata: the key columns must be fully populated\n",
+ "assert not gtex_metadata[\"SUBJID\"].isna().to_numpy().any()\n",
+ "assert not gtex_metadata[\"SMTS\"].isna().to_numpy().any()\n",
+ "assert not gtex_metadata[\"SMTSD\"].isna().to_numpy().any()\n",
+ "assert not gtex_metadata[\"SEX\"].isna().to_numpy().any()\n",
+ "\n",
+ "# SEX must hold exactly the two recoded labels\n",
+ "assert len(gtex_metadata[\"SEX\"].unique()) == 2\n",
+ "assert {\"Female\", \"Male\"} == set(gtex_metadata[\"SEX\"].unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Persist the merged sample/subject metadata as a pickle under ANALYSIS_DIR\n",
+ "gtex_metadatadata_filename = ANALYSIS_DIR / \"gtex_v8-sample_metadata.pkl\"\n",
+ "pd.to_pickle(gtex_metadata, gtex_metadatadata_filename)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Metadata correlation\n",
+ "We also provide a command-line tool `nbs/common/metadata_corr_cli.py` for computing the correlation between the gene expression and the metadata for each tissue.\n",
+ "\n",
+ "```bash\n",
+ "usage: metadata_corr_cli.py [-h] [--expr-data-dir EXPR_DATA_DIR] [--include [INCLUDE ...]] [--exclude [EXCLUDE ...]] [--permutations PERMUTATIONS]\n",
+ " [--n-jobs N_JOBS] [--list-metadata-columns] [--list-tissues] [--output-dir OUTPUT_DIR] [--quiet] [--no-csv-output]\n",
+ " [--no-individual-logs] [--data-dir DATA_DIR]\n",
+ " gene_symbols [gene_symbols ...]\n",
+ "\n",
+ "Analyze gene expression correlations with metadata using CCC across multiple tissues\n",
+ "\n",
+ "positional arguments:\n",
+ " gene_symbols Gene symbol(s) to analyze (e.g., RASSF2 TP53 BRCA1)\n",
+ "\n",
+ "options:\n",
+ " -h, --help show this help message and exit\n",
+ " --expr-data-dir EXPR_DATA_DIR\n",
+ " Directory containing expression data files (default: /pividori_lab/haoyu_projects/ccc-gpu/data/gtex/gene_selection/all)\n",
+ " --include [INCLUDE ...]\n",
+ " Include only tissues matching these patterns (fuzzy match on tissue name) (default: None)\n",
+ " --exclude [EXCLUDE ...]\n",
+ " Exclude tissues matching these patterns (fuzzy match on tissue name) (default: None)\n",
+ " --permutations PERMUTATIONS\n",
+ " Number of permutations for p-value calculation (default: 100000)\n",
+ " --n-jobs N_JOBS Number of parallel jobs for computation (default: 4)\n",
+ " --list-metadata-columns\n",
+ " List available metadata columns and exit (default: False)\n",
+ " --list-tissues List available tissue files and exit (default: False)\n",
+ " --output-dir OUTPUT_DIR\n",
+ " Directory to save output files (default: current directory) (default: .)\n",
+ " --quiet Reduce output verbosity for batch processing (default: False)\n",
+ " --no-csv-output Skip CSV file generation (only create pickle files) (default: False)\n",
+ " --no-individual-logs Skip individual tissue log files (only keep summary logs) (default: False)\n",
+ " --data-dir DATA_DIR Directory containing GTEx data files (metadata and gene mappings) (default: /pividori_lab/haoyu_projects/ccc-gpu/data/gtex)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Directory for the metadata-correlation results; created if it does not exist yet\n",
+ "METADATA_CORRELATIONS_RESULT_DIR = ANALYSIS_DIR / \"metadata_correlations\"\n",
+ "METADATA_CORRELATIONS_RESULT_DIR.mkdir(parents=True, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:17,840 - summary] INFO: Output directory: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations\n",
+ "[2025-09-25 13:05:17,840 - summary] INFO: Summary log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+ "[2025-09-25 13:05:17,840 - summary] INFO: Summary tables file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n",
+ "[2025-09-25 13:05:17,840 - summary] INFO: Gene symbols to analyze: RASSF2, CYTIP\n",
+ "[2025-09-25 13:05:17,857 - summary] INFO: \n",
+ "====================================================================================================\n",
+ "[2025-09-25 13:05:17,858 - summary] INFO: PROCESSING GENE 1/2: RASSF2\n",
+ "[2025-09-25 13:05:17,858 - summary] INFO: ====================================================================================================\n",
+ "[2025-09-25 13:05:17,858 - summary] INFO: \n",
+ "[1/1] Starting processing for RASSF2 in whole_blood...\n",
+ "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: \n",
+ "============================================================\n",
+ "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: Processing tissue: whole_blood\n",
+ "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: File: gtex_v8_data_whole_blood.pkl\n",
+ "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+ "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: ============================================================\n",
+ "[2025-09-25 13:05:17,860 - tissue_RASSF2_whole_blood] INFO: Loading expression data...\n",
+ "[2025-09-25 13:05:18,013 - tissue_RASSF2_whole_blood] INFO: Expression data shape: (56200, 755)\n",
+ "[2025-09-25 13:05:18,016 - tissue_RASSF2_whole_blood] INFO: Gene ID for RASSF2: ENSG00000101265.15\n",
+ "[2025-09-25 13:05:18,019 - tissue_RASSF2_whole_blood] INFO: Number of samples: 755\n",
+ "[2025-09-25 13:05:18,021 - tissue_RASSF2_whole_blood] INFO: Common samples: 755\n",
+ "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Computing CCC between RASSF2 expression and all metadata columns...\n",
+ "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Using 100000 permutations and 4 jobs\n",
+ "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Processing 66 metadata columns...\n",
+ "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Processing column 1/66: SUBJID\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Output directory: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations\n",
+ "Summary log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+ "Summary tables file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n",
+ "Gene symbols to analyze: RASSF2, CYTIP\n",
+ "Found 1 expression files to process:\n",
+ " whole_blood: gtex_v8_data_whole_blood.pkl\n",
+ "Loading metadata and gene mapping files...\n",
+ "Loaded metadata: (22951, 66)\n",
+ "Loaded gene mapping: (56200, 2)\n",
+ "\n",
+ "====================================================================================================\n",
+ "PROCESSING GENE 1/2: RASSF2\n",
+ "====================================================================================================\n",
+ "\n",
+ "[1/1] Starting processing for RASSF2 in whole_blood...\n",
+ "\n",
+ "============================================================\n",
+ "Processing tissue: whole_blood\n",
+ "File: gtex_v8_data_whole_blood.pkl\n",
+ "Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+ "============================================================\n",
+ "Loading expression data...\n",
+ "Expression data shape: (56200, 755)\n",
+ "Gene ID for RASSF2: ENSG00000101265.15\n",
+ "Number of samples: 755\n",
+ "Common samples: 755\n",
+ "Computing CCC between RASSF2 expression and all metadata columns...\n",
+ "Using 100000 permutations and 4 jobs\n",
+ "Processing 66 metadata columns...\n",
+ "Processing column 1/66: SUBJID\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:18,143 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000000, p-value: 1.00e+00\n",
+ "[2025-09-25 13:05:18,144 - tissue_RASSF2_whole_blood] INFO: Processing column 2/66: SEX\n",
+ "[2025-09-25 13:05:18,217 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007134, p-value: 1.23e-02\n",
+ "[2025-09-25 13:05:18,217 - tissue_RASSF2_whole_blood] INFO: Processing column 3/66: AGE\n",
+ "[2025-09-25 13:05:18,291 - tissue_RASSF2_whole_blood] INFO: CCC: 0.039824, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:18,291 - tissue_RASSF2_whole_blood] INFO: Processing column 4/66: DTHHRDY\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.000000, p-value: 1.00e+00\n",
+ "Processing column 2/66: SEX\n",
+ " CCC: 0.007134, p-value: 1.23e-02\n",
+ "Processing column 3/66: AGE\n",
+ " CCC: 0.039824, p-value: 1.00e-05\n",
+ "Processing column 4/66: DTHHRDY\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:18,547 - tissue_RASSF2_whole_blood] INFO: CCC: 0.464582, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Processing column 5/66: SMATSSCR\n",
+ "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Skipping SMATSSCR: all values are NaN\n",
+ "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Processing column 6/66: SMCENTER\n",
+ "[2025-09-25 13:05:18,618 - tissue_RASSF2_whole_blood] INFO: CCC: 0.108148, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:18,618 - tissue_RASSF2_whole_blood] INFO: Processing column 7/66: SMPTHNTS\n",
+ "[2025-09-25 13:05:18,619 - tissue_RASSF2_whole_blood] INFO: Skipping SMPTHNTS: all values are NaN\n",
+ "[2025-09-25 13:05:18,619 - tissue_RASSF2_whole_blood] INFO: Processing column 8/66: SMRIN\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.464582, p-value: 1.00e-05\n",
+ "Processing column 5/66: SMATSSCR\n",
+ " Skipping SMATSSCR: all values are NaN\n",
+ "Processing column 6/66: SMCENTER\n",
+ " CCC: 0.108148, p-value: 1.00e-05\n",
+ "Processing column 7/66: SMPTHNTS\n",
+ " Skipping SMPTHNTS: all values are NaN\n",
+ "Processing column 8/66: SMRIN\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:18,872 - tissue_RASSF2_whole_blood] INFO: CCC: 0.048847, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:18,872 - tissue_RASSF2_whole_blood] INFO: Processing column 9/66: SMTS\n",
+ "[2025-09-25 13:05:18,873 - tissue_RASSF2_whole_blood] INFO: Skipping SMTS: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Processing column 10/66: SMTSD\n",
+ "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Skipping SMTSD: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Processing column 11/66: SMUBRID\n",
+ "[2025-09-25 13:05:18,875 - tissue_RASSF2_whole_blood] INFO: Skipping SMUBRID: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:18,875 - tissue_RASSF2_whole_blood] INFO: Processing column 12/66: SMTSISCH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.048847, p-value: 1.00e-05\n",
+ "Processing column 9/66: SMTS\n",
+ " Skipping SMTS: only 1 unique value(s)\n",
+ "Processing column 10/66: SMTSD\n",
+ " Skipping SMTSD: only 1 unique value(s)\n",
+ "Processing column 11/66: SMUBRID\n",
+ " Skipping SMUBRID: only 1 unique value(s)\n",
+ "Processing column 12/66: SMTSISCH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:19,129 - tissue_RASSF2_whole_blood] INFO: CCC: 0.528125, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Processing column 13/66: SMTSPAX\n",
+ "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Skipping SMTSPAX: all values are NaN\n",
+ "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Processing column 14/66: SMNABTCH\n",
+ "[2025-09-25 13:05:19,194 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000884, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:19,195 - tissue_RASSF2_whole_blood] INFO: Processing column 15/66: SMNABTCHT\n",
+ "[2025-09-25 13:05:19,196 - tissue_RASSF2_whole_blood] INFO: Skipping SMNABTCHT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:19,196 - tissue_RASSF2_whole_blood] INFO: Processing column 16/66: SMNABTCHD\n",
+ "[2025-09-25 13:05:19,259 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000900, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:19,259 - tissue_RASSF2_whole_blood] INFO: Processing column 17/66: SMGEBTCH\n",
+ "[2025-09-25 13:05:19,316 - tissue_RASSF2_whole_blood] INFO: CCC: 0.003663, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:19,316 - tissue_RASSF2_whole_blood] INFO: Processing column 18/66: SMGEBTCHD\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.528125, p-value: 1.00e-05\n",
+ "Processing column 13/66: SMTSPAX\n",
+ " Skipping SMTSPAX: all values are NaN\n",
+ "Processing column 14/66: SMNABTCH\n",
+ " CCC: 0.000884, p-value: 1.00e-05\n",
+ "Processing column 15/66: SMNABTCHT\n",
+ " Skipping SMNABTCHT: only 1 unique value(s)\n",
+ "Processing column 16/66: SMNABTCHD\n",
+ " CCC: 0.000900, p-value: 1.00e-05\n",
+ "Processing column 17/66: SMGEBTCH\n",
+ " CCC: 0.003663, p-value: 1.00e-05\n",
+ "Processing column 18/66: SMGEBTCHD\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:19,374 - tissue_RASSF2_whole_blood] INFO: CCC: 0.005827, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:19,374 - tissue_RASSF2_whole_blood] INFO: Processing column 19/66: SMGEBTCHT\n",
+ "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO: Skipping SMGEBTCHT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO: Processing column 20/66: SMAFRZE\n",
+ "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO: Skipping SMAFRZE: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Processing column 21/66: SMGTC\n",
+ "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Skipping SMGTC: all values are NaN\n",
+ "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Processing column 22/66: SME2MPRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.005827, p-value: 1.00e-05\n",
+ "Processing column 19/66: SMGEBTCHT\n",
+ " Skipping SMGEBTCHT: only 1 unique value(s)\n",
+ "Processing column 20/66: SMAFRZE\n",
+ " Skipping SMAFRZE: only 1 unique value(s)\n",
+ "Processing column 21/66: SMGTC\n",
+ " Skipping SMGTC: all values are NaN\n",
+ "Processing column 22/66: SME2MPRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:19,629 - tissue_RASSF2_whole_blood] INFO: CCC: 0.172974, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:19,629 - tissue_RASSF2_whole_blood] INFO: Processing column 23/66: SMCHMPRS\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.172974, p-value: 1.00e-05\n",
+ "Processing column 23/66: SMCHMPRS\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:19,882 - tissue_RASSF2_whole_blood] INFO: CCC: 0.143365, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:19,882 - tissue_RASSF2_whole_blood] INFO: Processing column 24/66: SMNTRART\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.143365, p-value: 1.00e-05\n",
+ "Processing column 24/66: SMNTRART\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:20,136 - tissue_RASSF2_whole_blood] INFO: CCC: 0.243071, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Processing column 25/66: SMNUMGPS\n",
+ "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Skipping SMNUMGPS: all values are NaN\n",
+ "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Processing column 26/66: SMMAPRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.243071, p-value: 1.00e-05\n",
+ "Processing column 25/66: SMNUMGPS\n",
+ " Skipping SMNUMGPS: all values are NaN\n",
+ "Processing column 26/66: SMMAPRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:20,392 - tissue_RASSF2_whole_blood] INFO: CCC: 0.168576, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:20,393 - tissue_RASSF2_whole_blood] INFO: Processing column 27/66: SMEXNCRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.168576, p-value: 1.00e-05\n",
+ "Processing column 27/66: SMEXNCRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:20,646 - tissue_RASSF2_whole_blood] INFO: CCC: 0.040140, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:20,647 - tissue_RASSF2_whole_blood] INFO: Processing column 28/66: SM550NRM\n",
+ "[2025-09-25 13:05:20,647 - tissue_RASSF2_whole_blood] INFO: Skipping SM550NRM: all values are NaN\n",
+ "[2025-09-25 13:05:20,648 - tissue_RASSF2_whole_blood] INFO: Processing column 29/66: SMGNSDTC\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.040140, p-value: 1.00e-05\n",
+ "Processing column 28/66: SM550NRM\n",
+ " Skipping SM550NRM: all values are NaN\n",
+ "Processing column 29/66: SMGNSDTC\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:20,902 - tissue_RASSF2_whole_blood] INFO: CCC: 0.043013, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Processing column 30/66: SMUNMPRT\n",
+ "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Skipping SMUNMPRT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Processing column 31/66: SM350NRM\n",
+ "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Skipping SM350NRM: all values are NaN\n",
+ "[2025-09-25 13:05:20,904 - tissue_RASSF2_whole_blood] INFO: Processing column 32/66: SMRDLGTH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.043013, p-value: 1.00e-05\n",
+ "Processing column 30/66: SMUNMPRT\n",
+ " Skipping SMUNMPRT: only 1 unique value(s)\n",
+ "Processing column 31/66: SM350NRM\n",
+ " Skipping SM350NRM: all values are NaN\n",
+ "Processing column 32/66: SMRDLGTH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:21,156 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000028, p-value: 1.73e-01\n",
+ "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Processing column 33/66: SMMNCPB\n",
+ "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Skipping SMMNCPB: all values are NaN\n",
+ "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Processing column 34/66: SME1MMRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.000028, p-value: 1.73e-01\n",
+ "Processing column 33/66: SMMNCPB\n",
+ " Skipping SMMNCPB: all values are NaN\n",
+ "Processing column 34/66: SME1MMRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:21,411 - tissue_RASSF2_whole_blood] INFO: CCC: 0.018125, p-value: 1.40e-04\n",
+ "[2025-09-25 13:05:21,412 - tissue_RASSF2_whole_blood] INFO: Processing column 35/66: SMSFLGTH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.018125, p-value: 1.40e-04\n",
+ "Processing column 35/66: SMSFLGTH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:21,665 - tissue_RASSF2_whole_blood] INFO: CCC: 0.047258, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Processing column 36/66: SMESTLBS\n",
+ "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Skipping SMESTLBS: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Processing column 37/66: SMMPPD\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.047258, p-value: 1.00e-05\n",
+ "Processing column 36/66: SMESTLBS\n",
+ " Skipping SMESTLBS: only 1 unique value(s)\n",
+ "Processing column 37/66: SMMPPD\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:21,921 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007761, p-value: 3.43e-02\n",
+ "[2025-09-25 13:05:21,921 - tissue_RASSF2_whole_blood] INFO: Processing column 38/66: SMNTERRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007761, p-value: 3.43e-02\n",
+ "Processing column 38/66: SMNTERRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:22,175 - tissue_RASSF2_whole_blood] INFO: CCC: 0.250997, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:22,175 - tissue_RASSF2_whole_blood] INFO: Processing column 39/66: SMRRNANM\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.250997, p-value: 1.00e-05\n",
+ "Processing column 39/66: SMRRNANM\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:22,430 - tissue_RASSF2_whole_blood] INFO: CCC: 0.036631, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:22,430 - tissue_RASSF2_whole_blood] INFO: Processing column 40/66: SMRDTTL\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.036631, p-value: 1.00e-05\n",
+ "Processing column 40/66: SMRDTTL\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:22,686 - tissue_RASSF2_whole_blood] INFO: CCC: 0.010388, p-value: 6.63e-03\n",
+ "[2025-09-25 13:05:22,686 - tissue_RASSF2_whole_blood] INFO: Processing column 41/66: SMVQCFL\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.010388, p-value: 6.63e-03\n",
+ "Processing column 41/66: SMVQCFL\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:22,941 - tissue_RASSF2_whole_blood] INFO: CCC: 0.001442, p-value: 9.24e-01\n",
+ "[2025-09-25 13:05:22,942 - tissue_RASSF2_whole_blood] INFO: Processing column 42/66: SMMNCV\n",
+ "[2025-09-25 13:05:22,943 - tissue_RASSF2_whole_blood] INFO: Skipping SMMNCV: all values are NaN\n",
+ "[2025-09-25 13:05:22,943 - tissue_RASSF2_whole_blood] INFO: Processing column 43/66: SMTRSCPT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.001442, p-value: 9.24e-01\n",
+ "Processing column 42/66: SMMNCV\n",
+ " Skipping SMMNCV: all values are NaN\n",
+ "Processing column 43/66: SMTRSCPT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:23,199 - tissue_RASSF2_whole_blood] INFO: CCC: 0.042714, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:23,199 - tissue_RASSF2_whole_blood] INFO: Processing column 44/66: SMMPPDPR\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.042714, p-value: 1.00e-05\n",
+ "Processing column 44/66: SMMPPDPR\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:23,453 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007761, p-value: 3.48e-02\n",
+ "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Processing column 45/66: SMCGLGTH\n",
+ "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Skipping SMCGLGTH: all values are NaN\n",
+ "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Processing column 46/66: SMGAPPCT\n",
+ "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Skipping SMGAPPCT: all values are NaN\n",
+ "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Processing column 47/66: SMUNPDRD\n",
+ "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Skipping SMUNPDRD: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Processing column 48/66: SMNTRNRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007761, p-value: 3.48e-02\n",
+ "Processing column 45/66: SMCGLGTH\n",
+ " Skipping SMCGLGTH: all values are NaN\n",
+ "Processing column 46/66: SMGAPPCT\n",
+ " Skipping SMGAPPCT: all values are NaN\n",
+ "Processing column 47/66: SMUNPDRD\n",
+ " Skipping SMUNPDRD: only 1 unique value(s)\n",
+ "Processing column 48/66: SMNTRNRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:23,710 - tissue_RASSF2_whole_blood] INFO: CCC: 0.202936, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:23,710 - tissue_RASSF2_whole_blood] INFO: Processing column 49/66: SMMPUNRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.202936, p-value: 1.00e-05\n",
+ "Processing column 49/66: SMMPUNRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:23,964 - tissue_RASSF2_whole_blood] INFO: CCC: 0.168576, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:23,964 - tissue_RASSF2_whole_blood] INFO: Processing column 50/66: SMEXPEFF\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.168576, p-value: 1.00e-05\n",
+ "Processing column 50/66: SMEXPEFF\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:24,219 - tissue_RASSF2_whole_blood] INFO: CCC: 0.059931, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:24,219 - tissue_RASSF2_whole_blood] INFO: Processing column 51/66: SMMPPDUN\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.059931, p-value: 1.00e-05\n",
+ "Processing column 51/66: SMMPPDUN\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:24,474 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007761, p-value: 3.43e-02\n",
+ "[2025-09-25 13:05:24,474 - tissue_RASSF2_whole_blood] INFO: Processing column 52/66: SME2MMRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007761, p-value: 3.43e-02\n",
+ "Processing column 52/66: SME2MMRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:24,730 - tissue_RASSF2_whole_blood] INFO: CCC: 0.003990, p-value: 4.05e-01\n",
+ "[2025-09-25 13:05:24,731 - tissue_RASSF2_whole_blood] INFO: Processing column 53/66: SME2ANTI\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.003990, p-value: 4.05e-01\n",
+ "Processing column 53/66: SME2ANTI\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:24,987 - tissue_RASSF2_whole_blood] INFO: CCC: 0.020742, p-value: 1.10e-04\n",
+ "[2025-09-25 13:05:24,988 - tissue_RASSF2_whole_blood] INFO: Processing column 54/66: SMALTALG\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.020742, p-value: 1.10e-04\n",
+ "Processing column 54/66: SMALTALG\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:25,242 - tissue_RASSF2_whole_blood] INFO: CCC: 0.177009, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:25,242 - tissue_RASSF2_whole_blood] INFO: Processing column 55/66: SME2SNSE\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.177009, p-value: 1.00e-05\n",
+ "Processing column 55/66: SME2SNSE\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:25,496 - tissue_RASSF2_whole_blood] INFO: CCC: 0.019048, p-value: 1.80e-04\n",
+ "[2025-09-25 13:05:25,497 - tissue_RASSF2_whole_blood] INFO: Processing column 56/66: SMMFLGTH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.019048, p-value: 1.80e-04\n",
+ "Processing column 56/66: SMMFLGTH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:25,751 - tissue_RASSF2_whole_blood] INFO: CCC: 0.019296, p-value: 1.50e-04\n",
+ "[2025-09-25 13:05:25,751 - tissue_RASSF2_whole_blood] INFO: Processing column 57/66: SME1ANTI\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.019296, p-value: 1.50e-04\n",
+ "Processing column 57/66: SME1ANTI\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:26,007 - tissue_RASSF2_whole_blood] INFO: CCC: 0.021058, p-value: 1.20e-04\n",
+ "[2025-09-25 13:05:26,008 - tissue_RASSF2_whole_blood] INFO: Processing column 58/66: SMSPLTRD\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.021058, p-value: 1.20e-04\n",
+ "Processing column 58/66: SMSPLTRD\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:26,263 - tissue_RASSF2_whole_blood] INFO: CCC: 0.057786, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:26,264 - tissue_RASSF2_whole_blood] INFO: Processing column 59/66: SMBSMMRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.057786, p-value: 1.00e-05\n",
+ "Processing column 59/66: SMBSMMRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:26,518 - tissue_RASSF2_whole_blood] INFO: CCC: 0.005333, p-value: 1.83e-01\n",
+ "[2025-09-25 13:05:26,518 - tissue_RASSF2_whole_blood] INFO: Processing column 60/66: SME1SNSE\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.005333, p-value: 1.83e-01\n",
+ "Processing column 60/66: SME1SNSE\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:26,773 - tissue_RASSF2_whole_blood] INFO: CCC: 0.022008, p-value: 5.00e-05\n",
+ "[2025-09-25 13:05:26,773 - tissue_RASSF2_whole_blood] INFO: Processing column 61/66: SME1PCTS\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.022008, p-value: 5.00e-05\n",
+ "Processing column 61/66: SME1PCTS\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:27,030 - tissue_RASSF2_whole_blood] INFO: CCC: 0.032073, p-value: 2.00e-05\n",
+ "[2025-09-25 13:05:27,030 - tissue_RASSF2_whole_blood] INFO: Processing column 62/66: SMRRNART\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.032073, p-value: 2.00e-05\n",
+ "Processing column 62/66: SMRRNART\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:27,285 - tissue_RASSF2_whole_blood] INFO: CCC: 0.048437, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:27,286 - tissue_RASSF2_whole_blood] INFO: Processing column 63/66: SME1MPRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.048437, p-value: 1.00e-05\n",
+ "Processing column 63/66: SME1MPRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO: CCC: 0.181940, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO: Processing column 64/66: SMNUM5CD\n",
+ "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO: Skipping SMNUM5CD: all values are NaN\n",
+ "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Processing column 65/66: SMDPMPRT\n",
+ "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Skipping SMDPMPRT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Processing column 66/66: SME2PCTS\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.181940, p-value: 1.00e-05\n",
+ "Processing column 64/66: SMNUM5CD\n",
+ " Skipping SMNUM5CD: all values are NaN\n",
+ "Processing column 65/66: SMDPMPRT\n",
+ " Skipping SMDPMPRT: only 1 unique value(s)\n",
+ "Processing column 66/66: SME2PCTS\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:27,796 - tissue_RASSF2_whole_blood] INFO: CCC: 0.029344, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: \n",
+ "Completed processing whole_blood:\n",
+ "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: Total metadata columns: 66\n",
+ "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: Successful analyses: 44\n",
+ "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: Skipped/Failed: 22\n",
+ "[2025-09-25 13:05:27,821 - summary] INFO: Results for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood_correlation_results.pkl\n",
+ "[2025-09-25 13:05:27,821 - summary] INFO: Log file for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+ "[2025-09-25 13:05:27,822 - summary] INFO: Runtime for RASSF2 in whole_blood: 9.96 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:27,823 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:27,823 - summary] INFO: COMBINED RESULTS SUMMARY\n",
+ "[2025-09-25 13:05:27,823 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:27,824 - summary] INFO: Gene Symbol: RASSF2\n",
+ "[2025-09-25 13:05:27,824 - summary] INFO: Gene ID: ENSG00000101265.15\n",
+ "[2025-09-25 13:05:27,824 - summary] INFO: Permutations: 100,000\n",
+ "[2025-09-25 13:05:27,824 - summary] INFO: Tissues processed: 1\n",
+ "[2025-09-25 13:05:27,825 - summary] INFO: Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.pkl\n",
+ "[2025-09-25 13:05:27,825 - summary] INFO: Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.csv\n",
+ "[2025-09-25 13:05:27,826 - summary] INFO: \n",
+ "Total successful analyses across all tissues: 44\n",
+ "[2025-09-25 13:05:27,826 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:27,826 - summary] INFO: TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+ "[2025-09-25 13:05:27,826 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:27,827 - summary] INFO: Tissue Metadata Column CCC Value P-value Significance \n",
+ "[2025-09-25 13:05:27,827 - summary] INFO: ------------------------------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood SMTSISCH 0.528125 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood DTHHRDY 0.464582 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood SMNTERRT 0.250997 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SMNTRART 0.243071 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SME1MPRT 0.181940 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SMALTALG 0.177009 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SME2MPRT 0.172974 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SMMAPRT 0.168576 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,831 - summary] INFO: whole_blood SMCENTER 0.108148 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,831 - summary] INFO: whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood SMRIN 0.048847 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood SMRRNART 0.048437 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n",
+ "[2025-09-25 13:05:27,834 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:27,834 - summary] INFO: SUMMARY BY TISSUE\n",
+ "[2025-09-25 13:05:27,834 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:27,834 - summary] INFO: Tissue N Samples Successful Mean |CCC| Max |CCC| \n",
+ "[2025-09-25 13:05:27,835 - summary] INFO: ----------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:27,835 - summary] INFO: whole_blood 755 44 0.079987 0.528125 \n",
+ "[2025-09-25 13:05:27,835 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:27,835 - summary] INFO: RUNTIME SUMMARY\n",
+ "[2025-09-25 13:05:27,836 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:27,836 - summary] INFO: Total runtime: 9.96 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:27,836 - summary] INFO: Average runtime per tissue: 9.96 seconds\n",
+ "[2025-09-25 13:05:27,836 - summary] INFO: \n",
+ "Runtime by tissue:\n",
+ "[2025-09-25 13:05:27,837 - summary] INFO: Tissue Runtime (sec) Runtime (min) Status \n",
+ "[2025-09-25 13:05:27,837 - summary] INFO: ----------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:27,837 - summary] INFO: whole_blood 9.96 0.17 Success \n",
+ "[2025-09-25 13:05:27,837 - summary] INFO: \n",
+ "Fastest: whole_blood (9.96 seconds)\n",
+ "[2025-09-25 13:05:27,837 - summary] INFO: Slowest: whole_blood (9.96 seconds)\n",
+ "[2025-09-25 13:05:27,838 - summary] INFO: Speed ratio: 1.0x\n",
+ "[2025-09-25 13:05:27,838 - summary] INFO: Runtime for RASSF2: 9.96 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:27,838 - summary] INFO: \n",
+ "====================================================================================================\n",
+ "[2025-09-25 13:05:27,838 - summary] INFO: PROCESSING GENE 2/2: CYTIP\n",
+ "[2025-09-25 13:05:27,838 - summary] INFO: ====================================================================================================\n",
+ "[2025-09-25 13:05:27,839 - summary] INFO: \n",
+ "[1/1] Starting processing for CYTIP in whole_blood...\n",
+ "[2025-09-25 13:05:27,840 - tissue_CYTIP_whole_blood] INFO: \n",
+ "============================================================\n",
+ "[2025-09-25 13:05:27,840 - tissue_CYTIP_whole_blood] INFO: Processing tissue: whole_blood\n",
+ "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: File: gtex_v8_data_whole_blood.pkl\n",
+ "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+ "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: ============================================================\n",
+ "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: Loading expression data...\n",
+ "[2025-09-25 13:05:27,981 - tissue_CYTIP_whole_blood] INFO: Expression data shape: (56200, 755)\n",
+ "[2025-09-25 13:05:27,984 - tissue_CYTIP_whole_blood] INFO: Gene ID for CYTIP: ENSG00000115165.9\n",
+ "[2025-09-25 13:05:27,986 - tissue_CYTIP_whole_blood] INFO: Number of samples: 755\n",
+ "[2025-09-25 13:05:27,987 - tissue_CYTIP_whole_blood] INFO: Common samples: 755\n",
+ "[2025-09-25 13:05:27,988 - tissue_CYTIP_whole_blood] INFO: Computing CCC between CYTIP expression and all metadata columns...\n",
+ "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Using 100000 permutations and 4 jobs\n",
+ "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Processing 66 metadata columns...\n",
+ "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Processing column 1/66: SUBJID\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.029344, p-value: 1.00e-05\n",
+ "\n",
+ "Completed processing whole_blood:\n",
+ " Total metadata columns: 66\n",
+ " Successful analyses: 44\n",
+ " Skipped/Failed: 22\n",
+ "Results for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood_correlation_results.pkl\n",
+ "Log file for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+ "Runtime for RASSF2 in whole_blood: 9.96 seconds (0.17 minutes)\n",
+ "\n",
+ "================================================================================\n",
+ "COMBINED RESULTS SUMMARY\n",
+ "================================================================================\n",
+ "Gene Symbol: RASSF2\n",
+ "Gene ID: ENSG00000101265.15\n",
+ "Permutations: 100,000\n",
+ "Tissues processed: 1\n",
+ "Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.pkl\n",
+ "Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.csv\n",
+ "\n",
+ "Total successful analyses across all tissues: 44\n",
+ "\n",
+ "================================================================================\n",
+ "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+ "================================================================================\n",
+ "Tissue Metadata Column CCC Value P-value Significance \n",
+ "------------------------------------------------------------------------------------------\n",
+ "whole_blood SMTSISCH 0.528125 1.00e-05 *** \n",
+ "whole_blood DTHHRDY 0.464582 1.00e-05 *** \n",
+ "whole_blood SMNTERRT 0.250997 1.00e-05 *** \n",
+ "whole_blood SMNTRART 0.243071 1.00e-05 *** \n",
+ "whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n",
+ "whole_blood SME1MPRT 0.181940 1.00e-05 *** \n",
+ "whole_blood SMALTALG 0.177009 1.00e-05 *** \n",
+ "whole_blood SME2MPRT 0.172974 1.00e-05 *** \n",
+ "whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n",
+ "whole_blood SMMAPRT 0.168576 1.00e-05 *** \n",
+ "whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n",
+ "whole_blood SMCENTER 0.108148 1.00e-05 *** \n",
+ "whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n",
+ "whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n",
+ "whole_blood SMRIN 0.048847 1.00e-05 *** \n",
+ "whole_blood SMRRNART 0.048437 1.00e-05 *** \n",
+ "whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n",
+ "whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n",
+ "whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n",
+ "whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n",
+ "\n",
+ "================================================================================\n",
+ "SUMMARY BY TISSUE\n",
+ "================================================================================\n",
+ "Tissue N Samples Successful Mean |CCC| Max |CCC| \n",
+ "----------------------------------------------------------------------\n",
+ "whole_blood 755 44 0.079987 0.528125 \n",
+ "\n",
+ "================================================================================\n",
+ "RUNTIME SUMMARY\n",
+ "================================================================================\n",
+ "Total runtime: 9.96 seconds (0.17 minutes)\n",
+ "Average runtime per tissue: 9.96 seconds\n",
+ "\n",
+ "Runtime by tissue:\n",
+ "Tissue Runtime (sec) Runtime (min) Status \n",
+ "----------------------------------------------------------------------\n",
+ "whole_blood 9.96 0.17 Success \n",
+ "\n",
+ "Fastest: whole_blood (9.96 seconds)\n",
+ "Slowest: whole_blood (9.96 seconds)\n",
+ "Speed ratio: 1.0x\n",
+ "Runtime for RASSF2: 9.96 seconds (0.17 minutes)\n",
+ "\n",
+ "====================================================================================================\n",
+ "PROCESSING GENE 2/2: CYTIP\n",
+ "====================================================================================================\n",
+ "\n",
+ "[1/1] Starting processing for CYTIP in whole_blood...\n",
+ "\n",
+ "============================================================\n",
+ "Processing tissue: whole_blood\n",
+ "File: gtex_v8_data_whole_blood.pkl\n",
+ "Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+ "============================================================\n",
+ "Loading expression data...\n",
+ "Expression data shape: (56200, 755)\n",
+ "Gene ID for CYTIP: ENSG00000115165.9\n",
+ "Number of samples: 755\n",
+ "Common samples: 755\n",
+ "Computing CCC between CYTIP expression and all metadata columns...\n",
+ "Using 100000 permutations and 4 jobs\n",
+ "Processing 66 metadata columns...\n",
+ "Processing column 1/66: SUBJID\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:28,086 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000000, p-value: 1.00e+00\n",
+ "[2025-09-25 13:05:28,086 - tissue_CYTIP_whole_blood] INFO: Processing column 2/66: SEX\n",
+ "[2025-09-25 13:05:28,156 - tissue_CYTIP_whole_blood] INFO: CCC: 0.001409, p-value: 3.98e-01\n",
+ "[2025-09-25 13:05:28,157 - tissue_CYTIP_whole_blood] INFO: Processing column 3/66: AGE\n",
+ "[2025-09-25 13:05:28,228 - tissue_CYTIP_whole_blood] INFO: CCC: 0.018997, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:28,228 - tissue_CYTIP_whole_blood] INFO: Processing column 4/66: DTHHRDY\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.000000, p-value: 1.00e+00\n",
+ "Processing column 2/66: SEX\n",
+ " CCC: 0.001409, p-value: 3.98e-01\n",
+ "Processing column 3/66: AGE\n",
+ " CCC: 0.018997, p-value: 1.00e-05\n",
+ "Processing column 4/66: DTHHRDY\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:28,481 - tissue_CYTIP_whole_blood] INFO: CCC: 0.184226, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Processing column 5/66: SMATSSCR\n",
+ "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Skipping SMATSSCR: all values are NaN\n",
+ "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Processing column 6/66: SMCENTER\n",
+ "[2025-09-25 13:05:28,551 - tissue_CYTIP_whole_blood] INFO: CCC: 0.084684, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO: Processing column 7/66: SMPTHNTS\n",
+ "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO: Skipping SMPTHNTS: all values are NaN\n",
+ "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO: Processing column 8/66: SMRIN\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.184226, p-value: 1.00e-05\n",
+ "Processing column 5/66: SMATSSCR\n",
+ " Skipping SMATSSCR: all values are NaN\n",
+ "Processing column 6/66: SMCENTER\n",
+ " CCC: 0.084684, p-value: 1.00e-05\n",
+ "Processing column 7/66: SMPTHNTS\n",
+ " Skipping SMPTHNTS: all values are NaN\n",
+ "Processing column 8/66: SMRIN\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:28,806 - tissue_CYTIP_whole_blood] INFO: CCC: 0.003196, p-value: 5.68e-01\n",
+ "[2025-09-25 13:05:28,806 - tissue_CYTIP_whole_blood] INFO: Processing column 9/66: SMTS\n",
+ "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO: Skipping SMTS: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO: Processing column 10/66: SMTSD\n",
+ "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO: Skipping SMTSD: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Processing column 11/66: SMUBRID\n",
+ "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Skipping SMUBRID: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Processing column 12/66: SMTSISCH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.003196, p-value: 5.68e-01\n",
+ "Processing column 9/66: SMTS\n",
+ " Skipping SMTS: only 1 unique value(s)\n",
+ "Processing column 10/66: SMTSD\n",
+ " Skipping SMTSD: only 1 unique value(s)\n",
+ "Processing column 11/66: SMUBRID\n",
+ " Skipping SMUBRID: only 1 unique value(s)\n",
+ "Processing column 12/66: SMTSISCH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:29,062 - tissue_CYTIP_whole_blood] INFO: CCC: 0.215092, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:29,062 - tissue_CYTIP_whole_blood] INFO: Processing column 13/66: SMTSPAX\n",
+ "[2025-09-25 13:05:29,063 - tissue_CYTIP_whole_blood] INFO: Skipping SMTSPAX: all values are NaN\n",
+ "[2025-09-25 13:05:29,063 - tissue_CYTIP_whole_blood] INFO: Processing column 14/66: SMNABTCH\n",
+ "[2025-09-25 13:05:29,128 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000304, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:29,128 - tissue_CYTIP_whole_blood] INFO: Processing column 15/66: SMNABTCHT\n",
+ "[2025-09-25 13:05:29,129 - tissue_CYTIP_whole_blood] INFO: Skipping SMNABTCHT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:29,129 - tissue_CYTIP_whole_blood] INFO: Processing column 16/66: SMNABTCHD\n",
+ "[2025-09-25 13:05:29,197 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000256, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:29,197 - tissue_CYTIP_whole_blood] INFO: Processing column 17/66: SMGEBTCH\n",
+ "[2025-09-25 13:05:29,258 - tissue_CYTIP_whole_blood] INFO: CCC: 0.001533, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:29,258 - tissue_CYTIP_whole_blood] INFO: Processing column 18/66: SMGEBTCHD\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.215092, p-value: 1.00e-05\n",
+ "Processing column 13/66: SMTSPAX\n",
+ " Skipping SMTSPAX: all values are NaN\n",
+ "Processing column 14/66: SMNABTCH\n",
+ " CCC: 0.000304, p-value: 1.00e-05\n",
+ "Processing column 15/66: SMNABTCHT\n",
+ " Skipping SMNABTCHT: only 1 unique value(s)\n",
+ "Processing column 16/66: SMNABTCHD\n",
+ " CCC: 0.000256, p-value: 1.00e-05\n",
+ "Processing column 17/66: SMGEBTCH\n",
+ " CCC: 0.001533, p-value: 1.00e-05\n",
+ "Processing column 18/66: SMGEBTCHD\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:29,317 - tissue_CYTIP_whole_blood] INFO: CCC: 0.002104, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:29,318 - tissue_CYTIP_whole_blood] INFO: Processing column 19/66: SMGEBTCHT\n",
+ "[2025-09-25 13:05:29,318 - tissue_CYTIP_whole_blood] INFO: Skipping SMGEBTCHT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Processing column 20/66: SMAFRZE\n",
+ "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Skipping SMAFRZE: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Processing column 21/66: SMGTC\n",
+ "[2025-09-25 13:05:29,320 - tissue_CYTIP_whole_blood] INFO: Skipping SMGTC: all values are NaN\n",
+ "[2025-09-25 13:05:29,320 - tissue_CYTIP_whole_blood] INFO: Processing column 22/66: SME2MPRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.002104, p-value: 1.00e-05\n",
+ "Processing column 19/66: SMGEBTCHT\n",
+ " Skipping SMGEBTCHT: only 1 unique value(s)\n",
+ "Processing column 20/66: SMAFRZE\n",
+ " Skipping SMAFRZE: only 1 unique value(s)\n",
+ "Processing column 21/66: SMGTC\n",
+ " Skipping SMGTC: all values are NaN\n",
+ "Processing column 22/66: SME2MPRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:29,573 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021744, p-value: 5.00e-05\n",
+ "[2025-09-25 13:05:29,574 - tissue_CYTIP_whole_blood] INFO: Processing column 23/66: SMCHMPRS\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.021744, p-value: 5.00e-05\n",
+ "Processing column 23/66: SMCHMPRS\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:29,828 - tissue_CYTIP_whole_blood] INFO: CCC: 0.015946, p-value: 4.50e-04\n",
+ "[2025-09-25 13:05:29,828 - tissue_CYTIP_whole_blood] INFO: Processing column 24/66: SMNTRART\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.015946, p-value: 4.50e-04\n",
+ "Processing column 24/66: SMNTRART\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:30,082 - tissue_CYTIP_whole_blood] INFO: CCC: 0.024407, p-value: 3.00e-05\n",
+ "[2025-09-25 13:05:30,083 - tissue_CYTIP_whole_blood] INFO: Processing column 25/66: SMNUMGPS\n",
+ "[2025-09-25 13:05:30,083 - tissue_CYTIP_whole_blood] INFO: Skipping SMNUMGPS: all values are NaN\n",
+ "[2025-09-25 13:05:30,084 - tissue_CYTIP_whole_blood] INFO: Processing column 26/66: SMMAPRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.024407, p-value: 3.00e-05\n",
+ "Processing column 25/66: SMNUMGPS\n",
+ " Skipping SMNUMGPS: all values are NaN\n",
+ "Processing column 26/66: SMMAPRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:30,338 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021052, p-value: 7.00e-05\n",
+ "[2025-09-25 13:05:30,339 - tissue_CYTIP_whole_blood] INFO: Processing column 27/66: SMEXNCRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.021052, p-value: 7.00e-05\n",
+ "Processing column 27/66: SMEXNCRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO: CCC: 0.126241, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO: Processing column 28/66: SM550NRM\n",
+ "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO: Skipping SM550NRM: all values are NaN\n",
+ "[2025-09-25 13:05:30,594 - tissue_CYTIP_whole_blood] INFO: Processing column 29/66: SMGNSDTC\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.126241, p-value: 1.00e-05\n",
+ "Processing column 28/66: SM550NRM\n",
+ " Skipping SM550NRM: all values are NaN\n",
+ "Processing column 29/66: SMGNSDTC\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:30,847 - tissue_CYTIP_whole_blood] INFO: CCC: 0.050841, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:30,847 - tissue_CYTIP_whole_blood] INFO: Processing column 30/66: SMUNMPRT\n",
+ "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO: Skipping SMUNMPRT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO: Processing column 31/66: SM350NRM\n",
+ "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO: Skipping SM350NRM: all values are NaN\n",
+ "[2025-09-25 13:05:30,849 - tissue_CYTIP_whole_blood] INFO: Processing column 32/66: SMRDLGTH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.050841, p-value: 1.00e-05\n",
+ "Processing column 30/66: SMUNMPRT\n",
+ " Skipping SMUNMPRT: only 1 unique value(s)\n",
+ "Processing column 31/66: SM350NRM\n",
+ " Skipping SM350NRM: all values are NaN\n",
+ "Processing column 32/66: SMRDLGTH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:31,103 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000003, p-value: 9.42e-01\n",
+ "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Processing column 33/66: SMMNCPB\n",
+ "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Skipping SMMNCPB: all values are NaN\n",
+ "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Processing column 34/66: SME1MMRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.000003, p-value: 9.42e-01\n",
+ "Processing column 33/66: SMMNCPB\n",
+ " Skipping SMMNCPB: all values are NaN\n",
+ "Processing column 34/66: SME1MMRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:31,359 - tissue_CYTIP_whole_blood] INFO: CCC: 0.005089, p-value: 2.16e-01\n",
+ "[2025-09-25 13:05:31,359 - tissue_CYTIP_whole_blood] INFO: Processing column 35/66: SMSFLGTH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.005089, p-value: 2.16e-01\n",
+ "Processing column 35/66: SMSFLGTH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:31,614 - tissue_CYTIP_whole_blood] INFO: CCC: 0.015322, p-value: 7.10e-04\n",
+ "[2025-09-25 13:05:31,615 - tissue_CYTIP_whole_blood] INFO: Processing column 36/66: SMESTLBS\n",
+ "[2025-09-25 13:05:31,615 - tissue_CYTIP_whole_blood] INFO: Skipping SMESTLBS: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:31,616 - tissue_CYTIP_whole_blood] INFO: Processing column 37/66: SMMPPD\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.015322, p-value: 7.10e-04\n",
+ "Processing column 36/66: SMESTLBS\n",
+ " Skipping SMESTLBS: only 1 unique value(s)\n",
+ "Processing column 37/66: SMMPPD\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:31,870 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007147, p-value: 5.32e-02\n",
+ "[2025-09-25 13:05:31,870 - tissue_CYTIP_whole_blood] INFO: Processing column 38/66: SMNTERRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007147, p-value: 5.32e-02\n",
+ "Processing column 38/66: SMNTERRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:32,125 - tissue_CYTIP_whole_blood] INFO: CCC: 0.023433, p-value: 4.00e-05\n",
+ "[2025-09-25 13:05:32,126 - tissue_CYTIP_whole_blood] INFO: Processing column 39/66: SMRRNANM\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.023433, p-value: 4.00e-05\n",
+ "Processing column 39/66: SMRRNANM\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:32,379 - tissue_CYTIP_whole_blood] INFO: CCC: 0.005677, p-value: 1.45e-01\n",
+ "[2025-09-25 13:05:32,379 - tissue_CYTIP_whole_blood] INFO: Processing column 40/66: SMRDTTL\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.005677, p-value: 1.45e-01\n",
+ "Processing column 40/66: SMRDTTL\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:32,632 - tissue_CYTIP_whole_blood] INFO: CCC: 0.008033, p-value: 2.92e-02\n",
+ "[2025-09-25 13:05:32,633 - tissue_CYTIP_whole_blood] INFO: Processing column 41/66: SMVQCFL\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.008033, p-value: 2.92e-02\n",
+ "Processing column 41/66: SMVQCFL\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:32,887 - tissue_CYTIP_whole_blood] INFO: CCC: 0.003136, p-value: 6.00e-01\n",
+ "[2025-09-25 13:05:32,888 - tissue_CYTIP_whole_blood] INFO: Processing column 42/66: SMMNCV\n",
+ "[2025-09-25 13:05:32,889 - tissue_CYTIP_whole_blood] INFO: Skipping SMMNCV: all values are NaN\n",
+ "[2025-09-25 13:05:32,889 - tissue_CYTIP_whole_blood] INFO: Processing column 43/66: SMTRSCPT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.003136, p-value: 6.00e-01\n",
+ "Processing column 42/66: SMMNCV\n",
+ " Skipping SMMNCV: all values are NaN\n",
+ "Processing column 43/66: SMTRSCPT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:33,142 - tissue_CYTIP_whole_blood] INFO: CCC: 0.051533, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:33,143 - tissue_CYTIP_whole_blood] INFO: Processing column 44/66: SMMPPDPR\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.051533, p-value: 1.00e-05\n",
+ "Processing column 44/66: SMMPPDPR\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO: CCC: 0.005880, p-value: 1.29e-01\n",
+ "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO: Processing column 45/66: SMCGLGTH\n",
+ "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO: Skipping SMCGLGTH: all values are NaN\n",
+ "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Processing column 46/66: SMGAPPCT\n",
+ "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Skipping SMGAPPCT: all values are NaN\n",
+ "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Processing column 47/66: SMUNPDRD\n",
+ "[2025-09-25 13:05:33,399 - tissue_CYTIP_whole_blood] INFO: Skipping SMUNPDRD: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:33,399 - tissue_CYTIP_whole_blood] INFO: Processing column 48/66: SMNTRNRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.005880, p-value: 1.29e-01\n",
+ "Processing column 45/66: SMCGLGTH\n",
+ " Skipping SMCGLGTH: all values are NaN\n",
+ "Processing column 46/66: SMGAPPCT\n",
+ " Skipping SMGAPPCT: all values are NaN\n",
+ "Processing column 47/66: SMUNPDRD\n",
+ " Skipping SMUNPDRD: only 1 unique value(s)\n",
+ "Processing column 48/66: SMNTRNRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:33,653 - tissue_CYTIP_whole_blood] INFO: CCC: 0.261762, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:33,653 - tissue_CYTIP_whole_blood] INFO: Processing column 49/66: SMMPUNRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.261762, p-value: 1.00e-05\n",
+ "Processing column 49/66: SMMPUNRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:33,907 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021052, p-value: 7.00e-05\n",
+ "[2025-09-25 13:05:33,907 - tissue_CYTIP_whole_blood] INFO: Processing column 50/66: SMEXPEFF\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.021052, p-value: 7.00e-05\n",
+ "Processing column 50/66: SMEXPEFF\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:34,163 - tissue_CYTIP_whole_blood] INFO: CCC: 0.086945, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:34,163 - tissue_CYTIP_whole_blood] INFO: Processing column 51/66: SMMPPDUN\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.086945, p-value: 1.00e-05\n",
+ "Processing column 51/66: SMMPPDUN\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:34,419 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007147, p-value: 5.32e-02\n",
+ "[2025-09-25 13:05:34,419 - tissue_CYTIP_whole_blood] INFO: Processing column 52/66: SME2MMRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007147, p-value: 5.32e-02\n",
+ "Processing column 52/66: SME2MMRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:34,674 - tissue_CYTIP_whole_blood] INFO: CCC: 0.004187, p-value: 3.68e-01\n",
+ "[2025-09-25 13:05:34,675 - tissue_CYTIP_whole_blood] INFO: Processing column 53/66: SME2ANTI\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.004187, p-value: 3.68e-01\n",
+ "Processing column 53/66: SME2ANTI\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:34,929 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007334, p-value: 4.67e-02\n",
+ "[2025-09-25 13:05:34,930 - tissue_CYTIP_whole_blood] INFO: Processing column 54/66: SMALTALG\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007334, p-value: 4.67e-02\n",
+ "Processing column 54/66: SMALTALG\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:35,186 - tissue_CYTIP_whole_blood] INFO: CCC: 0.038381, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:35,187 - tissue_CYTIP_whole_blood] INFO: Processing column 55/66: SME2SNSE\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.038381, p-value: 1.00e-05\n",
+ "Processing column 55/66: SME2SNSE\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:35,441 - tissue_CYTIP_whole_blood] INFO: CCC: 0.006734, p-value: 7.08e-02\n",
+ "[2025-09-25 13:05:35,442 - tissue_CYTIP_whole_blood] INFO: Processing column 56/66: SMMFLGTH\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.006734, p-value: 7.08e-02\n",
+ "Processing column 56/66: SMMFLGTH\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:35,696 - tissue_CYTIP_whole_blood] INFO: CCC: 0.010863, p-value: 4.63e-03\n",
+ "[2025-09-25 13:05:35,696 - tissue_CYTIP_whole_blood] INFO: Processing column 57/66: SME1ANTI\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.010863, p-value: 4.63e-03\n",
+ "Processing column 57/66: SME1ANTI\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:35,950 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007210, p-value: 5.04e-02\n",
+ "[2025-09-25 13:05:35,951 - tissue_CYTIP_whole_blood] INFO: Processing column 58/66: SMSPLTRD\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007210, p-value: 5.04e-02\n",
+ "Processing column 58/66: SMSPLTRD\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:36,208 - tissue_CYTIP_whole_blood] INFO: CCC: 0.030117, p-value: 1.00e-05\n",
+ "[2025-09-25 13:05:36,208 - tissue_CYTIP_whole_blood] INFO: Processing column 59/66: SMBSMMRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.030117, p-value: 1.00e-05\n",
+ "Processing column 59/66: SMBSMMRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:36,464 - tissue_CYTIP_whole_blood] INFO: CCC: 0.004293, p-value: 3.48e-01\n",
+ "[2025-09-25 13:05:36,465 - tissue_CYTIP_whole_blood] INFO: Processing column 60/66: SME1SNSE\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.004293, p-value: 3.48e-01\n",
+ "Processing column 60/66: SME1SNSE\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:36,719 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007285, p-value: 4.87e-02\n",
+ "[2025-09-25 13:05:36,719 - tissue_CYTIP_whole_blood] INFO: Processing column 61/66: SME1PCTS\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007285, p-value: 4.87e-02\n",
+ "Processing column 61/66: SME1PCTS\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:36,973 - tissue_CYTIP_whole_blood] INFO: CCC: 0.004663, p-value: 2.81e-01\n",
+ "[2025-09-25 13:05:36,973 - tissue_CYTIP_whole_blood] INFO: Processing column 62/66: SMRRNART\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.004663, p-value: 2.81e-01\n",
+ "Processing column 62/66: SMRRNART\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:37,228 - tissue_CYTIP_whole_blood] INFO: CCC: 0.013729, p-value: 1.10e-03\n",
+ "[2025-09-25 13:05:37,229 - tissue_CYTIP_whole_blood] INFO: Processing column 63/66: SME1MPRT\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.013729, p-value: 1.10e-03\n",
+ "Processing column 63/66: SME1MPRT\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:37,483 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021952, p-value: 4.00e-05\n",
+ "[2025-09-25 13:05:37,483 - tissue_CYTIP_whole_blood] INFO: Processing column 64/66: SMNUM5CD\n",
+ "[2025-09-25 13:05:37,484 - tissue_CYTIP_whole_blood] INFO: Skipping SMNUM5CD: all values are NaN\n",
+ "[2025-09-25 13:05:37,484 - tissue_CYTIP_whole_blood] INFO: Processing column 65/66: SMDPMPRT\n",
+ "[2025-09-25 13:05:37,485 - tissue_CYTIP_whole_blood] INFO: Skipping SMDPMPRT: only 1 unique value(s)\n",
+ "[2025-09-25 13:05:37,485 - tissue_CYTIP_whole_blood] INFO: Processing column 66/66: SME2PCTS\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.021952, p-value: 4.00e-05\n",
+ "Processing column 64/66: SMNUM5CD\n",
+ " Skipping SMNUM5CD: all values are NaN\n",
+ "Processing column 65/66: SMDPMPRT\n",
+ " Skipping SMDPMPRT: only 1 unique value(s)\n",
+ "Processing column 66/66: SME2PCTS\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[2025-09-25 13:05:37,739 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007812, p-value: 3.38e-02\n",
+ "[2025-09-25 13:05:37,740 - tissue_CYTIP_whole_blood] INFO: \n",
+ "Completed processing whole_blood:\n",
+ "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO: Total metadata columns: 66\n",
+ "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO: Successful analyses: 44\n",
+ "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO: Skipped/Failed: 22\n",
+ "[2025-09-25 13:05:37,765 - summary] INFO: Results for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood_correlation_results.pkl\n",
+ "[2025-09-25 13:05:37,765 - summary] INFO: Log file for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+ "[2025-09-25 13:05:37,766 - summary] INFO: Runtime for CYTIP in whole_blood: 9.93 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:37,767 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:37,767 - summary] INFO: COMBINED RESULTS SUMMARY\n",
+ "[2025-09-25 13:05:37,767 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:37,768 - summary] INFO: Gene Symbol: CYTIP\n",
+ "[2025-09-25 13:05:37,768 - summary] INFO: Gene ID: ENSG00000115165.9\n",
+ "[2025-09-25 13:05:37,768 - summary] INFO: Permutations: 100,000\n",
+ "[2025-09-25 13:05:37,768 - summary] INFO: Tissues processed: 1\n",
+ "[2025-09-25 13:05:37,769 - summary] INFO: Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.pkl\n",
+ "[2025-09-25 13:05:37,769 - summary] INFO: Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.csv\n",
+ "[2025-09-25 13:05:37,769 - summary] INFO: \n",
+ "Total successful analyses across all tissues: 44\n",
+ "[2025-09-25 13:05:37,769 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:37,770 - summary] INFO: TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+ "[2025-09-25 13:05:37,771 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:37,771 - summary] INFO: Tissue Metadata Column CCC Value P-value Significance \n",
+ "[2025-09-25 13:05:37,771 - summary] INFO: ------------------------------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood SMTSISCH 0.215092 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood DTHHRDY 0.184226 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood SMCENTER 0.084684 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMALTALG 0.038381 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMSPLTRD 0.030117 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood SMNTRART 0.024407 3.00e-05 *** \n",
+ "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood SMNTERRT 0.023433 4.00e-05 *** \n",
+ "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood SME1MPRT 0.021952 4.00e-05 *** \n",
+ "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood SME2MPRT 0.021744 5.00e-05 *** \n",
+ "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood SMMAPRT 0.021052 7.00e-05 *** \n",
+ "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood SMMPUNRT 0.021052 7.00e-05 *** \n",
+ "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood AGE 0.018997 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood SMCHMPRS 0.015946 4.50e-04 *** \n",
+ "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood SMSFLGTH 0.015322 7.10e-04 *** \n",
+ "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood SMRRNART 0.013729 1.10e-03 ** \n",
+ "[2025-09-25 13:05:37,777 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:37,778 - summary] INFO: SUMMARY BY TISSUE\n",
+ "[2025-09-25 13:05:37,778 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:37,778 - summary] INFO: Tissue N Samples Successful Mean |CCC| Max |CCC| \n",
+ "[2025-09-25 13:05:37,778 - summary] INFO: ----------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:37,779 - summary] INFO: whole_blood 755 44 0.032699 0.261762 \n",
+ "[2025-09-25 13:05:37,779 - summary] INFO: \n",
+ "================================================================================\n",
+ "[2025-09-25 13:05:37,780 - summary] INFO: RUNTIME SUMMARY\n",
+ "[2025-09-25 13:05:37,780 - summary] INFO: ================================================================================\n",
+ "[2025-09-25 13:05:37,780 - summary] INFO: Total runtime: 9.93 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:37,780 - summary] INFO: Average runtime per tissue: 9.93 seconds\n",
+ "[2025-09-25 13:05:37,780 - summary] INFO: \n",
+ "Runtime by tissue:\n",
+ "[2025-09-25 13:05:37,781 - summary] INFO: Tissue Runtime (sec) Runtime (min) Status \n",
+ "[2025-09-25 13:05:37,781 - summary] INFO: ----------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:37,781 - summary] INFO: whole_blood 9.93 0.17 Success \n",
+ "[2025-09-25 13:05:37,781 - summary] INFO: \n",
+ "Fastest: whole_blood (9.93 seconds)\n",
+ "[2025-09-25 13:05:37,781 - summary] INFO: Slowest: whole_blood (9.93 seconds)\n",
+ "[2025-09-25 13:05:37,781 - summary] INFO: Speed ratio: 1.0x\n",
+ "[2025-09-25 13:05:37,782 - summary] INFO: Runtime for CYTIP: 9.93 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:37,782 - summary] INFO: \n",
+ "====================================================================================================\n",
+ "[2025-09-25 13:05:37,782 - summary] INFO: OVERALL RESULTS SUMMARY\n",
+ "[2025-09-25 13:05:37,782 - summary] INFO: ====================================================================================================\n",
+ "[2025-09-25 13:05:37,783 - summary] INFO: Gene symbols processed: RASSF2, CYTIP\n",
+ "[2025-09-25 13:05:37,783 - summary] INFO: Total genes: 2\n",
+ "[2025-09-25 13:05:37,783 - summary] INFO: Permutations: 100,000\n",
+ "[2025-09-25 13:05:37,783 - summary] INFO: Tissues per gene: 1\n",
+ "[2025-09-25 13:05:37,784 - summary] INFO: All genes combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.pkl\n",
+ "[2025-09-25 13:05:37,785 - summary] INFO: All genes combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.csv\n",
+ "[2025-09-25 13:05:37,785 - summary] INFO: \n",
+ "Log files created:\n",
+ "[2025-09-25 13:05:37,785 - summary] INFO: RASSF2 - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+ "[2025-09-25 13:05:37,786 - summary] INFO: CYTIP - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+ "[2025-09-25 13:05:37,786 - summary] INFO: \n",
+ "Total successful analyses across all genes and tissues: 88\n",
+ "[2025-09-25 13:05:37,786 - summary] INFO: \n",
+ "====================================================================================================\n",
+ "[2025-09-25 13:05:37,787 - summary] INFO: TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)\n",
+ "[2025-09-25 13:05:37,787 - summary] INFO: ====================================================================================================\n",
+ "[2025-09-25 13:05:37,788 - summary] INFO: Gene Tissue Metadata Column CCC Value P-value Significance \n",
+ "[2025-09-25 13:05:37,788 - summary] INFO: --------------------------------------------------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:37,788 - summary] INFO: RASSF2 whole_blood SMTSISCH 0.528125 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,788 - summary] INFO: RASSF2 whole_blood DTHHRDY 0.464582 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,789 - summary] INFO: CYTIP whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,789 - summary] INFO: RASSF2 whole_blood SMNTERRT 0.250997 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,789 - summary] INFO: RASSF2 whole_blood SMNTRART 0.243071 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,789 - summary] INFO: CYTIP whole_blood SMTSISCH 0.215092 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,790 - summary] INFO: RASSF2 whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,790 - summary] INFO: CYTIP whole_blood DTHHRDY 0.184226 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,790 - summary] INFO: RASSF2 whole_blood SME1MPRT 0.181940 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SMALTALG 0.177009 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SME2MPRT 0.172974 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SMMAPRT 0.168576 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,792 - summary] INFO: CYTIP whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,792 - summary] INFO: RASSF2 whole_blood SMCENTER 0.108148 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,792 - summary] INFO: CYTIP whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,793 - summary] INFO: CYTIP whole_blood SMCENTER 0.084684 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,793 - summary] INFO: RASSF2 whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,793 - summary] INFO: RASSF2 whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,793 - summary] INFO: CYTIP whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,794 - summary] INFO: CYTIP whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2 whole_blood SMRIN 0.048847 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2 whole_blood SMRRNART 0.048437 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2 whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood AGE 0.039824 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,796 - summary] INFO: CYTIP whole_blood SMALTALG 0.038381 1.00e-05 *** \n",
+ "[2025-09-25 13:05:37,796 - summary] INFO: \n",
+ "====================================================================================================\n",
+ "[2025-09-25 13:05:37,796 - summary] INFO: SUMMARY BY GENE\n",
+ "[2025-09-25 13:05:37,796 - summary] INFO: ====================================================================================================\n",
+ "[2025-09-25 13:05:37,797 - summary] INFO: \n",
+ "Gene: RASSF2 (ID: ENSG00000101265.15)\n",
+ "[2025-09-25 13:05:37,797 - summary] INFO: Tissues processed: 1\n",
+ "[2025-09-25 13:05:37,797 - summary] INFO: Successful analyses: 44\n",
+ "[2025-09-25 13:05:37,797 - summary] INFO: Mean |CCC|: 0.079987\n",
+ "[2025-09-25 13:05:37,797 - summary] INFO: Max |CCC|: 0.528125\n",
+ "[2025-09-25 13:05:37,798 - summary] INFO: Top correlation: SMTSISCH in whole_blood (CCC: 0.528125, p: 1.00e-05)\n",
+ "[2025-09-25 13:05:37,798 - summary] INFO: Runtime: 9.96 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:37,799 - summary] INFO: \n",
+ "Gene: CYTIP (ID: ENSG00000115165.9)\n",
+ "[2025-09-25 13:05:37,799 - summary] INFO: Tissues processed: 1\n",
+ "[2025-09-25 13:05:37,799 - summary] INFO: Successful analyses: 44\n",
+ "[2025-09-25 13:05:37,800 - summary] INFO: Mean |CCC|: 0.032699\n",
+ "[2025-09-25 13:05:37,800 - summary] INFO: Max |CCC|: 0.261762\n",
+ "[2025-09-25 13:05:37,800 - summary] INFO: Top correlation: SMNTRNRT in whole_blood (CCC: 0.261762, p: 1.00e-05)\n",
+ "[2025-09-25 13:05:37,801 - summary] INFO: Runtime: 9.93 seconds (0.17 minutes)\n",
+ "[2025-09-25 13:05:37,801 - summary] INFO: \n",
+ "====================================================================================================\n",
+ "[2025-09-25 13:05:37,801 - summary] INFO: SUMMARY BY TISSUE (across all genes)\n",
+ "[2025-09-25 13:05:37,801 - summary] INFO: ====================================================================================================\n",
+ "[2025-09-25 13:05:37,801 - summary] INFO: Tissue N Genes Successful Mean |CCC| Max |CCC| \n",
+ "[2025-09-25 13:05:37,801 - summary] INFO: ---------------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:37,802 - summary] INFO: whole_blood 2 88 0.056343 0.528125 \n",
+ "[2025-09-25 13:05:37,802 - summary] INFO: \n",
+ "====================================================================================================\n",
+ "[2025-09-25 13:05:37,803 - summary] INFO: RUNTIME SUMMARY\n",
+ "[2025-09-25 13:05:37,803 - summary] INFO: ====================================================================================================\n",
+ "[2025-09-25 13:05:37,803 - summary] INFO: Total runtime: 19.92 seconds (0.33 minutes)\n",
+ "[2025-09-25 13:05:37,803 - summary] INFO: Average runtime per gene: 9.96 seconds\n",
+ "[2025-09-25 13:05:37,803 - summary] INFO: Total gene-tissue combinations: 2\n",
+ "[2025-09-25 13:05:37,803 - summary] INFO: \n",
+ "Runtime by gene:\n",
+ "[2025-09-25 13:05:37,803 - summary] INFO: Gene Runtime (sec) Runtime (min) Tissues Successful \n",
+ "[2025-09-25 13:05:37,804 - summary] INFO: ---------------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:37,804 - summary] INFO: RASSF2 9.96 0.17 1 1 \n",
+ "[2025-09-25 13:05:37,804 - summary] INFO: CYTIP 9.93 0.17 1 1 \n",
+ "[2025-09-25 13:05:37,804 - summary] INFO: \n",
+ "Average runtime by tissue (across all genes):\n",
+ "[2025-09-25 13:05:37,805 - summary] INFO: Tissue Avg Runtime (sec) Avg Runtime (min) N Runs Min Max \n",
+ "[2025-09-25 13:05:37,805 - summary] INFO: -----------------------------------------------------------------------------------------------\n",
+ "[2025-09-25 13:05:37,805 - summary] INFO: whole_blood 9.94 0.17 2 9.93 9.96 \n",
+ "[2025-09-25 13:05:37,805 - summary] INFO: \n",
+ "Fastest tissue (avg): whole_blood (9.94 seconds)\n",
+ "[2025-09-25 13:05:37,805 - summary] INFO: Slowest tissue (avg): whole_blood (9.94 seconds)\n",
+ "[2025-09-25 13:05:37,806 - summary] INFO: Speed ratio: 1.0x\n",
+ "[2025-09-25 13:05:37,806 - summary] INFO: \n",
+ "Summary log saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+ "[2025-09-25 13:05:37,806 - summary] INFO: Summary tables saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " CCC: 0.007812, p-value: 3.38e-02\n",
+ "\n",
+ "Completed processing whole_blood:\n",
+ " Total metadata columns: 66\n",
+ " Successful analyses: 44\n",
+ " Skipped/Failed: 22\n",
+ "Results for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood_correlation_results.pkl\n",
+ "Log file for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+ "Runtime for CYTIP in whole_blood: 9.93 seconds (0.17 minutes)\n",
+ "\n",
+ "================================================================================\n",
+ "COMBINED RESULTS SUMMARY\n",
+ "================================================================================\n",
+ "Gene Symbol: CYTIP\n",
+ "Gene ID: ENSG00000115165.9\n",
+ "Permutations: 100,000\n",
+ "Tissues processed: 1\n",
+ "Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.pkl\n",
+ "Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.csv\n",
+ "\n",
+ "Total successful analyses across all tissues: 44\n",
+ "\n",
+ "================================================================================\n",
+ "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n",
+ "================================================================================\n",
+ "Tissue Metadata Column CCC Value P-value Significance \n",
+ "------------------------------------------------------------------------------------------\n",
+ "whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n",
+ "whole_blood SMTSISCH 0.215092 1.00e-05 *** \n",
+ "whole_blood DTHHRDY 0.184226 1.00e-05 *** \n",
+ "whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n",
+ "whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n",
+ "whole_blood SMCENTER 0.084684 1.00e-05 *** \n",
+ "whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n",
+ "whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n",
+ "whole_blood SMALTALG 0.038381 1.00e-05 *** \n",
+ "whole_blood SMSPLTRD 0.030117 1.00e-05 *** \n",
+ "whole_blood SMNTRART 0.024407 3.00e-05 *** \n",
+ "whole_blood SMNTERRT 0.023433 4.00e-05 *** \n",
+ "whole_blood SME1MPRT 0.021952 4.00e-05 *** \n",
+ "whole_blood SME2MPRT 0.021744 5.00e-05 *** \n",
+ "whole_blood SMMAPRT 0.021052 7.00e-05 *** \n",
+ "whole_blood SMMPUNRT 0.021052 7.00e-05 *** \n",
+ "whole_blood AGE 0.018997 1.00e-05 *** \n",
+ "whole_blood SMCHMPRS 0.015946 4.50e-04 *** \n",
+ "whole_blood SMSFLGTH 0.015322 7.10e-04 *** \n",
+ "whole_blood SMRRNART 0.013729 1.10e-03 ** \n",
+ "\n",
+ "================================================================================\n",
+ "SUMMARY BY TISSUE\n",
+ "================================================================================\n",
+ "Tissue N Samples Successful Mean |CCC| Max |CCC| \n",
+ "----------------------------------------------------------------------\n",
+ "whole_blood 755 44 0.032699 0.261762 \n",
+ "\n",
+ "================================================================================\n",
+ "RUNTIME SUMMARY\n",
+ "================================================================================\n",
+ "Total runtime: 9.93 seconds (0.17 minutes)\n",
+ "Average runtime per tissue: 9.93 seconds\n",
+ "\n",
+ "Runtime by tissue:\n",
+ "Tissue Runtime (sec) Runtime (min) Status \n",
+ "----------------------------------------------------------------------\n",
+ "whole_blood 9.93 0.17 Success \n",
+ "\n",
+ "Fastest: whole_blood (9.93 seconds)\n",
+ "Slowest: whole_blood (9.93 seconds)\n",
+ "Speed ratio: 1.0x\n",
+ "Runtime for CYTIP: 9.93 seconds (0.17 minutes)\n",
+ "\n",
+ "====================================================================================================\n",
+ "OVERALL RESULTS SUMMARY\n",
+ "====================================================================================================\n",
+ "Gene symbols processed: RASSF2, CYTIP\n",
+ "Total genes: 2\n",
+ "Permutations: 100,000\n",
+ "Tissues per gene: 1\n",
+ "All genes combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.pkl\n",
+ "All genes combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.csv\n",
+ "\n",
+ "Log files created:\n",
+ " RASSF2 - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n",
+ " CYTIP - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n",
+ "\n",
+ "Total successful analyses across all genes and tissues: 88\n",
+ "\n",
+ "====================================================================================================\n",
+ "TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)\n",
+ "====================================================================================================\n",
+ "Gene Tissue Metadata Column CCC Value P-value Significance \n",
+ "--------------------------------------------------------------------------------------------------------------\n",
+ "RASSF2 whole_blood SMTSISCH 0.528125 1.00e-05 *** \n",
+ "RASSF2 whole_blood DTHHRDY 0.464582 1.00e-05 *** \n",
+ "CYTIP whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMNTERRT 0.250997 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMNTRART 0.243071 1.00e-05 *** \n",
+ "CYTIP whole_blood SMTSISCH 0.215092 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n",
+ "CYTIP whole_blood DTHHRDY 0.184226 1.00e-05 *** \n",
+ "RASSF2 whole_blood SME1MPRT 0.181940 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMALTALG 0.177009 1.00e-05 *** \n",
+ "RASSF2 whole_blood SME2MPRT 0.172974 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMMAPRT 0.168576 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n",
+ "CYTIP whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMCENTER 0.108148 1.00e-05 *** \n",
+ "CYTIP whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n",
+ "CYTIP whole_blood SMCENTER 0.084684 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n",
+ "CYTIP whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n",
+ "CYTIP whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMRIN 0.048847 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMRRNART 0.048437 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n",
+ "RASSF2 whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n",
+ "RASSF2 whole_blood AGE 0.039824 1.00e-05 *** \n",
+ "CYTIP whole_blood SMALTALG 0.038381 1.00e-05 *** \n",
+ "\n",
+ "====================================================================================================\n",
+ "SUMMARY BY GENE\n",
+ "====================================================================================================\n",
+ "\n",
+ "Gene: RASSF2 (ID: ENSG00000101265.15)\n",
+ " Tissues processed: 1\n",
+ " Successful analyses: 44\n",
+ " Mean |CCC|: 0.079987\n",
+ " Max |CCC|: 0.528125\n",
+ " Top correlation: SMTSISCH in whole_blood (CCC: 0.528125, p: 1.00e-05)\n",
+ " Runtime: 9.96 seconds (0.17 minutes)\n",
+ "\n",
+ "Gene: CYTIP (ID: ENSG00000115165.9)\n",
+ " Tissues processed: 1\n",
+ " Successful analyses: 44\n",
+ " Mean |CCC|: 0.032699\n",
+ " Max |CCC|: 0.261762\n",
+ " Top correlation: SMNTRNRT in whole_blood (CCC: 0.261762, p: 1.00e-05)\n",
+ " Runtime: 9.93 seconds (0.17 minutes)\n",
+ "\n",
+ "====================================================================================================\n",
+ "SUMMARY BY TISSUE (across all genes)\n",
+ "====================================================================================================\n",
+ "Tissue N Genes Successful Mean |CCC| Max |CCC| \n",
+ "---------------------------------------------------------------------------\n",
+ "whole_blood 2 88 0.056343 0.528125 \n",
+ "\n",
+ "====================================================================================================\n",
+ "RUNTIME SUMMARY\n",
+ "====================================================================================================\n",
+ "Total runtime: 19.92 seconds (0.33 minutes)\n",
+ "Average runtime per gene: 9.96 seconds\n",
+ "Total gene-tissue combinations: 2\n",
+ "\n",
+ "Runtime by gene:\n",
+ "Gene Runtime (sec) Runtime (min) Tissues Successful \n",
+ "---------------------------------------------------------------------------\n",
+ "RASSF2 9.96 0.17 1 1 \n",
+ "CYTIP 9.93 0.17 1 1 \n",
+ "\n",
+ "Average runtime by tissue (across all genes):\n",
+ "Tissue Avg Runtime (sec) Avg Runtime (min) N Runs Min Max \n",
+ "-----------------------------------------------------------------------------------------------\n",
+ "whole_blood 9.94 0.17 2 9.93 9.96 \n",
+ "\n",
+ "Fastest tissue (avg): whole_blood (9.94 seconds)\n",
+ "Slowest tissue (avg): whole_blood (9.94 seconds)\n",
+ "Speed ratio: 1.0x\n",
+ "\n",
+ "Summary log saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n",
+ "Summary tables saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n"
+ ]
+ }
+ ],
+ "source": [
+ "%run ./nbs/common/metadata_corr_cli.py RASSF2 CYTIP --include whole_blood --expr-data-dir {TISSUE_DATA_DIR} --data-dir {ANALYSIS_DIR} --output-dir {METADATA_CORRELATIONS_RESULT_DIR}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['RASSF2_all_tissues_correlation_results.csv',\n",
+ " 'CYTIP_whole_blood.log',\n",
+ " 'RASSF2_whole_blood_correlation_results.pkl',\n",
+ " 'RASSF2_all_tissues_correlation_results.pkl',\n",
+ " 'RASSF2_whole_blood.log',\n",
+ " 'CYTIP_all_tissues_correlation_results.pkl',\n",
+ " 'CYTIP_whole_blood_correlation_results.pkl',\n",
+ " '_all_genes_all_tissues_correlation_results.csv',\n",
+ " '_RASSF2_CYTIP_summary_tables.log',\n",
+ " '_RASSF2_CYTIP_summary_execution.log',\n",
+ " 'CYTIP_all_tissues_correlation_results.csv',\n",
+ " '_all_genes_all_tissues_correlation_results.pkl']"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# You can find the results in the `METADATA_CORRELATIONS_RESULT_DIR` directory\n",
+ "os.listdir(METADATA_CORRELATIONS_RESULT_DIR)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/nbs/common/README.md b/nbs/common/README.md
new file mode 100644
index 00000000..ea9f6262
--- /dev/null
+++ b/nbs/common/README.md
@@ -0,0 +1,527 @@
+# Common Analysis Tools
+
+This directory contains command-line tools for gene expression analysis using the CCC-GPU package.
+
+## Available Tools
+
+1. **[Single Gene Pair Correlation Analysis](#single-gene-pair-correlation-analysis)** (`compute_single_gene_pair_correlations_cli.py`)
+2. **[Gene Expression-Metadata Correlation Analysis](#gene-expression-metadata-correlation-analysis)** (`metadata_corr_cli.py`)
+
+---
+
+# Single Gene Pair Correlation Analysis
+
+A command-line tool for exploring gene expression data and computing correlations between specific gene pairs using CCC (Clustermatch Correlation Coefficient), Spearman, and Pearson correlation methods.
+
+## Features
+
+- **Data Exploration**: Browse available tissues and genes with their symbols
+- **Gene Pair Correlation**: Compute three correlation coefficients (CCC, Pearson, Spearman) for any gene pair
+- **Flexible Gene Input**: Accept both gene symbols (e.g., TP53) and Ensembl IDs (e.g., ENSG00000141510.16)
+- **Tissue-Specific Analysis**: Analyze correlations within specific tissue contexts
+- **Robust Gene Resolution**: Handle version numbers and case-insensitive matching
+- **Comprehensive Error Handling**: Clear error messages and debugging support
+
+## Installation Requirements
+
+```bash
+# Required packages
+pip install pandas numpy
+# CCC-GPU package (install from source as per project instructions)
+```
+
+## Quick Start
+
+### 1. Explore Available Data
+
+```bash
+# List all available tissues
+python compute_single_gene_pair_correlations_cli.py --list-tissues
+
+# Show genes available in whole blood tissue
+python compute_single_gene_pair_correlations_cli.py --show-genes whole_blood
+
+# Show more genes (default is 20)
+python compute_single_gene_pair_correlations_cli.py --show-genes liver --n-genes 50
+```
+
+### 2. Compute Gene Pair Correlations
+
+```bash
+# Basic correlation analysis between TP53 and BRCA1 in whole blood
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood
+
+# Use Ensembl IDs instead of symbols
+python compute_single_gene_pair_correlations_cli.py ENSG00000141510.16 ENSG00000012048.20 --tissue liver
+
+# Mixed input (symbol and Ensembl ID)
+python compute_single_gene_pair_correlations_cli.py TP53 ENSG00000012048.20 --tissue brain_cortex
+```
+
+### 3. Save Results and Logs
+
+```bash
+# Save results and logs to a specific directory
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood \
+ --output-dir ./results
+
+# Combine with debug logging for detailed output
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \
+ --output-dir ./detailed_analysis --debug
+```
+
+### 4. Custom Data Paths
+
+```bash
+# Use custom data directory and gene mapping file
+python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood \
+ --data-dir /custom/path/to/tissue/data \
+ --gene-mapping /custom/path/to/gene_mappings.pkl \
+ --output-dir ./custom_results
+```
+
+## Command Line Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `genes` | str+ | Required | Two gene symbols or Ensembl IDs for correlation analysis |
+| `--tissue` | str | Required | Tissue name for correlation analysis |
+| `--data-dir` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue` | Directory containing tissue expression data |
+| `--gene-mapping` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl` | Gene mapping file path |
+| `--output-dir` | str | None | Directory to save output files and logs (optional) |
+| `--list-tissues` | flag | False | List all available tissues and exit |
+| `--show-genes` | str | None | Show genes for specified tissue and exit |
+| `--n-genes` | int | 20 | Number of genes to display |
+| `--debug` | flag | False | Enable debug logging |
+
+## Output Format
+
+### Tissue and Gene Discovery
+```
+=== Available Tissues (49) ===
+ 1. adipose_subcutaneous
+ 2. adipose_visceral_omentum
+ 3. adrenal_gland
+ ...
+
+=== Tissue: whole_blood ===
+Total genes: 56,200
+Total samples: 755
+
+First 20 genes:
+------------------------------------------------------------
+# Gene Symbol Ensembl ID
+------------------------------------------------------------
+1 DDX11L1 ENSG00000223972.5
+2 WASH7P ENSG00000227232.5
+3 MIR6859-1 ENSG00000278267.1
+...
+```
+
+### Correlation Results
+```
+============================================================
+GENE PAIR CORRELATION RESULTS
+============================================================
+Gene 1: TP53 (ENSG00000141510.16)
+Gene 2: BRCA1 (ENSG00000012048.20)
+Tissue: whole_blood
+Samples: 755
+------------------------------------------------------------
+ CCC: 0.123456
+ PEARSON: 0.234567
+ SPEARMAN: 0.345678
+============================================================
+Results saved to:
+ JSON: TP53_BRCA1_whole_blood_20240925_143022_correlation_results.json
+ Pickle: TP53_BRCA1_whole_blood_20240925_143022_correlation_results.pkl
+Log file: gene_pair_correlation_analysis_20240925_143022.log
+```
+
+### Output Files (when --output-dir is used)
+
+1. **JSON Results File**: `{gene1}_{gene2}_{tissue}_{timestamp}_correlation_results.json`
+ - Human-readable format with all correlation results
+ - Can be easily imported into other tools or scripts
+
+2. **Pickle Results File**: `{gene1}_{gene2}_{tissue}_{timestamp}_correlation_results.pkl`
+ - Python-specific format preserving exact data types
+ - Optimal for downstream analysis in Python
+
+3. **Log File**: `gene_pair_correlation_analysis_{timestamp}.log`
+ - Detailed processing information and debug messages
+ - Useful for troubleshooting and audit trails
+
+Example JSON output:
+```json
+{
+ "gene1_symbol": "TP53",
+ "gene1_ensembl_id": "ENSG00000141510.16",
+ "gene2_symbol": "BRCA1",
+ "gene2_ensembl_id": "ENSG00000012048.20",
+ "tissue": "whole_blood",
+ "n_samples": 755,
+ "ccc": 0.123456,
+ "pearson": 0.234567,
+ "spearman": 0.345678
+}
+```
+
+## Input Data Format
+
+### Tissue Expression Files
+- **Format**: Pickle (.pkl) files
+- **Naming**: `gtex_v8_data_{tissue_name}.pkl`
+- **Structure**: DataFrame with Ensembl gene IDs as index, sample IDs as columns
+- **Content**: Log2-transformed gene expression values
+
+### Gene Mapping File
+- **Format**: Pickle (.pkl) file
+- **Structure**: DataFrame with columns `gene_ens_id` and `gene_symbol`
+- **Content**: Mapping between Ensembl gene IDs and HUGO gene symbols
+
+## Statistical Methods
+
+### Correlation Coefficients
+
+1. **CCC (Clustermatch Correlation Coefficient)**
+ - GPU-accelerated implementation
+ - Robust to outliers and able to capture non-linear relationships
+ - Particularly suited for detecting complex correlation patterns
+
+2. **Pearson Correlation**
+ - Standard linear correlation coefficient
+ - Measures linear relationship strength
+
+3. **Spearman Correlation**
+ - Rank-based correlation coefficient
+ - Robust to outliers; captures monotonic (not only linear) relationships
+
+## Example Workflows
+
+### 1. Cancer Gene Analysis
+```bash
+# List the available brain tissues
+python compute_single_gene_pair_correlations_cli.py --list-tissues | grep brain
+
+# Analyze TP53 interactions in different brain regions with output saving
+python compute_single_gene_pair_correlations_cli.py TP53 MDM2 --tissue brain_cortex \
+ --output-dir ./cancer_gene_analysis --debug
+python compute_single_gene_pair_correlations_cli.py TP53 CDKN1A --tissue brain_hippocampus \
+ --output-dir ./cancer_gene_analysis --debug
+```
+
+### 2. Housekeeping Gene Analysis
+```bash
+# Compare expression correlation of housekeeping genes
+python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue whole_blood
+python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue liver
+python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue muscle_skeletal
+```
+
+### 3. Tissue-Specific Gene Discovery
+```bash
+# Find genes in specific tissues and analyze their relationships
+python compute_single_gene_pair_correlations_cli.py --show-genes heart_left_ventricle --n-genes 100 | grep MYH
+python compute_single_gene_pair_correlations_cli.py MYH6 MYH7 --tissue heart_left_ventricle
+```
+
+## Error Handling
+
+The tool provides comprehensive error handling:
+
+- **Gene not found**: Suggestions to check spelling or use `--show-genes`
+- **Tissue not found**: List of available tissues
+- **Data issues**: Clear messages about insufficient samples or missing data
+- **Path issues**: Validation of data directory and gene mapping file
+
+## Performance Considerations
+
+- **Memory usage**: ~100-500MB depending on tissue size
+- **Computation time**: 1-5 seconds per gene pair
+- **CCC computation**: GPU-accelerated when available
+
+---
+
+# Gene Expression-Metadata Correlation Analysis
+
+## Overview
+
+This tool computes correlations between specific gene expression levels and all available metadata columns across multiple GTEx tissues. It uses the **Clustermatch Correlation Coefficient (CCC)** method, which is particularly suited for detecting non-linear relationships and complex correlation patterns.
+
+### Key Features
+
+- **Multi-gene Analysis**: Process multiple genes simultaneously
+- **Cross-tissue Analysis**: Analyze correlations across all available GTEx tissues
+- **Comprehensive Metadata Coverage**: Correlate against all metadata columns automatically
+- **Statistical Significance**: Permutation-based p-value calculation with customizable iterations
+- **Flexible Tissue Filtering**: Include/exclude tissues using pattern matching
+- **Parallel Processing**: Multi-threaded computation support
+- **Detailed Logging**: Individual logs per gene-tissue combination plus comprehensive summaries
+- **Multiple Output Formats**: Results in both pickle (.pkl) and CSV formats
+- **Runtime Tracking**: Detailed performance monitoring and optimization insights
+
+## Requirements
+
+### Dependencies
+
+```python
+pandas
+numpy
+ccc # Clustermatch Correlation Coefficient library
+```
+
+### Required Data Files
+
+The tool expects specific data files in predetermined locations:
+
+1. **Expression Data**: GTEx v8 expression files in the format `gtex_v8_data_{tissue_name}-var_pc_log2.pkl`
+2. **Metadata**: GTEx v8 sample metadata (`gtex_v8-sample_metadata.pkl`)
+3. **Gene Mappings**: Gene ID to symbol mappings (`gtex_gene_id_symbol_mappings.pkl`)
+
+## Installation
+
+```bash
+# Clone or download the script
+# Install the generic dependencies; the `ccc` module is provided by the CCC-GPU package (install it from source as per project instructions)
+pip install pandas numpy
+```
+
+## Usage
+
+### Basic Usage
+
+```bash
+# Analyze single gene across all tissues
+python metadata_corr_cli.py RASSF2
+
+# Analyze multiple genes
+python metadata_corr_cli.py RASSF2 TP53 BRCA1
+
+# Specify custom output directory
+python metadata_corr_cli.py RASSF2 --output-dir ./results
+```
+
+### Advanced Usage
+
+```bash
+# Include only specific tissues (pattern matching)
+python metadata_corr_cli.py RASSF2 --include brain liver
+
+# Exclude specific tissues
+python metadata_corr_cli.py RASSF2 --exclude cells brain
+
+# Custom permutation settings and parallel processing
+python metadata_corr_cli.py RASSF2 --permutations 500000 --n-jobs 16
+
+# Combined filtering and custom settings
+python metadata_corr_cli.py TP53 BRCA1 \
+ --include muscle heart \
+ --exclude cells \
+ --permutations 1000000 \
+ --n-jobs 32 \
+ --output-dir ./tp53_brca1_analysis
+```
+
+### Discovery Commands
+
+```bash
+# List all available tissues
+python metadata_corr_cli.py GENE --list-tissues
+
+# List all available metadata columns
+python metadata_corr_cli.py GENE --list-metadata-columns
+```
+
+## Command Line Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `gene_symbols` | str+ | Required | Gene symbol(s) to analyze (e.g., RASSF2 TP53) |
+| `--expr-data-dir` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue` | Directory containing expression data files |
+| `--include` | str* | None | Include only tissues matching these patterns |
+| `--exclude` | str* | None | Exclude tissues matching these patterns |
+| `--permutations` | int | 100,000 | Number of permutations for p-value calculation |
+| `--n-jobs` | int | 24 | Number of parallel jobs for computation |
+| `--output-dir` | str | `.` | Directory to save output files |
+| `--list-metadata-columns` | flag | False | List available metadata columns and exit |
+| `--list-tissues` | flag | False | List available tissue files and exit |
+
+## Input File Formats
+
+### Expression Data Files
+- **Format**: Pickle (.pkl) files
+- **Structure**: DataFrame with genes as rows, samples as columns
+- **Naming**: `gtex_v8_data_{tissue_name}-var_pc_log2.pkl`
+- **Content**: Log2-transformed, variance-filtered gene expression data
+
+### Metadata File
+- **Format**: Pickle (.pkl) file
+- **Structure**: DataFrame with samples as rows, metadata columns as columns
+- **Content**: All GTEx v8 sample metadata including demographics, sampling info, etc.
+
+### Gene Mapping File
+- **Format**: Pickle (.pkl) file
+- **Structure**: DataFrame with columns `gene_ens_id` and `gene_symbol`
+- **Content**: Mapping between Ensembl gene IDs and gene symbols
+
+## Output Files
+
+### Per Gene-Tissue Results
+- **Individual Results**: `{gene}_{tissue}_correlation_results.pkl`
+- **Individual Logs**: `{gene}_{tissue}.log`
+- **Content**: Correlation results for each metadata column
+
+### Per Gene Summaries
+- **Combined Results**: `{gene}_all_tissues_correlation_results.pkl`
+- **Combined CSV**: `{gene}_all_tissues_correlation_results.csv`
+- **Content**: All tissues combined for single gene
+
+### Overall Results
+- **Mega Results**: `_all_genes_all_tissues_correlation_results.pkl`
+- **Mega CSV**: `_all_genes_all_tissues_correlation_results.csv`
+- **Summary Log**: `_{genes}_summary_execution.log`
+- **Summary Tables**: `_{genes}_summary_tables.log`
+
+### Result DataFrame Structure
+
+```python
+# Each results DataFrame contains:
+{
+ 'ccc_value': float, # CCC correlation coefficient
+ 'p_value': float, # Permutation-based p-value
+ 'status': str, # 'success', 'all_nan', 'insufficient_variation', or 'error'
+ 'tissue': str, # Tissue name
+ 'gene_symbol': str, # Gene symbol
+ 'gene_id': str, # Ensembl gene ID
+ 'n_samples': int # Number of samples used
+}
+```
+
+## Analysis Workflow
+
+### 1. **Gene Discovery**
+- Converts gene symbols to Ensembl IDs using gene mapping
+- Validates gene existence across tissues
+
+### 2. **Tissue Processing**
+- Loads expression data for each tissue
+- Filters to common samples between expression and metadata
+- Handles missing data and insufficient variation gracefully
+
+### 3. **Correlation Analysis**
+- Computes CCC between gene expression and each metadata column
+- Calculates statistical significance via permutation testing
+- Handles various data types and edge cases
+
+### 4. **Results Compilation**
+- Aggregates results across tissues and genes
+- Generates comprehensive summary statistics
+- Creates ranked lists of strongest correlations
+
+### 5. **Performance Monitoring**
+- Tracks runtime for each gene-tissue combination
+- Identifies computational bottlenecks
+- Provides optimization recommendations
+
+## Statistical Methods
+
+### Clustermatch Correlation Coefficient (CCC)
+- **Purpose**: Detects both linear and non-linear relationships
+- **Advantages**: Robust to outliers, captures complex patterns
+- **Implementation**: Uses permutation-based significance testing
+
+### Significance Levels
+- `***`: p < 0.001 (highly significant)
+- `**`: p < 0.01 (significant)
+- `*`: p < 0.05 (marginally significant)
+- `ns`: p ≥ 0.05 (not significant)
+
+## Performance Considerations
+
+### Computational Requirements
+- **Memory**: ~2-8 GB depending on tissue size and number of genes
+- **CPU**: Benefits from multi-core systems (`--n-jobs` defaults to 24 parallel jobs)
+- **Time**: ~1-5 minutes per gene-tissue combination
+
+### Optimization Tips
+- **Parallel Processing**: Increase `--n-jobs` for faster computation
+- **Permutations**: Reduce `--permutations` for faster (less precise) p-values
+- **Tissue Filtering**: Use `--include`/`--exclude` to focus on relevant tissues
+- **Batch Processing**: Process multiple genes together for efficiency
+
+## Example Workflows
+
+### 1. Cancer Gene Analysis
+```bash
+# Analyze tumor suppressor genes across cancer-relevant tissues
+python metadata_corr_cli.py TP53 BRCA1 BRCA2 PTEN \
+ --include breast ovary lung liver \
+ --permutations 1000000 \
+ --n-jobs 32 \
+ --output-dir ./cancer_genes_analysis
+```
+
+### 2. Brain-Specific Gene Study
+```bash
+# Focus on brain tissues for neurological genes
+python metadata_corr_cli.py APOE MAPT SNCA \
+ --include brain \
+ --exclude cells \
+ --output-dir ./brain_genes
+```
+
+### 3. Exploratory Analysis
+```bash
+# Quick exploration with reduced permutations
+python metadata_corr_cli.py GENE_OF_INTEREST \
+ --permutations 10000 \
+ --n-jobs 8 \
+ --output-dir ./exploratory
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Gene Not Found**: Check gene symbol spelling and availability in gene mapping
+2. **No Expression Data**: Verify gene is expressed in selected tissues
+3. **Memory Errors**: Reduce number of parallel jobs or process fewer genes at once
+4. **File Not Found**: Ensure all required data files exist in expected locations
+
+### Error Codes
+- **Gene symbol not found**: Gene not in mapping file
+- **No common samples**: Expression and metadata samples don't overlap
+- **All NaN values**: Metadata column contains only missing values
+- **Insufficient variation**: Metadata column has at most one unique value
+
+## Output Interpretation
+
+### Top Results Tables
+- Results ranked by absolute CCC value
+- Include significance levels and tissue information
+- Show strongest correlations across all analyses
+
+### Summary Statistics
+- **Mean |CCC|**: Average absolute correlation strength
+- **Max |CCC|**: Strongest correlation found
+- **Success Rate**: Proportion of successful analyses
+- **Runtime Metrics**: Performance characteristics
+
+## Citation
+
+If you use this tool in your research, please cite the CCC method and relevant GTEx publications.
+
+## Version Information
+
+- **Script**: metadata_corr_cli.py
+- **Converted from**: 00-data-exploration.ipynb
+- **GTEx Version**: v8
+- **CCC Implementation**: Uses ccc.coef module
+
+## Support
+
+For issues related to:
+- **CCC Method**: Refer to CCC library documentation
+- **GTEx Data**: Consult GTEx consortium resources
+- **Script Usage**: Check this README or examine log files for detailed error messages
\ No newline at end of file
diff --git a/nbs/common/compute_single_gene_pair_correlations_cli.py b/nbs/common/compute_single_gene_pair_correlations_cli.py
new file mode 100755
index 00000000..2cd2c690
--- /dev/null
+++ b/nbs/common/compute_single_gene_pair_correlations_cli.py
@@ -0,0 +1,624 @@
+#!/usr/bin/env python3
+"""
+Single Gene Pair Correlation Analysis Tool
+
+A command-line tool for exploring gene expression data and computing correlations
+between specific gene pairs using CCC (Clustered Correlation Coefficient),
+Spearman, and Pearson correlation methods.
+
+This script provides two main functionalities:
+1. Data exploration: Show available genes and their symbols for a tissue
+2. Correlation analysis: Compute correlations for a specific gene pair in a tissue
+
+Author: Generated for CCC-GPU project
+Version: 1.0
+"""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Union
+
+import pandas as pd
+import numpy as np
+
+# Import correlation methods
+try:
+ from ccc.corr import ccc_gpu, pearson, spearman
+except ImportError:
+ print("Error: CCC library not found. Please install the ccc package.")
+ sys.exit(1)
+
+
+def setup_logging(debug: bool = False, output_dir: Optional[Path] = None) -> Optional[Path]:
+    """Configure root logging to stdout and, optionally, to a timestamped file.
+
+    Handlers already attached to the root logger are removed *and closed* by
+    ``logging.basicConfig(force=True)``, so calling this function repeatedly
+    does not leave earlier log files open.
+
+    Args:
+        debug: Enable debug level logging if True
+        output_dir: Directory to write log files to (optional); created,
+            including parents, when it does not exist
+
+    Returns:
+        Path to log file if output_dir provided, None otherwise
+    """
+    from datetime import datetime
+
+    level = logging.DEBUG if debug else logging.INFO
+
+    # Console handler: time-only stamps keep terminal output compact.
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(
+        logging.Formatter(
+            '%(asctime)s - %(levelname)s - %(message)s',
+            datefmt='%H:%M:%S',
+        )
+    )
+    handlers = [console_handler]
+
+    # File handler (if output directory provided): full date in each record.
+    log_file = None
+    if output_dir:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        log_file = output_dir / f"gene_pair_correlation_analysis_{timestamp}.log"
+
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter(
+                '%(asctime)s - %(levelname)s - %(message)s',
+                datefmt='%Y-%m-%d %H:%M:%S',
+            )
+        )
+        handlers.append(file_handler)
+
+    # Do NOT detach old handlers by hand first: force=True both removes and
+    # closes them, whereas a manual removeHandler() loop would only detach
+    # them and leak any open file streams.
+    logging.basicConfig(
+        level=level,
+        handlers=handlers,
+        force=True,
+    )
+
+    if log_file:
+        logging.info(f"Log file created: {log_file}")
+
+    return log_file
+
+
+class GeneExpressionAnalyzer:
+    """Load per-tissue expression tables and correlate pairs of genes.
+
+    Tissue tables are pickled DataFrames named ``gtex_v8_data_<tissue>.pkl``
+    inside ``data_dir``; the code treats the index as gene IDs and the columns
+    as samples. The gene mapping is a pickled DataFrame with ``gene_ens_id``
+    and ``gene_symbol`` columns.
+
+    NOTE(security): ``pd.read_pickle`` can execute arbitrary code while
+    loading, so only point this class at data files you trust.
+    """
+
+    def __init__(self, data_dir: str, gene_mapping_file: str):
+        """Initialize the analyzer with data directory and gene mapping file.
+
+        Args:
+            data_dir: Directory containing tissue expression data files
+            gene_mapping_file: Path to gene ID to symbol mapping file
+
+        Raises:
+            FileNotFoundError: If either path does not exist
+        """
+        self.data_dir = Path(data_dir)
+        self.gene_mapping_file = Path(gene_mapping_file)
+        self._gene_mapping = None  # loaded lazily by the gene_mapping property
+        self._validate_inputs()
+
+    def _validate_inputs(self) -> None:
+        """Validate that input paths exist and are accessible."""
+        if not self.data_dir.exists():
+            raise FileNotFoundError(f"Data directory not found: {self.data_dir}")
+
+        if not self.gene_mapping_file.exists():
+            raise FileNotFoundError(f"Gene mapping file not found: {self.gene_mapping_file}")
+
+    @property
+    def gene_mapping(self) -> pd.DataFrame:
+        """Load the gene mapping on first access and cache it afterwards."""
+        if self._gene_mapping is None:
+            logging.info(f"Loading gene mapping from: {self.gene_mapping_file}")
+            self._gene_mapping = pd.read_pickle(self.gene_mapping_file)
+            logging.info(f"Loaded {len(self._gene_mapping)} gene mappings")
+        return self._gene_mapping
+
+    def list_available_tissues(self) -> list:
+        """Get list of available tissue files.
+
+        Returns:
+            Sorted list of tissue names (file stem without the
+            ``gtex_v8_data_`` prefix)
+        """
+        return sorted(
+            path.stem.replace("gtex_v8_data_", "")
+            for path in self.data_dir.glob("gtex_v8_data_*.pkl")
+        )
+
+    def _find_tissue_file(self, tissue: str) -> Path:
+        """Find the tissue file for a given tissue name.
+
+        An exact file name match wins; otherwise ``tissue`` is matched as a
+        substring of the available file names.
+
+        Args:
+            tissue: Tissue name
+
+        Returns:
+            Path to tissue file
+
+        Raises:
+            ValueError: If several files match the partial name
+            FileNotFoundError: If tissue file is not found
+        """
+        # Try exact match first
+        exact_file = self.data_dir / f"gtex_v8_data_{tissue}.pkl"
+        if exact_file.exists():
+            return exact_file
+
+        # Try partial matching
+        candidates = list(self.data_dir.glob(f"gtex_v8_data_*{tissue}*.pkl"))
+        if len(candidates) == 1:
+            return candidates[0]
+        if len(candidates) > 1:
+            matches = [f.stem for f in candidates]
+            raise ValueError(
+                f"Multiple tissue files match '{tissue}': {matches}. "
+                "Please be more specific."
+            )
+
+        # Build the listing separately: a conditional expression applied
+        # directly to the concatenated f-strings would drop the
+        # "No tissue file found" prefix whenever there are <= 10 tissues.
+        available = self.list_available_tissues()
+        if len(available) > 10:
+            listing = f"Available tissues: {available[:10]}..."
+        else:
+            listing = f"Available tissues: {available}"
+        raise FileNotFoundError(f"No tissue file found for '{tissue}'. {listing}")
+
+    def show_tissue_genes(self, tissue: str, n_genes: int = 20) -> None:
+        """Display available genes and their symbols for a tissue.
+
+        Args:
+            tissue: Tissue name
+            n_genes: Number of genes to display (default: 20)
+        """
+        # Load tissue data
+        tissue_file = self._find_tissue_file(tissue)
+        logging.info(f"Loading tissue data from: {tissue_file}")
+
+        tissue_data = pd.read_pickle(tissue_file)
+        logging.info(f"Tissue data shape: {tissue_data.shape}")
+
+        gene_ids = tissue_data.index.tolist()
+
+        # Index the mapping by Ensembl ID for direct lookups
+        by_ens_id = self.gene_mapping.set_index('gene_ens_id')
+
+        def lookup_symbol(ens_id):
+            """Return the symbol for ens_id, or None when it is not indexed."""
+            if ens_id not in by_ens_id.index:
+                return None
+            found = by_ens_id.loc[ens_id, 'gene_symbol']
+            # A duplicated Ensembl ID yields a Series; take its first entry
+            # so the fixed-width formatting below keeps working.
+            return found.iloc[0] if isinstance(found, pd.Series) else found
+
+        print(f"\n=== Tissue: {tissue} ===")
+        print(f"Total genes: {len(gene_ids):,}")
+        print(f"Total samples: {tissue_data.shape[1]:,}")
+        print(f"\nFirst {n_genes} genes:")
+        print("-" * 60)
+        print(f"{'#':<4} {'Gene Symbol':<15} {'Ensembl ID':<20}")
+        print("-" * 60)
+
+        for i, gene_id in enumerate(gene_ids[:n_genes], 1):
+            # Remove version from gene ID for mapping lookup
+            clean_gene_id = gene_id.split('.')[0]
+
+            symbol = lookup_symbol(gene_id)
+            if symbol is None:
+                symbol = lookup_symbol(clean_gene_id)
+            if symbol is None:
+                # Last resort: prefix search over the unindexed mapping
+                matches = self.gene_mapping[
+                    self.gene_mapping['gene_ens_id'].str.startswith(clean_gene_id)
+                ]
+                symbol = matches.iloc[0]['gene_symbol'] if len(matches) > 0 else "N/A"
+
+            print(f"{i:<4} {symbol:<15} {gene_id:<20}")
+
+        if len(gene_ids) > n_genes:
+            print(f"... and {len(gene_ids) - n_genes:,} more genes")
+        print()
+
+    def _resolve_gene(self, gene_input: str) -> Tuple[str, str]:
+        """Resolve gene input to Ensembl ID and symbol.
+
+        Inputs starting with ``ENSG`` are treated as Ensembl IDs (exact match,
+        then prefix match on the unversioned ID); anything else is treated as
+        a gene symbol (exact match, then case-insensitive match). The first
+        matching mapping row is used.
+
+        Args:
+            gene_input: Gene symbol or Ensembl ID
+
+        Returns:
+            Tuple of (ensembl_id, gene_symbol)
+
+        Raises:
+            ValueError: If gene cannot be resolved
+        """
+        mapping = self.gene_mapping
+
+        if gene_input.startswith('ENSG'):
+            matches = mapping[mapping['gene_ens_id'] == gene_input]
+            if len(matches) == 0:
+                # Try without version
+                base_id = gene_input.split('.')[0]
+                matches = mapping[mapping['gene_ens_id'].str.startswith(base_id)]
+            if len(matches) == 0:
+                raise ValueError(f"Ensembl ID '{gene_input}' not found in mapping")
+        else:
+            # Assume it's a gene symbol
+            matches = mapping[mapping['gene_symbol'] == gene_input]
+            if len(matches) == 0:
+                # Try case-insensitive search
+                matches = mapping[
+                    mapping['gene_symbol'].str.upper() == gene_input.upper()
+                ]
+            if len(matches) == 0:
+                raise ValueError(
+                    f"Gene symbol '{gene_input}' not found. "
+                    "Use --show-genes to see available genes."
+                )
+
+        first = matches.iloc[0]
+        return first['gene_ens_id'], first['gene_symbol']
+
+    def compute_gene_pair_correlations(
+        self,
+        gene1: str,
+        gene2: str,
+        tissue: str
+    ) -> Dict[str, Union[float, str]]:
+        """Compute correlations between two genes in a specific tissue.
+
+        Args:
+            gene1: First gene (symbol or Ensembl ID)
+            gene2: Second gene (symbol or Ensembl ID)
+            tissue: Tissue name
+
+        Returns:
+            Dictionary with gene identifiers, tissue, ``n_samples`` and one
+            entry per method (``ccc``, ``pearson``, ``spearman``); a method
+            that raised is reported as ``None``
+
+        Raises:
+            ValueError: If a gene cannot be resolved or found, the genes share
+                no samples, or fewer than 3 non-NaN sample pairs remain
+            FileNotFoundError: If the tissue file is not found
+        """
+        # Resolve genes
+        gene1_id, gene1_symbol = self._resolve_gene(gene1)
+        gene2_id, gene2_symbol = self._resolve_gene(gene2)
+
+        # Load tissue data
+        tissue_file = self._find_tissue_file(tissue)
+        logging.info(f"Loading tissue data from: {tissue_file}")
+
+        tissue_data = pd.read_pickle(tissue_file)
+        logging.info(f"Tissue data shape: {tissue_data.shape}")
+
+        # Extract gene expression data
+        gene1_expr = self._extract_gene_expression(tissue_data, gene1_id, gene1_symbol)
+        gene2_expr = self._extract_gene_expression(tissue_data, gene2_id, gene2_symbol)
+
+        # Ensure we have the same samples
+        common_samples = gene1_expr.index.intersection(gene2_expr.index)
+        if len(common_samples) == 0:
+            raise ValueError("No common samples between the two genes")
+
+        gene1_values = gene1_expr.loc[common_samples].values
+        gene2_values = gene2_expr.loc[common_samples].values
+
+        # Keep only the samples where both genes have a value
+        mask = ~(np.isnan(gene1_values) | np.isnan(gene2_values))
+        gene1_clean = gene1_values[mask]
+        gene2_clean = gene2_values[mask]
+
+        if len(gene1_clean) < 3:
+            raise ValueError("Insufficient valid data points for correlation analysis")
+
+        logging.info(f"Computing correlations for {len(gene1_clean)} samples")
+
+        results = {
+            'gene1_symbol': gene1_symbol,
+            'gene1_ensembl_id': gene1_id,
+            'gene2_symbol': gene2_symbol,
+            'gene2_ensembl_id': gene2_id,
+            'tissue': tissue,
+            'n_samples': len(gene1_clean),
+        }
+
+        # Genes as rows, samples as columns: the layout the ccc.corr
+        # functions are called with. Built in one shot from a 2-D array
+        # instead of one dict entry per sample.
+        data_df = pd.DataFrame(
+            np.vstack([gene1_clean, gene2_clean]),
+            index=[gene1_symbol, gene2_symbol],
+            columns=[f'sample_{i}' for i in range(len(gene1_clean))],
+        )
+
+        # Each method returns a square gene-by-gene frame; the off-diagonal
+        # element [0, 1] is the pair's coefficient. Lambdas defer name lookup
+        # so any failure is caught per method, as before.
+        methods = (
+            ('ccc', 'CCC', lambda df: ccc_gpu(df, n_jobs=1)),  # single job for one pair
+            ('pearson', 'Pearson', lambda df: pearson(df)),
+            ('spearman', 'Spearman', lambda df: spearman(df)),
+        )
+        for key, label, compute in methods:
+            try:
+                logging.info(f"Computing {label} correlation...")
+                results[key] = float(compute(data_df).iloc[0, 1])
+            except Exception as e:
+                # Best effort: one failing method must not hide the others
+                logging.warning(f"{label} computation failed: {e}")
+                results[key] = None
+
+        return results
+
+    def _extract_gene_expression(self, tissue_data: pd.DataFrame, gene_id: str, gene_symbol: str) -> pd.Series:
+        """Extract expression data for a specific gene.
+
+        Args:
+            tissue_data: Tissue expression DataFrame
+            gene_id: Ensembl gene ID
+            gene_symbol: Gene symbol (used in messages only)
+
+        Returns:
+            Series with gene expression values
+
+        Raises:
+            ValueError: If gene is not found in tissue data
+        """
+        # Try exact match first
+        if gene_id in tissue_data.index:
+            return tissue_data.loc[gene_id]
+
+        # Try without version
+        base_id = gene_id.split('.')[0]
+        matches = [idx for idx in tissue_data.index if idx.startswith(base_id)]
+
+        if not matches:
+            raise ValueError(f"Gene {gene_symbol} ({gene_id}) not found in tissue data")
+        if len(matches) > 1:
+            logging.warning(f"Multiple matches for {gene_symbol} ({gene_id}), using first match")
+        return tissue_data.loc[matches[0]]
+
+
+def save_results(results: Dict[str, Union[float, str]], output_dir: Path) -> Tuple[Path, Path]:
+    """Save correlation results to a JSON file and a pickle file.
+
+    File names are built from the two gene symbols, the tissue and a
+    timestamp. In the JSON copy, a coefficient that could not be computed
+    (``None``) or is not finite (NaN/inf) is written as ``null`` so the file
+    stays valid JSON and readers can tell "missing" from a string. The
+    pickle copy stores ``results`` unchanged.
+
+    Args:
+        results: Dictionary containing correlation results; must contain
+            ``gene1_symbol``, ``gene2_symbol`` and ``tissue``
+        output_dir: Directory to save files (created if missing)
+
+    Returns:
+        Tuple of (json_file_path, pickle_file_path)
+    """
+    import json
+    import math
+    import pickle
+    from datetime import datetime
+
+    def to_json_value(value):
+        """Map a result value onto something JSON can represent faithfully."""
+        if value is None:
+            return None
+        if isinstance(value, float) and not math.isfinite(value):
+            return None
+        if isinstance(value, (int, float, str)):
+            return value
+        return str(value)
+
+    # Ensure output directory exists
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create filenames
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    base_filename = (
+        f"{results['gene1_symbol']}_{results['gene2_symbol']}"
+        f"_{results['tissue']}_{timestamp}"
+    )
+    json_file = output_dir / f"{base_filename}_correlation_results.json"
+    pickle_file = output_dir / f"{base_filename}_correlation_results.pkl"
+
+    # Save as JSON (human readable)
+    json_data = {key: to_json_value(value) for key, value in results.items()}
+    with open(json_file, 'w') as f:
+        json.dump(json_data, f, indent=2)
+
+    # Save as pickle (preserves data types)
+    with open(pickle_file, 'wb') as f:
+        pickle.dump(results, f)
+
+    logging.info(f"Results saved to: {json_file}")
+    logging.info(f"Results saved to: {pickle_file}")
+
+    return json_file, pickle_file
+
+
+def main():
+ """Parse command line arguments and run the requested action.
+
+ Three modes are supported, checked in this order:
+ 1. --list-tissues: print the available tissues and return None
+ 2. --show-genes TISSUE: print the first --n-genes genes and return None
+ 3. two positional genes plus --tissue: compute CCC, Pearson and Spearman
+ for the pair, print a report, optionally save it under --output-dir,
+ and return the results dictionary
+
+ Any Exception raised along the way is logged and the process exits with
+ status 1 (with a traceback when --debug is set).
+ """
+ parser = argparse.ArgumentParser(
+ description="Single Gene Pair Correlation Analysis Tool",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Show available tissues
+ python compute_single_gene_pair_correlations_cli.py --list-tissues
+
+ # Show genes in whole blood tissue
+ python compute_single_gene_pair_correlations_cli.py --show-genes whole_blood
+
+ # Compute correlations between TP53 and BRCA1 in whole blood
+ python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood
+
+ # Save results and logs to output directory
+ python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \\
+ --output-dir ./results --debug
+
+ # Use custom data directory and gene mapping
+ python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \\
+ --data-dir /custom/path/data \\
+ --gene-mapping /custom/path/mappings.pkl \\
+ --output-dir ./custom_results
+ """
+ )
+
+ # Positional arguments for gene pair analysis.
+ # nargs='*' lets the discovery modes run without genes; the "exactly two"
+ # requirement is enforced further down, only for correlation analysis.
+ parser.add_argument(
+ 'genes',
+ nargs='*',
+ help='Two gene symbols or Ensembl IDs for correlation analysis (e.g., TP53 BRCA1)'
+ )
+
+ # Main options
+ parser.add_argument(
+ '--tissue',
+ type=str,
+ help='Tissue name for analysis (required for correlation analysis)'
+ )
+
+ # NOTE(review): the two defaults below are absolute paths on a specific
+ # machine; other environments must pass --data-dir / --gene-mapping.
+ parser.add_argument(
+ '--data-dir',
+ type=str,
+ default='/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue',
+ help='Directory containing tissue expression data files'
+ )
+
+ parser.add_argument(
+ '--gene-mapping',
+ type=str,
+ default='/mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl',
+ help='Path to gene ID to symbol mapping file'
+ )
+
+ # Discovery options
+ parser.add_argument(
+ '--list-tissues',
+ action='store_true',
+ help='List all available tissues and exit'
+ )
+
+ parser.add_argument(
+ '--show-genes',
+ type=str,
+ metavar='TISSUE',
+ help='Show available genes for specified tissue and exit'
+ )
+
+ parser.add_argument(
+ '--n-genes',
+ type=int,
+ default=20,
+ help='Number of genes to show (default: 20)'
+ )
+
+ # Output options
+ parser.add_argument(
+ '--output-dir',
+ type=str,
+ help='Directory to save output files and logs (optional)'
+ )
+
+ # Utility options
+ parser.add_argument(
+ '--debug',
+ action='store_true',
+ help='Enable debug logging'
+ )
+
+ args = parser.parse_args()
+
+ # Setup output directory (None means: print only, write no files)
+ output_dir = Path(args.output_dir) if args.output_dir else None
+
+ # Setup logging; log_file is None unless an output directory was given
+ log_file = setup_logging(debug=args.debug, output_dir=output_dir)
+
+ try:
+ # Initialize analyzer. The constructor validates that both the data
+ # directory and the gene mapping file exist, so even --list-tissues
+ # fails when the mapping file is missing.
+ analyzer = GeneExpressionAnalyzer(args.data_dir, args.gene_mapping)
+
+ # Handle discovery commands
+ if args.list_tissues:
+ tissues = analyzer.list_available_tissues()
+ print(f"\n=== Available Tissues ({len(tissues)}) ===")
+ for i, tissue in enumerate(tissues, 1):
+ print(f"{i:2d}. {tissue}")
+ print()
+ return
+
+ if args.show_genes:
+ analyzer.show_tissue_genes(args.show_genes, args.n_genes)
+ return
+
+ # Handle correlation analysis.
+ # parser.error() raises SystemExit, which is not an Exception subclass,
+ # so it is not intercepted by the except clause at the end of this try.
+ if len(args.genes) != 2:
+ parser.error(
+ "Exactly two genes are required for correlation analysis. "
+ "Use --show-genes to see available genes, or --list-tissues to see available tissues."
+ )
+
+ if not args.tissue:
+ parser.error(
+ "Tissue is required for correlation analysis. "
+ "Use --list-tissues to see available tissues."
+ )
+
+ gene1, gene2 = args.genes
+ results = analyzer.compute_gene_pair_correlations(gene1, gene2, args.tissue)
+
+ # Save results to files if output directory provided.
+ # A failure to save is logged but does not abort: the report is still
+ # printed below and saved_files stays None.
+ saved_files = None
+ if output_dir:
+ try:
+ saved_files = save_results(results, output_dir)
+ logging.info(f"Results saved to output directory: {output_dir}")
+ except Exception as e:
+ logging.error(f"Failed to save results: {e}")
+
+ # Print results
+ print("\n" + "="*60)
+ print("GENE PAIR CORRELATION RESULTS")
+ print("="*60)
+ print(f"Gene 1: {results['gene1_symbol']} ({results['gene1_ensembl_id']})")
+ print(f"Gene 2: {results['gene2_symbol']} ({results['gene2_ensembl_id']})")
+ print(f"Tissue: {results['tissue']}")
+ print(f"Samples: {results['n_samples']:,}")
+ print("-" * 60)
+
+ # A method whose computation raised is stored as None by the analyzer
+ for method in ['ccc', 'pearson', 'spearman']:
+ value = results.get(method)
+ if value is not None:
+ print(f"{method.upper():>12}: {value:.6f}")
+ else:
+ print(f"{method.upper():>12}: Failed to compute")
+
+ print("="*60)
+
+ # Show saved files info (file names only, not full paths)
+ if saved_files:
+ print(f"Results saved to:")
+ print(f" JSON: {saved_files[0].name}")
+ print(f" Pickle: {saved_files[1].name}")
+
+ if log_file:
+ print(f"Log file: {log_file.name}")
+
+ print()
+
+ # Also return as dict for programmatic use
+ return results
+
+ except Exception as e:
+ # Single exit point for runtime failures: log, optionally show the
+ # traceback, and signal failure to the shell with exit status 1.
+ logging.error(f"Error: {e}")
+ if args.debug:
+ import traceback
+ traceback.print_exc()
+ sys.exit(1)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/nbs/common/metadata_corr_cli.py b/nbs/common/metadata_corr_cli.py
new file mode 100755
index 00000000..669c9ccd
--- /dev/null
+++ b/nbs/common/metadata_corr_cli.py
@@ -0,0 +1,1191 @@
+#!/usr/bin/env python3
+"""
+CLI tool for exploring gene expression correlations with metadata.
+Converted from 00-data-exploration.ipynb
+"""
+
+import argparse
+import sys
+import warnings
+import re
+import time
+import logging
+from pathlib import Path
+import pandas as pd
+import numpy as np
+# from ccc.coef import ccc
+from ccc.coef.impl_gpu import ccc
+
+# Suppress specific NumPy warnings. These filters are installed when the
+# module is imported and stay active for the rest of the process.
+warnings.filterwarnings("ignore", message="invalid value encountered in cast")
+warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")
+
+# Global quiet flag for batch processing. main() sets it from --quiet, and
+# log_and_print() falls back to it when its own `quiet` argument is None.
+QUIET_MODE = False
+
+
+def find_expression_files(expr_data_dir, include_patterns=None, exclude_patterns=None, quiet=False):
+    """Find expression files matching include/exclude patterns.
+
+    Files must be named ``gtex_v8_data_<tissue>.pkl``. Each include/exclude
+    entry is a regular expression searched case-insensitively against both
+    the tissue name and the file name.
+
+    Args:
+        expr_data_dir: Directory to scan for ``*.pkl`` files
+        include_patterns: If given and non-empty, keep only files matching
+            at least one of these patterns
+        exclude_patterns: If given and non-empty, drop files matching any of
+            these patterns (applied after the include filter)
+        quiet: Suppress the printed listing of selected files
+
+    Returns:
+        List of ``(file_path, tissue_name)`` tuples, sorted by file path
+
+    Raises:
+        FileNotFoundError: If the directory does not exist or contains no
+            file with the expected name
+    """
+    expr_data_dir = Path(expr_data_dir)
+
+    if not expr_data_dir.exists():
+        raise FileNotFoundError(f"Expression data directory not found: {expr_data_dir}")
+
+    # Find all .pkl files with the expected pattern. Sorting makes the
+    # processing order deterministic instead of filesystem-dependent.
+    filename_re = re.compile(r"gtex_v8_data_(.+)\.pkl$")
+    all_files = []
+    for file_path in sorted(expr_data_dir.glob("*.pkl")):
+        match = filename_re.match(file_path.name)
+        if match:
+            all_files.append((file_path, match.group(1)))
+
+    if not all_files:
+        raise FileNotFoundError(
+            f"No matching expression files found in {expr_data_dir}"
+        )
+
+    def matches_any(patterns, file_path, tissue_name):
+        """True if any pattern hits the tissue name or the file name."""
+        # re.IGNORECASE rather than lower-casing the pattern text:
+        # lower-casing would turn escapes such as \D, \S or \W into
+        # their opposites (\d, \s, \w).
+        return any(
+            re.search(pat, tissue_name, re.IGNORECASE)
+            or re.search(pat, file_path.name, re.IGNORECASE)
+            for pat in patterns
+        )
+
+    # Apply include patterns
+    if include_patterns:
+        all_files = [
+            entry for entry in all_files if matches_any(include_patterns, *entry)
+        ]
+
+    # Apply exclude patterns
+    if exclude_patterns:
+        all_files = [
+            entry for entry in all_files if not matches_any(exclude_patterns, *entry)
+        ]
+
+    if not quiet:
+        print(f"Found {len(all_files)} expression files to process:")
+        for file_path, tissue_name in all_files:
+            print(f" {tissue_name}: {file_path.name}")
+
+    return all_files
+
+
+def load_metadata_and_gene_map(data_dir, quiet=False):
+    """Read the sample metadata and the gene ID/symbol mapping from data_dir.
+
+    Both files are pickles with fixed names inside ``data_dir``. Existence of
+    both is checked (metadata first) before anything is loaded.
+
+    Returns:
+        Tuple ``(gtex_metadata, gene_map)`` as loaded by ``pd.read_pickle``
+
+    Raises:
+        FileNotFoundError: Naming the first required file that is missing
+    """
+    base = Path(data_dir)
+    metadata_path = base / "gtex_v8-sample_metadata.pkl"
+    gene_map_path = base / "gtex_gene_id_symbol_mappings.pkl"
+
+    # Report the first missing input, in the same order the files are listed
+    first_missing = next(
+        (path for path in (metadata_path, gene_map_path) if not path.exists()),
+        None,
+    )
+    if first_missing is not None:
+        raise FileNotFoundError(f"Required file not found: {first_missing}")
+
+    verbose = not quiet
+    if verbose:
+        print("Loading metadata and gene mapping files...")
+
+    gtex_metadata = pd.read_pickle(metadata_path)
+    gene_map = pd.read_pickle(gene_map_path)
+
+    if verbose:
+        print(f"Loaded metadata: {gtex_metadata.shape}")
+        print(f"Loaded gene mapping: {gene_map.shape}")
+
+    return gtex_metadata, gene_map
+
+
+def setup_tissue_logger(gene_symbol, tissue_name, output_dir, no_individual_logs=False):
+    """Set up a logger for a specific gene-tissue combination.
+
+    The logger is named ``tissue_<gene>_<tissue>``. Handlers left over from a
+    previous call for the same pair are closed before being detached, so
+    re-processing a pair does not leak open log files.
+
+    Args:
+        gene_symbol: Gene symbol, used in the logger and file names
+        tissue_name: Tissue name, used in the logger and file names
+        output_dir: Directory for the log file (``str`` or ``Path``)
+        no_individual_logs: If True, attach no file handler
+
+    Returns:
+        Tuple ``(logger, log_file)``; ``log_file`` is None when individual
+        logs are disabled
+    """
+    logger = logging.getLogger(f"tissue_{gene_symbol}_{tissue_name}")
+
+    # Close, then detach, any existing handlers. Clearing the list alone
+    # would leave their file streams open.
+    for stale in list(logger.handlers):
+        stale.close()
+        logger.removeHandler(stale)
+
+    logger.setLevel(logging.INFO)
+
+    log_file = None
+    if not no_individual_logs:
+        # mode="w": each run starts a fresh log for this gene/tissue pair
+        log_file = Path(output_dir) / f"{gene_symbol}_{tissue_name}.log"
+        file_handler = logging.FileHandler(log_file, mode="w")
+        file_handler.setLevel(logging.INFO)
+        file_handler.setFormatter(
+            logging.Formatter(
+                "%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+            )
+        )
+        logger.addHandler(file_handler)
+
+    # Always return a logger (may have no handlers if individual logs disabled)
+    return logger, log_file
+
+
+def setup_summary_logger(gene_symbols, output_dir):
+    """Set up the ``summary`` logger that records the overall run.
+
+    The log is written to ``_<GENE1>_<GENE2>..._summary_execution.log`` in
+    ``output_dir``. Handlers from an earlier call are closed before being
+    detached so their files are not left open.
+
+    Args:
+        gene_symbols: Sequence of gene symbols; joined with ``_`` in the
+            file name
+        output_dir: Directory for the log file (``str`` or ``Path``)
+
+    Returns:
+        Tuple ``(logger, log_file)``
+    """
+    logger = logging.getLogger("summary")
+
+    # Close, then detach, any existing handlers. Clearing the list alone
+    # would leave their file streams open.
+    for stale in list(logger.handlers):
+        stale.close()
+        logger.removeHandler(stale)
+
+    logger.setLevel(logging.INFO)
+
+    # mode="w": every run overwrites the previous summary for these genes
+    genes_connected = "_".join(gene_symbols)
+    log_file = Path(output_dir) / f"_{genes_connected}_summary_execution.log"
+    file_handler = logging.FileHandler(log_file, mode="w")
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(
+        logging.Formatter(
+            "%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+        )
+    )
+    logger.addHandler(file_handler)
+
+    return logger, log_file
+
+
+def log_and_print(message, logger=None, summary_file=None, quiet=None):
+    """Send message to stdout, to logger, and to summary_file, as available.
+
+    Only the stdout copy is affected by quiet mode; the logger and the
+    summary file receive the message regardless. When ``quiet`` is None the
+    module-level ``QUIET_MODE`` flag decides.
+    """
+    suppress_stdout = QUIET_MODE if quiet is None else quiet
+
+    if not suppress_stdout:
+        print(message)
+
+    if logger:
+        logger.info(message)
+
+    if summary_file:
+        summary_file.write(message + "\n")
+        # Flush so the line reaches disk immediately
+        summary_file.flush()
+
+
+def get_gene_id(gene_symbol, gene_map):
+    """Look up the Ensembl gene ID for an exact gene symbol.
+
+    When several rows of ``gene_map`` carry the symbol, a warning is printed
+    and the first row's ID is returned.
+
+    Raises:
+        ValueError: If the symbol does not occur in ``gene_map``
+    """
+    symbol_rows = gene_map["gene_symbol"] == gene_symbol
+    candidates = gene_map.loc[symbol_rows, "gene_ens_id"]
+    n_candidates = len(candidates)
+
+    if n_candidates == 0:
+        raise ValueError(f"Gene symbol '{gene_symbol}' not found in gene mapping")
+
+    chosen = candidates.iloc[0]
+    if n_candidates > 1:
+        print(
+            f"Warning: Multiple matches found for '{gene_symbol}': {candidates.tolist()}"
+        )
+        print(f"Using first match: {chosen}")
+
+    return chosen
+
+
+def compute_correlations_for_tissue(
+    gene_symbol,
+    tissue_name,
+    expr_file_path,
+    gtex_metadata,
+    gene_map,
+    output_dir,
+    pvalue_n_perms=1000000,
+    n_jobs=1,
+    no_individual_logs=False,
+):
+    """Compute correlation between gene expression and all metadata columns for a specific tissue.
+
+    Args:
+        gene_symbol: Gene symbol to analyze
+        tissue_name: Tissue label used in messages and in the result table
+        expr_file_path: Path to the pickled expression table for the tissue
+        gtex_metadata: Metadata table whose index is intersected with the
+            expression table's columns to find common samples
+        gene_map: Gene symbol to Ensembl ID mapping table
+        output_dir: Directory for the per-tissue log file
+        pvalue_n_perms: Number of permutations passed to ``ccc``
+        n_jobs: Number of jobs passed to ``ccc``
+        no_individual_logs: If True, no per-tissue log file is written
+
+    Returns:
+        Tuple ``(results_df, gene_id)``. ``results_df`` is None when the gene
+        is absent from the tissue or no samples overlap with the metadata;
+        otherwise it has one row per metadata column (``ccc_value``,
+        ``p_value``, ``status``) plus tissue/gene/sample-count columns.
+
+    Raises:
+        ValueError: If ``gene_symbol`` is not present in ``gene_map``
+    """
+    # Set up logging for this tissue
+    logger, log_file = setup_tissue_logger(gene_symbol, tissue_name, output_dir, no_individual_logs)
+
+    try:
+        return _compute_tissue_correlations(
+            gene_symbol,
+            tissue_name,
+            expr_file_path,
+            gtex_metadata,
+            gene_map,
+            pvalue_n_perms,
+            n_jobs,
+            logger,
+            log_file,
+        )
+    finally:
+        # Release the log file on every exit path, including the early
+        # "nothing to analyze" returns and exceptions. Iterate over a copy
+        # because removeHandler() mutates logger.handlers.
+        for handler in list(logger.handlers):
+            handler.close()
+            logger.removeHandler(handler)
+
+
+def _compute_tissue_correlations(
+    gene_symbol,
+    tissue_name,
+    expr_file_path,
+    gtex_metadata,
+    gene_map,
+    pvalue_n_perms,
+    n_jobs,
+    logger,
+    log_file,
+):
+    """Do the per-tissue CCC analysis; the caller owns and closes ``logger``."""
+    log_and_print(f"\n{'='*60}", logger)
+    log_and_print(f"Processing tissue: {tissue_name}", logger)
+    log_and_print(f"File: {expr_file_path.name}", logger)
+    log_and_print(f"Log file: {log_file}", logger)
+    log_and_print(f"{'='*60}", logger)
+
+    # Load expression data
+    log_and_print("Loading expression data...", logger)
+    expr_data = pd.read_pickle(expr_file_path)
+    log_and_print(f"Expression data shape: {expr_data.shape}", logger)
+
+    # Get gene ID
+    gene_id = get_gene_id(gene_symbol, gene_map)
+    log_and_print(f"Gene ID for {gene_symbol}: {gene_id}", logger)
+
+    # Check if gene exists in this tissue
+    if gene_id not in expr_data.index:
+        log_and_print(
+            f"Warning: Gene ID '{gene_id}' not found in {tissue_name} expression data",
+            logger,
+        )
+        return None, gene_id
+
+    # Get sample IDs from expression data
+    sample_ids = expr_data.columns
+    log_and_print(f"Number of samples: {len(sample_ids)}", logger)
+
+    # Get gene expression data
+    gene_expr_row = expr_data.loc[gene_id]
+
+    # Get metadata for these samples (only for samples that exist in both datasets)
+    common_samples = sample_ids.intersection(gtex_metadata.index)
+    if len(common_samples) == 0:
+        log_and_print(
+            f"Warning: No common samples found between {tissue_name} expression data and metadata",
+            logger,
+        )
+        return None, gene_id
+
+    log_and_print(f"Common samples: {len(common_samples)}", logger)
+
+    # Filter to common samples so both vectors are aligned
+    gene_expr_filtered = gene_expr_row.loc[common_samples]
+    sample_metadata = gtex_metadata.loc[common_samples]
+
+    log_and_print(
+        f"Computing CCC between {gene_symbol} expression and all metadata columns...",
+        logger,
+    )
+    log_and_print(f"Using {pvalue_n_perms} permutations and {n_jobs} jobs", logger)
+    log_and_print(
+        f"Processing {len(sample_metadata.columns)} metadata columns...", logger
+    )
+
+    def skipped(column, status):
+        """Result row for a column that produced no coefficient."""
+        return {
+            "metadata_column": column,
+            "ccc_value": np.nan,
+            "p_value": np.nan,
+            "status": status,
+        }
+
+    results = []
+    n_columns = len(sample_metadata.columns)
+
+    # Iterate through all metadata columns
+    for i, column in enumerate(sample_metadata.columns, 1):
+        log_and_print(f"Processing column {i}/{n_columns}: {column}", logger)
+
+        try:
+            metadata_vector = sample_metadata[column]
+
+            # Skip columns with all NaN values
+            if metadata_vector.isna().all():
+                log_and_print(f" Skipping {column}: all values are NaN", logger)
+                results.append(skipped(column, "all_nan"))
+                continue
+
+            # Skip columns with only one unique value (after removing NaN)
+            unique_values = metadata_vector.dropna().nunique()
+            if unique_values <= 1:
+                log_and_print(
+                    f" Skipping {column}: only {unique_values} unique value(s)", logger
+                )
+                results.append(skipped(column, "insufficient_variation"))
+                continue
+
+            # Compute CCC (suppress numpy warnings during computation)
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", RuntimeWarning)
+                ccc_val, ccc_pval = ccc(
+                    gene_expr_filtered,
+                    metadata_vector,
+                    pvalue_n_perms=pvalue_n_perms,
+                    n_jobs=n_jobs,
+                )
+
+            results.append(
+                {
+                    "metadata_column": column,
+                    "ccc_value": ccc_val,
+                    "p_value": ccc_pval,
+                    "status": "success",
+                }
+            )
+
+            log_and_print(f" CCC: {ccc_val:.6f}, p-value: {ccc_pval:.2e}", logger)
+
+        except Exception as e:
+            # Best effort: record the failure and move on to the next column
+            log_and_print(f" Error processing {column}: {e}", logger)
+            results.append(skipped(column, f"error: {str(e)}"))
+
+    # Convert to DataFrame with metadata column names as index
+    results_df = pd.DataFrame(results)
+    results_df.set_index("metadata_column", inplace=True)
+
+    # Add tissue information
+    results_df["tissue"] = tissue_name
+    results_df["gene_symbol"] = gene_symbol
+    results_df["gene_id"] = gene_id
+    results_df["n_samples"] = len(common_samples)
+
+    # Log completion
+    successful_analyses = results_df[results_df["status"] == "success"]
+    log_and_print(f"\nCompleted processing {tissue_name}:", logger)
+    log_and_print(f" Total metadata columns: {len(results_df)}", logger)
+    log_and_print(f" Successful analyses: {len(successful_analyses)}", logger)
+    log_and_print(
+        f" Skipped/Failed: {len(results_df) - len(successful_analyses)}", logger
+    )
+
+    return results_df, gene_id
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Analyze gene expression correlations with metadata using CCC across multiple tissues",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ parser.add_argument(
+ "gene_symbols",
+ nargs="+",
+ help="Gene symbol(s) to analyze (e.g., RASSF2 TP53 BRCA1)",
+ )
+
+ parser.add_argument(
+ "--expr-data-dir",
+ default="/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue",
+ help="Directory containing expression data files",
+ )
+
+ parser.add_argument(
+ "--include",
+ nargs="*",
+ help="Include only tissues matching these patterns (fuzzy match on tissue name)",
+ )
+
+ parser.add_argument(
+ "--exclude",
+ nargs="*",
+ help="Exclude tissues matching these patterns (fuzzy match on tissue name)",
+ )
+
+ parser.add_argument(
+ "--permutations",
+ type=int,
+ # default=1000000,
+ default=100000,
+ help="Number of permutations for p-value calculation",
+ )
+
+ parser.add_argument(
+ "--n-jobs", type=int, default=4, help="Number of parallel jobs for computation"
+ )
+
+ parser.add_argument(
+ "--list-metadata-columns",
+ action="store_true",
+ help="List available metadata columns and exit",
+ )
+
+ parser.add_argument(
+ "--list-tissues",
+ action="store_true",
+ help="List available tissue files and exit",
+ )
+
+ parser.add_argument(
+ "--output-dir",
+ default=".",
+ help="Directory to save output files (default: current directory)",
+ )
+
+ parser.add_argument(
+ "--quiet",
+ action="store_true",
+ help="Reduce output verbosity for batch processing",
+ )
+
+ parser.add_argument(
+ "--no-csv-output",
+ action="store_true",
+ help="Skip CSV file generation (only create pickle files)",
+ )
+
+ parser.add_argument(
+ "--no-individual-logs",
+ action="store_true",
+ help="Skip individual tissue log files (only keep summary logs)",
+ )
+
+ parser.add_argument(
+ "--data-dir",
+ default="/mnt/data/proj_data/ccc-gpu/data/tutorial",
+ help="Directory containing GTEx data files (metadata and gene mappings)",
+ )
+
+ args = parser.parse_args()
+
+ # Set global quiet mode
+ global QUIET_MODE
+ QUIET_MODE = args.quiet
+
+ try:
+ # Create output directory if it doesn't exist
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Set up summary logger
+ summary_logger, summary_log_file = setup_summary_logger(
+ args.gene_symbols, output_dir
+ )
+
+ # Set up summary tables file
+ genes_connected = "_".join(args.gene_symbols)
+ summary_tables_file_path = output_dir / f"_{genes_connected}_summary_tables.log"
+ summary_tables_file = open(summary_tables_file_path, "w")
+
+ log_and_print(f"Output directory: {output_dir.absolute()}", summary_logger)
+ log_and_print(f"Summary log file: {summary_log_file}", summary_logger)
+ log_and_print(
+ f"Summary tables file: {summary_tables_file_path}", summary_logger
+ )
+ log_and_print(
+ f"Gene symbols to analyze: {', '.join(args.gene_symbols)}", summary_logger
+ )
+
+ # Find expression files
+ expression_files = find_expression_files(
+ args.expr_data_dir,
+ include_patterns=args.include,
+ exclude_patterns=args.exclude,
+ quiet=args.quiet,
+ )
+
+ # If user wants to list tissues
+ if args.list_tissues:
+ log_and_print(
+ f"Available expression files in {args.expr_data_dir}:", summary_logger
+ )
+ for file_path, tissue_name in expression_files:
+ log_and_print(f" {tissue_name}: {file_path.name}", summary_logger)
+ summary_tables_file.close()
+ return
+
+ # Load metadata and gene mapping
+ gtex_metadata, gene_map = load_metadata_and_gene_map(args.data_dir, quiet=args.quiet)
+
+ # If user wants to list metadata columns
+ if args.list_metadata_columns:
+ log_and_print("Available metadata columns:", summary_logger)
+ for col in sorted(gtex_metadata.columns):
+ log_and_print(f" {col}", summary_logger)
+ summary_tables_file.close()
+ return
+
+ # Process each gene symbol
+ all_genes_results = {}
+ total_start_time = time.time()
+
+ for gene_idx, gene_symbol in enumerate(args.gene_symbols, 1):
+ log_and_print(f"\n{'='*100}", summary_logger)
+ log_and_print(
+ f"PROCESSING GENE {gene_idx}/{len(args.gene_symbols)}: {gene_symbol}",
+ summary_logger,
+ )
+ log_and_print(f"{'='*100}", summary_logger)
+
+ # Process each tissue for this gene
+ all_results = {}
+ gene_id = None
+ tissue_runtimes = {}
+ gene_start_time = time.time()
+
+ for i, (expr_file_path, tissue_name) in enumerate(expression_files, 1):
+ log_and_print(
+ f"\n[{i}/{len(expression_files)}] Starting processing for {gene_symbol} in {tissue_name}...",
+ summary_logger,
+ )
+ tissue_start_time = time.time()
+
+ try:
+ results_df, current_gene_id = compute_correlations_for_tissue(
+ gene_symbol,
+ tissue_name,
+ expr_file_path,
+ gtex_metadata,
+ gene_map,
+ output_dir,
+ args.permutations,
+ args.n_jobs,
+ args.no_individual_logs,
+ )
+
+ tissue_end_time = time.time()
+ tissue_runtime = tissue_end_time - tissue_start_time
+ tissue_runtimes[tissue_name] = tissue_runtime
+
+ if results_df is not None:
+ all_results[tissue_name] = results_df
+ gene_id = current_gene_id
+
+ # Save individual tissue results
+ output_file = (
+ output_dir
+ / f"{gene_symbol}_{tissue_name}_correlation_results.pkl"
+ )
+ log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+ results_df.to_pickle(output_file)
+ log_and_print(
+ f"Results for {gene_symbol} in {tissue_name} saved to: {output_file}",
+ summary_logger,
+ )
+ if not args.no_individual_logs and log_file:
+ log_and_print(
+ f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}",
+ summary_logger,
+ )
+ log_and_print(
+ f"Runtime for {gene_symbol} in {tissue_name}: {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)",
+ summary_logger,
+ )
+ else:
+ log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+ log_and_print(
+ f"No results generated for {gene_symbol} in {tissue_name}",
+ summary_logger,
+ )
+ if not args.no_individual_logs and log_file:
+ log_and_print(
+ f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}",
+ summary_logger,
+ )
+ log_and_print(
+ f"Runtime for {gene_symbol} in {tissue_name}: {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)",
+ summary_logger,
+ )
+
+ except Exception as e:
+ tissue_end_time = time.time()
+ tissue_runtime = tissue_end_time - tissue_start_time
+ tissue_runtimes[tissue_name] = tissue_runtime
+ log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+ log_and_print(
+ f"Error processing {gene_symbol} in {tissue_name}: {e}",
+ summary_logger,
+ )
+ if not args.no_individual_logs and log_file:
+ log_and_print(
+ f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}",
+ summary_logger,
+ )
+ log_and_print(
+ f"Runtime for {gene_symbol} in {tissue_name} (failed): {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)",
+ summary_logger,
+ )
+ continue
+
+ # Gene-level summary
+ gene_end_time = time.time()
+ gene_runtime = gene_end_time - gene_start_time
+
+ if not all_results:
+ log_and_print(
+ f"No successful analyses completed for {gene_symbol}.",
+ summary_logger,
+ )
+ log_and_print(
+ f"Runtime for {gene_symbol}: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)",
+ summary_logger,
+ )
+ continue
+
+ # Store results for this gene
+ all_genes_results[gene_symbol] = {
+ "results": all_results,
+ "gene_id": gene_id,
+ "tissue_runtimes": tissue_runtimes,
+ "gene_runtime": gene_runtime,
+ }
+
+ # Save combined results for this gene
+ combined_results = pd.concat(all_results.values(), ignore_index=False)
+ combined_output_file = (
+ output_dir / f"{gene_symbol}_all_tissues_correlation_results.pkl"
+ )
+ combined_results.to_pickle(combined_output_file)
+ if not args.no_csv_output:
+ combined_csv_file = (
+ output_dir / f"{gene_symbol}_all_tissues_correlation_results.csv"
+ )
+ combined_results.to_csv(combined_csv_file)
+
+ # Gene-specific summary
+ log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+ log_and_print(
+ "COMBINED RESULTS SUMMARY", summary_logger, summary_tables_file
+ )
+ log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+ log_and_print(
+ f"Gene Symbol: {gene_symbol}", summary_logger, summary_tables_file
+ )
+ log_and_print(f"Gene ID: {gene_id}", summary_logger, summary_tables_file)
+ log_and_print(
+ f"Permutations: {args.permutations:,}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Tissues processed: {len(all_results)}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Combined results saved to: {combined_output_file}",
+ summary_logger,
+ summary_tables_file,
+ )
+ if not args.no_csv_output:
+ log_and_print(
+ f"Combined results (CSV) saved to: {combined_csv_file}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Show summary statistics for this gene
+ successful_analyses = combined_results[
+ combined_results["status"] == "success"
+ ]
+ if len(successful_analyses) > 0:
+ log_and_print(
+ f"\nTotal successful analyses across all tissues: {len(successful_analyses)}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+ log_and_print(
+ "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+
+ # Sort by absolute CCC value (descending) - simplified approach
+ successful_analyses_copy = successful_analyses.copy()
+ successful_analyses_copy["abs_ccc"] = successful_analyses_copy[
+ "ccc_value"
+ ].abs()
+ top_results = successful_analyses_copy.sort_values(
+ "abs_ccc", ascending=False
+ )
+
+ # Display top results
+ log_and_print(
+ f"{'Tissue':<20} {'Metadata Column':<25} {'CCC Value':<12} {'P-value':<12} {'Significance':<15}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print("-" * 90, summary_logger, summary_tables_file)
+
+ for idx, row in top_results.head(20).iterrows():
+ tissue = row["tissue"]
+ ccc_val = row["ccc_value"]
+ p_val = row["p_value"]
+
+ # Determine significance
+ if p_val < 0.001:
+ significance = "***"
+ elif p_val < 0.01:
+ significance = "**"
+ elif p_val < 0.05:
+ significance = "*"
+ else:
+ significance = "ns"
+
+ log_and_print(
+ f"{tissue:<20} {idx:<25} {ccc_val:>10.6f} {p_val:>10.2e} {significance:<15}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Summary by tissue for this gene
+ log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+ log_and_print("SUMMARY BY TISSUE", summary_logger, summary_tables_file)
+ log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+
+ log_and_print(
+ f"{'Tissue':<20} {'N Samples':<10} {'Successful':<12} {'Mean |CCC|':<12} {'Max |CCC|':<12}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print("-" * 70, summary_logger, summary_tables_file)
+
+ for tissue_name in sorted(all_results.keys()):
+ tissue_results = all_results[tissue_name]
+ tissue_successful = tissue_results[
+ tissue_results["status"] == "success"
+ ]
+ n_samples = (
+ tissue_results["n_samples"].iloc[0]
+ if len(tissue_results) > 0
+ else 0
+ )
+
+ if len(tissue_successful) > 0:
+ mean_ccc = tissue_successful["ccc_value"].abs().mean()
+ max_ccc = tissue_successful["ccc_value"].abs().max()
+ log_and_print(
+ f"{tissue_name:<20} {n_samples:<10} {len(tissue_successful):<12} {mean_ccc:<12.6f} {max_ccc:<12.6f}",
+ summary_logger,
+ summary_tables_file,
+ )
+ else:
+ log_and_print(
+ f"{tissue_name:<20} {n_samples:<10} {'0':<12} {'N/A':<12} {'N/A':<12}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Runtime summary for this gene
+ log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file)
+ log_and_print("RUNTIME SUMMARY", summary_logger, summary_tables_file)
+ log_and_print(f"{'='*80}", summary_logger, summary_tables_file)
+ log_and_print(
+ f"Total runtime: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Average runtime per tissue: {gene_runtime/len(expression_files):.2f} seconds",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ log_and_print("\nRuntime by tissue:", summary_logger, summary_tables_file)
+ log_and_print(
+ f"{'Tissue':<25} {'Runtime (sec)':<15} {'Runtime (min)':<15} {'Status':<10}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print("-" * 70, summary_logger, summary_tables_file)
+
+ for tissue_name in sorted(tissue_runtimes.keys()):
+ runtime = tissue_runtimes[tissue_name]
+ status = "Success" if tissue_name in all_results else "Failed"
+ log_and_print(
+ f"{tissue_name:<25} {runtime:<15.2f} {runtime/60:<15.2f} {status:<10}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ if tissue_runtimes:
+ # Find fastest and slowest tissues
+ fastest_tissue = min(tissue_runtimes.items(), key=lambda x: x[1])
+ slowest_tissue = max(tissue_runtimes.items(), key=lambda x: x[1])
+
+ log_and_print(
+ f"\nFastest: {fastest_tissue[0]} ({fastest_tissue[1]:.2f} seconds)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Slowest: {slowest_tissue[0]} ({slowest_tissue[1]:.2f} seconds)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Speed ratio: {slowest_tissue[1]/fastest_tissue[1]:.1f}x",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ log_and_print(
+ f"Runtime for {gene_symbol}: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)",
+ summary_logger,
+ )
+
+ total_end_time = time.time()
+ total_runtime = total_end_time - total_start_time
+
+ if not all_genes_results:
+ log_and_print(
+ "No successful analyses completed for any gene.", summary_logger
+ )
+ summary_tables_file.close()
+ return
+
+ # Create overall summary
+ log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+ log_and_print("OVERALL RESULTS SUMMARY", summary_logger, summary_tables_file)
+ log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+ log_and_print(
+ f"Gene symbols processed: {', '.join(all_genes_results.keys())}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Total genes: {len(all_genes_results)}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Permutations: {args.permutations:,}", summary_logger, summary_tables_file
+ )
+ log_and_print(
+ f"Tissues per gene: {len(expression_files)}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Combine all results across genes
+ all_combined_results = []
+ for gene_symbol, gene_data in all_genes_results.items():
+ gene_combined = pd.concat(gene_data["results"].values(), ignore_index=False)
+ all_combined_results.append(gene_combined)
+
+ mega_combined_results = pd.concat(all_combined_results, ignore_index=False)
+
+ # Save mega combined results
+ mega_output_file = output_dir / "_all_genes_all_tissues_correlation_results.pkl"
+ mega_combined_results.to_pickle(mega_output_file)
+ log_and_print(
+ f"All genes combined results saved to: {mega_output_file}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Also save as CSV for easy viewing (if not disabled)
+ if not args.no_csv_output:
+ mega_csv_file = output_dir / "_all_genes_all_tissues_correlation_results.csv"
+ mega_combined_results.to_csv(mega_csv_file)
+ log_and_print(
+ f"All genes combined results (CSV) saved to: {mega_csv_file}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # List all log files created (if individual logs are enabled)
+ if not args.no_individual_logs:
+ log_and_print("\nLog files created:", summary_logger)
+ for gene_symbol in all_genes_results.keys():
+ for tissue_name in [name for _, name in expression_files]:
+ log_file = output_dir / f"{gene_symbol}_{tissue_name}.log"
+ if log_file.exists():
+ log_and_print(
+ f" {gene_symbol} - {tissue_name}: {log_file}", summary_logger
+ )
+
+ # Show summary statistics across all genes and tissues
+ successful_analyses = mega_combined_results[
+ mega_combined_results["status"] == "success"
+ ]
+ if len(successful_analyses) > 0:
+ log_and_print(
+ f"\nTotal successful analyses across all genes and tissues: {len(successful_analyses)}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+ log_and_print(
+ "TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+
+ # Sort by absolute CCC value (descending) - simplified approach
+ successful_analyses_copy = successful_analyses.copy()
+ successful_analyses_copy["abs_ccc"] = successful_analyses_copy[
+ "ccc_value"
+ ].abs()
+ top_results = successful_analyses_copy.sort_values(
+ "abs_ccc", ascending=False
+ )
+
+ # Display top results
+ log_and_print(
+ f"{'Gene':<12} {'Tissue':<20} {'Metadata Column':<25} {'CCC Value':<12} {'P-value':<12} {'Significance':<15}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print("-" * 110, summary_logger, summary_tables_file)
+
+ for idx, row in top_results.head(30).iterrows():
+ gene = row["gene_symbol"]
+ tissue = row["tissue"]
+ ccc_val = row["ccc_value"]
+ p_val = row["p_value"]
+
+ # Determine significance
+ if p_val < 0.001:
+ significance = "***"
+ elif p_val < 0.01:
+ significance = "**"
+ elif p_val < 0.05:
+ significance = "*"
+ else:
+ significance = "ns"
+
+ log_and_print(
+ f"{gene:<12} {tissue:<20} {idx:<25} {ccc_val:>10.6f} {p_val:>10.2e} {significance:<15}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Summary by gene
+ log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+ log_and_print("SUMMARY BY GENE", summary_logger, summary_tables_file)
+ log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+
+ for gene_symbol, gene_data in all_genes_results.items():
+ gene_combined = pd.concat(
+ gene_data["results"].values(), ignore_index=False
+ )
+ gene_successful = gene_combined[gene_combined["status"] == "success"]
+
+ log_and_print(
+ f"\nGene: {gene_symbol} (ID: {gene_data['gene_id']})",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f" Tissues processed: {len(gene_data['results'])}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f" Successful analyses: {len(gene_successful)}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ if len(gene_successful) > 0:
+ mean_ccc = gene_successful["ccc_value"].abs().mean()
+ max_ccc = gene_successful["ccc_value"].abs().max()
+ log_and_print(
+ f" Mean |CCC|: {mean_ccc:.6f}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f" Max |CCC|: {max_ccc:.6f}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Top correlation for this gene
+ gene_successful_copy = gene_successful.copy()
+ gene_successful_copy["abs_ccc"] = gene_successful_copy[
+ "ccc_value"
+ ].abs()
+ top_corr = gene_successful_copy.sort_values(
+ "abs_ccc", ascending=False
+ ).iloc[0]
+ log_and_print(
+ f" Top correlation: {top_corr.name} in {top_corr['tissue']} (CCC: {top_corr['ccc_value']:.6f}, p: {top_corr['p_value']:.2e})",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ log_and_print(
+ f" Runtime: {gene_data['gene_runtime']:.2f} seconds ({gene_data['gene_runtime']/60:.2f} minutes)",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Summary by tissue across all genes
+ log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+ log_and_print(
+ "SUMMARY BY TISSUE (across all genes)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+
+ tissue_summary = {}
+ for gene_symbol, gene_data in all_genes_results.items():
+ for tissue_name, tissue_results in gene_data["results"].items():
+ if tissue_name not in tissue_summary:
+ tissue_summary[tissue_name] = []
+ tissue_summary[tissue_name].append(tissue_results)
+
+ log_and_print(
+ f"{'Tissue':<25} {'N Genes':<10} {'Successful':<12} {'Mean |CCC|':<12} {'Max |CCC|':<12}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print("-" * 75, summary_logger, summary_tables_file)
+
+ for tissue_name in sorted(tissue_summary.keys()):
+ tissue_all_genes = pd.concat(
+ tissue_summary[tissue_name], ignore_index=False
+ )
+ tissue_successful = tissue_all_genes[
+ tissue_all_genes["status"] == "success"
+ ]
+
+ if len(tissue_successful) > 0:
+ mean_ccc = tissue_successful["ccc_value"].abs().mean()
+ max_ccc = tissue_successful["ccc_value"].abs().max()
+ log_and_print(
+ f"{tissue_name:<25} {len(tissue_summary[tissue_name]):<10} {len(tissue_successful):<12} {mean_ccc:<12.6f} {max_ccc:<12.6f}",
+ summary_logger,
+ summary_tables_file,
+ )
+ else:
+ log_and_print(
+ f"{tissue_name:<25} {len(tissue_summary[tissue_name]):<10} {'0':<12} {'N/A':<12} {'N/A':<12}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Runtime summary
+ log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file)
+ log_and_print("RUNTIME SUMMARY", summary_logger, summary_tables_file)
+ log_and_print(f"{'='*100}", summary_logger, summary_tables_file)
+ log_and_print(
+ f"Total runtime: {total_runtime:.2f} seconds ({total_runtime/60:.2f} minutes)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Average runtime per gene: {total_runtime/len(args.gene_symbols):.2f} seconds",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Total gene-tissue combinations: {len(args.gene_symbols) * len(expression_files)}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Runtime by gene
+ log_and_print("\nRuntime by gene:", summary_logger, summary_tables_file)
+ log_and_print(
+ f"{'Gene':<15} {'Runtime (sec)':<15} {'Runtime (min)':<15} {'Tissues':<10} {'Successful':<12}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print("-" * 75, summary_logger, summary_tables_file)
+
+ for gene_symbol, gene_data in all_genes_results.items():
+ successful_tissues = len(gene_data["results"])
+ log_and_print(
+ f"{gene_symbol:<15} {gene_data['gene_runtime']:<15.2f} {gene_data['gene_runtime']/60:<15.2f} {len(expression_files):<10} {successful_tissues:<12}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Aggregate tissue runtime statistics across all genes
+ all_tissue_runtimes = {}
+ for gene_symbol, gene_data in all_genes_results.items():
+ for tissue_name, runtime in gene_data["tissue_runtimes"].items():
+ if tissue_name not in all_tissue_runtimes:
+ all_tissue_runtimes[tissue_name] = []
+ all_tissue_runtimes[tissue_name].append(runtime)
+
+ if all_tissue_runtimes:
+ log_and_print(
+ "\nAverage runtime by tissue (across all genes):",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"{'Tissue':<25} {'Avg Runtime (sec)':<18} {'Avg Runtime (min)':<18} {'N Runs':<8} {'Min':<10} {'Max':<10}",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print("-" * 95, summary_logger, summary_tables_file)
+
+ tissue_avg_runtimes = []
+ for tissue_name in sorted(all_tissue_runtimes.keys()):
+ runtimes = all_tissue_runtimes[tissue_name]
+ avg_runtime = np.mean(runtimes)
+ min_runtime = np.min(runtimes)
+ max_runtime = np.max(runtimes)
+ tissue_avg_runtimes.append((tissue_name, avg_runtime))
+
+ log_and_print(
+ f"{tissue_name:<25} {avg_runtime:<18.2f} {avg_runtime/60:<18.2f} {len(runtimes):<8} {min_runtime:<10.2f} {max_runtime:<10.2f}",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Find fastest and slowest tissues (by average)
+ tissue_avg_runtimes.sort(key=lambda x: x[1])
+ fastest_tissue = tissue_avg_runtimes[0]
+ slowest_tissue = tissue_avg_runtimes[-1]
+
+ log_and_print(
+ f"\nFastest tissue (avg): {fastest_tissue[0]} ({fastest_tissue[1]:.2f} seconds)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Slowest tissue (avg): {slowest_tissue[0]} ({slowest_tissue[1]:.2f} seconds)",
+ summary_logger,
+ summary_tables_file,
+ )
+ log_and_print(
+ f"Speed ratio: {slowest_tissue[1]/fastest_tissue[1]:.1f}x",
+ summary_logger,
+ summary_tables_file,
+ )
+
+ # Final message about summary log
+ log_and_print(f"\nSummary log saved to: {summary_log_file}", summary_logger)
+ log_and_print(
+ f"Summary tables saved to: {summary_tables_file_path}", summary_logger
+ )
+
+ # Close the summary tables file
+ summary_tables_file.close()
+
+ # Close the summary logger
+ for handler in summary_logger.handlers:
+ handler.close()
+ summary_logger.removeHandler(handler)
+
+ except Exception as e:
+ print(f"Error: {e}", file=sys.stderr)
+ # Try to close the summary tables file if it was opened
+ try:
+ summary_tables_file.close()
+ except:
+ pass
+ sys.exit(1)
+
+
+if __name__ == "__main__":  # run the command-line entry point only when executed as a script, not on import
+ main()