diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml deleted file mode 100644 index 8c462bbd..00000000 --- a/.github/workflows/lint.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: lint -on: - push: - pull_request: - types: [opened, reopened] -jobs: - run-linters: - name: Run linters - runs-on: ubuntu-latest - - steps: - - name: Check out Git repository - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.9 - - - name: Install Python dependencies - run: pip install black flake8 - - - name: Run linters - uses: wearerequired/lint-action@v1 - with: - github_token: ${{ secrets.github_token }} - # Enable linters - black: true - flake8: true - # Mark the following line true if you want linters to attempt to - # autocorrect your code - auto_fix: true - git_name: "Greene Lab Linter" - git_email: "miltondp@gmail.com" - commit_message: "fix code style issues with ${linter}" - diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml deleted file mode 100644 index 5abc2841..00000000 --- a/.github/workflows/pytest.yaml +++ /dev/null @@ -1,129 +0,0 @@ -name: tests -on: - push: - pull_request: - types: [opened, reopened] - -env: - # Increase this value to reset cache if environment.yml has not changed. 
- PY_CACHE_NUMBER: 2 - PY_ENV: ccc_gene_expr - -jobs: - ccc_pytest: - name: Python tests for CCC - runs-on: ${{ matrix.os }} - strategy: - max-parallel: 4 - fail-fast: false - matrix: - python-version: ["3.10", "3.11"] - os: [ubuntu-latest, macOS-latest, windows-latest] - steps: - - name: Checkout git repo - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest "numpy<2.0" scipy numba pandas scikit-learn - - name: Test CCC with pytest - env: - PYTHONPATH: libs/ - run: | - pytest tests/test_coef.py tests/test_pytorch_core.py tests/test_scipy_stats.py tests/test_sklearn_metrics.py - -# pytest: -# name: Python tests for analyses -# runs-on: ${{ matrix.os }} -# strategy: -# max-parallel: 4 -# fail-fast: false -# matrix: -# python-version: ["3.9"] -# os: [ubuntu-latest, macOS-latest, windows-latest] -# steps: -# - name: Checkout git repo -# uses: actions/checkout@v3 -# - name: Cache conda -# id: cache -# uses: actions/cache@v3 -# with: -# path: "${{ env.PY_ENV }}.tar.gz" -# key: ${{ runner.os }}-${{ env.PY_CACHE_NUMBER }}-${{ hashFiles('environment/environment.yml', 'environment/scripts/install_r_packages.r', 'environment/scripts/install_other_packages.sh') }} -# - name: Setup Miniconda -# if: steps.cache.outputs.cache-hit != 'true' -# uses: conda-incubator/setup-miniconda@v2 -# with: -# activate-environment: ${{ env.PY_ENV }} -# environment-file: environment/environment.yml -# auto-activate-base: false -# miniforge-variant: Mambaforge -# miniforge-version: 'latest' -# use-mamba: true -# - name: Install other packages and Conda-Pack environment -# if: steps.cache.outputs.cache-hit != 'true' -# shell: bash -l {0} -# run: | -# # other packages (R packages mainly) -# bash environment/scripts/install_other_packages.sh -# -# # install conda-pack, and pack environment 
-# conda install --yes -c conda-forge conda-pack coverage -# conda pack -f -n ${{ env.PY_ENV }} -o "${{ env.PY_ENV }}.tar.gz" -# - name: Unpack environment -# shell: bash -l {0} -# run: | -# mkdir -p "${{ env.PY_ENV }}" -# tar -xzf "${{ env.PY_ENV }}.tar.gz" -C "${{ env.PY_ENV }}" -# - name: Setup data and run pytest (Windows systems) -# if: runner.os == 'Windows' -# env: -# PYTHONPATH: libs/ -# shell: cmd -# run: | -# echo on -# cd ${{ env.PY_ENV }} -# call .\Scripts\activate.bat -# .\Scripts\conda-unpack.exe -# cd .. -# set R_HOME=%CONDA_PREFIX%\Lib\R -# python environment\scripts\setup_data.py --mode testing -# pytest -v -rs tests -# - name: Setup data and run pytest (non-Windows systems) -# if: runner.os != 'Windows' -# shell: bash -# env: -# PYTHONPATH: libs/ -# run: | -# source ${{ env.PY_ENV }}/bin/activate -# conda-unpack -# -# python environment/scripts/setup_data.py --mode testing -# -# if [ "$RUNNER_OS" == "Linux" ]; then -# # for linux/ubuntu, run the tests once: with numba jit activated -# # (which is the expected implementation) and with the jit -# # deactivated (otherwise coverage does not work). -# -# # numba jit activated -# pytest -v -rs tests -# -# # numba jit deactivated + code coverage -# export NUMBA_DISABLE_JIT=1 -# coverage run --source=libs/ -m pytest -v -rs tests -# coverage xml -o coverage.xml -# else -# pytest -v -rs tests -# fi -# - name: Codecov upload -# if: runner.os == 'Linux' -# uses: codecov/codecov-action@v2 -# with: -# files: ./coverage.xml -# name: codecov-${{ matrix.os }}-python${{ matrix.python-version }} -# fail_ci_if_error: true -# verbose: true diff --git a/README.md b/README.md index 14285186..c5a12bb7 100644 --- a/README.md +++ b/README.md @@ -73,17 +73,13 @@ cd ccc-gpu #### 2. 
Setup Environment with conda-lock -This process uses a temporary environment to manage the conda-lock installation, keeping your base environment clean: +This process uses [pipx](https://pipx.pypa.io/stable/) to install conda-lock in an isolated environment, keeping your base environment clean: > **Why conda-lock?** We use conda-lock to ensure **reproducible installations** across different systems. Unlike regular `environment.yml` files, conda-lock provides exact version pins for all packages and their dependencies, preventing version conflicts and ensuring you get the same environment that was tested during development. ```bash -# Create temporary environment for conda-lock -conda create -n ccc-gpu-setup python=3.10 -y # or: mamba create -n ccc-gpu-setup python=3.10 -y -conda activate ccc-gpu-setup - -# Install conda-lock in temporary environment -conda install --channel=conda-forge conda-lock -y # or: mamba install --channel=conda-forge conda-lock -y +# Install conda-lock using pipx (installs in isolated environment) +pipx install conda-lock # Create the main ccc-gpu environment from lock file conda-lock install --name ccc-gpu conda-lock.yml # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba @@ -95,32 +91,6 @@ conda activate ccc-gpu pip install . ``` -#### 3. 
Optional: Clean up temporary environment - -Once installation is complete, you can optionally remove the temporary setup environment: - -```bash -# Remove temporary environment (optional) -conda deactivate # Make sure you're not in ccc-gpu-setup -conda remove -n ccc-gpu-setup --all -y # or: mamba remove -n ccc-gpu-setup --all -y -``` - -#### Alternative: Install conda-lock in base environment - -If you prefer to install conda-lock directly in your base environment: - -```bash -# Option 1: Using pip -pip install conda-lock - -# Option 2: Using conda -conda install --channel=conda-forge conda-lock -y # or: mamba install --channel=conda-forge conda-lock -y - -# Then create environment directly -conda-lock install --name ccc-gpu conda-lock.yml # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba -conda activate ccc-gpu -pip install . -``` > **Note**: If you prefer to use Mamba for faster package resolution, you can install MiniForge which includes Mamba: > ```bash @@ -139,6 +109,10 @@ bash ./scripts/run_tests.sh python ``` ## Usage +### End-to-End Tutorial + +This [notebook](nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb) walks through a simplified version of the analysis steps used in our paper, using the GTEx v8 data. + ### Basic Usage @@ -161,35 +135,16 @@ correlation = ccc(x, y) print(f"CCC coefficient: {correlation:.3f}") ``` -### Controlling Debug Logging - -By default, CCC-GPU runs silently without debug output. 
You can enable detailed logging (including CUDA device information, memory usage, and processing details) using the `CCC_GPU_LOGGING` environment variable: - -```bash -# Run with default behavior (no debug output) -python your_script.py - -# Enable debug logging for troubleshooting -CCC_GPU_LOGGING=1 python your_script.py - -# Or set it for the session -export CCC_GPU_LOGGING=1 -python your_script.py -``` - -This is particularly useful for: -- Debugging GPU memory issues -- Understanding CUDA device utilization -- Monitoring batch processing performance -- Troubleshooting installation problems - ### Working with Gene Expression Data CCC-GPU is particularly useful for genomics applications: ```python import pandas as pd -from ccc.coef import ccc +# New CCC-GPU implementation import +from ccc.coef.impl_gpu import ccc +# Original CCC implementation import +# from ccc.coef.impl import ccc # Load gene expression data # Assume genes are in columns, samples in rows @@ -217,6 +172,28 @@ for i, j in zip(top_indices[0], top_indices[1]): Refer to the original CCC Repository for more usage examples: [https://github.com/greenelab/ccc](https://github.com/greenelab/ccc) +### Controlling Debug Logging + +By default, CCC-GPU runs silently without debug output. 
You can enable detailed logging (including CUDA device information, memory usage, and processing details) using the `CCC_GPU_LOGGING` environment variable: + +```bash +# Run with default behavior (no debug output) +python your_script.py + +# Enable debug logging for troubleshooting +CCC_GPU_LOGGING=1 python your_script.py + +# Or set it for the session +export CCC_GPU_LOGGING=1 +python your_script.py +``` + +This is particularly useful for: +- Debugging GPU memory issues +- Understanding CUDA device utilization +- Monitoring batch processing performance +- Troubleshooting installation problems + ## Performance Benchmarks CCC-GPU provides significant performance improvements over CPU-only implementations: diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 68596b54..70b1065c 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -65,19 +65,15 @@ Install from source using the provided conda-lock environment: 2. Setup Environment with conda-lock ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This process uses a temporary environment to manage the conda-lock installation, keeping your base environment clean: +This process uses pipx to install conda-lock in an isolated environment, keeping your base environment clean: .. note:: **Why conda-lock?** We use conda-lock to ensure **reproducible installations** across different systems. Unlike regular ``environment.yml`` files, conda-lock provides exact version pins for all packages and their dependencies, preventing version conflicts and ensuring you get the same environment that was tested during development. .. 
code-block:: bash - # Create temporary environment for conda-lock - conda create -n ccc-gpu-setup python=3.10 -y # or: mamba create -n ccc-gpu-setup python=3.10 -y - conda activate ccc-gpu-setup - - # Install conda-lock in temporary environment - conda install --channel=conda-forge conda-lock -y # or: mamba install --channel=conda-forge conda-lock -y + # Install conda-lock using pipx (installs in isolated environment) + pipx install conda-lock # Create the main ccc-gpu environment from lock file conda-lock install --name ccc-gpu conda-lock.yml # or: conda-lock install --name ccc-gpu conda-lock.yml --conda mamba @@ -88,21 +84,23 @@ This process uses a temporary environment to manage the conda-lock installation, # Install the package from source pip install . -3. Optional: Clean up temporary environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + If you don't have pipx installed, you can install it with ``pip install pipx`` or follow the `pipx installation guide <https://pipx.pypa.io/stable/installation/>`_. + +3. Optional: Remove conda-lock +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once installation is complete, you can optionally remove the temporary setup environment: +If you no longer need conda-lock after installation, you can remove it: .. code-block:: bash - # Remove temporary environment (optional) - conda deactivate # Make sure you're not in ccc-gpu-setup - conda remove -n ccc-gpu-setup --all -y # or: mamba remove -n ccc-gpu-setup --all -y + # Remove conda-lock (optional) + pipx uninstall conda-lock Alternative: Install conda-lock in base environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you prefer to install conda-lock directly in your base environment: +If you prefer to install conda-lock directly in your base environment instead of using pipx: .. 
code-block:: bash diff --git a/nbs/03-manuscript/40_prepare_supp_data/README.md b/nbs/03-manuscript/40_prepare_supp_data/README.md index f972dcbe..6c733d8f 100644 --- a/nbs/03-manuscript/40_prepare_supp_data/README.md +++ b/nbs/03-manuscript/40_prepare_supp_data/README.md @@ -1,82 +1,188 @@ -# CCC Data Processing Script +# CCC Data Processing Scripts -This directory contains a script to process GTEx similarity matrices (.pkl files) and extract only the CCC (Clustered Correlation Coefficient) data. +This directory contains scripts to process GTEx similarity matrices (.pkl files) and extract CCC (Clustermatch Correlation Coefficient) data in optimized formats for efficient storage and fast queries. -## Script: `process_ccc_data.py` +## Available Scripts -### Description -Processes all .pkl files in the source directory, extracts only the 'ccc' column with multi-indices, and saves individual .parquet files with snappy compression for each input. This significantly reduces file sizes compared to .pkl format. +### 1. `process_ccc_to_duckdb.py` (Recommended) -### Usage +**Description:** Converts pickle files to DuckDB format for ultra-fast queries and efficient storage. 
+ +#### Key Features +- **Sub-millisecond query performance** for individual gene pairs +- **4-5x better compression** than parquet format +- **Minimal memory usage** (queries use <1GB RAM vs 13GB+ for parquet) +- **SQL query capabilities** for complex analyses +- **Support for both individual and consolidated databases** + +#### Usage -#### Dry Run (recommended first step) ```bash # Activate the conda environment conda activate ccc-gpu -# Run dry run to see what files would be processed -python process_ccc_data.py --dry-run +# Install DuckDB (if not already installed) +pip install duckdb + +# Process all tissues into individual databases +python process_ccc_to_duckdb.py \ + --source-dir /mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all \ + --output-dir /mnt/data/proj_data/ccc-gpu/manuscript_data/supplementary_data/ccc_duckdb + +# Create a single consolidated database (all tissues) +python process_ccc_to_duckdb.py --single-db + +# Process specific tissues only +python process_ccc_to_duckdb.py --tissues bladder brain_cortex + +# Dry run to see what would be processed +python process_ccc_to_duckdb.py --dry-run ``` -#### Full Processing +#### Arguments +- `--source-dir`: Source directory with .pkl files +- `--output-dir`: Output directory for DuckDB files +- `--single-db`: Create one consolidated database instead of individual ones +- `--tissues`: Process specific tissues only +- `--dry-run`: Show what would be processed without doing it +- `--debug`: Enable debug logging + +### 2. `ccc_duckdb_query.py` - Query Interface + +**Description:** Python wrapper for fast queries on DuckDB databases. 
+ +#### Usage as Module + +```python +from ccc_duckdb_query import CCCDatabase + +# Open database +db = CCCDatabase("/path/to/bladder_ccc.duckdb") + +# Query single gene pair +ccc = db.get_correlation("ENSG00000141510.16", "ENSG00000133703.11") + +# Get all correlations for a gene +correlations = db.get_gene_correlations("ENSG00000141510.16", min_ccc=0.5) + +# Get top correlations +top_pairs = db.get_top_correlations(threshold=0.9, limit=100) + +# Batch query multiple pairs +pairs = [("gene1", "gene2"), ("gene3", "gene4")] +results = db.get_batch_correlations(pairs) + +# Custom SQL query +df = db.query("SELECT * FROM ccc_data WHERE ccc > 0.95 LIMIT 10") + +# Get database statistics +stats = db.get_statistics() + +db.close() +``` + +#### Usage as CLI + +```bash +# Get database statistics +python ccc_duckdb_query.py /path/to/database.duckdb --stats + +# Query specific gene pair +python ccc_duckdb_query.py /path/to/database.duckdb \ + --gene1 ENSG00000141510.16 --gene2 ENSG00000133703.11 + +# Get correlations for a gene +python ccc_duckdb_query.py /path/to/database.duckdb \ + --gene ENSG00000141510.16 --limit 50 + +# Get top correlations +python ccc_duckdb_query.py /path/to/database.duckdb \ + --top 0.9 --limit 100 +``` + +### 3. `process_ccc_data.py` (Legacy - Parquet Output) + +**Description:** Original script that creates parquet files. Kept for compatibility but DuckDB format is recommended. 
+ ```bash # Run with default paths python process_ccc_data.py -# Run with custom paths +# Custom paths python process_ccc_data.py --source-dir /path/to/source --output-dir /path/to/output ``` -### Arguments -- `--source-dir`: Source directory containing .pkl files (default: `/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all`) -- `--output-dir`: Output directory for processed parquet files (default: `/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet`) -- `--dry-run`: Show what would be processed without actually doing it -- `--debug`: Enable debug logging (shows detailed processing information) - -### Requirements -- `pandas`: For reading .pkl files and writing .parquet files -- `pyarrow`: Required for parquet format support with snappy compression -- `tqdm`: For progress bars (optional, script will work without it) - -### Logging -The script automatically creates detailed logs in the `logs/` directory with timestamps: -- **Log location**: `logs/process_ccc_data_YYYYMMDD_HHMMSS.log` -- **Log levels**: INFO (default) and DEBUG (with `--debug` flag) -- **Log content**: Processing progress, file details, errors, timing information, and archive sizes -- **Console output**: Key information is also printed to console for real-time monitoring - -Example log entries: +## Performance Comparison + +| Metric | Parquet | DuckDB | Improvement | +|--------|---------|---------|-------------| +| Storage Size | 302 GB | ~60-80 GB | 4-5x smaller | +| Load Time | >60s timeout | 0s (no loading) | Instant access | +| Single Query | >100ms | <1ms | 100x+ faster | +| Memory Usage | 13+ GB | <1 GB | 13x+ less | +| Random Access | Very slow | Sub-millisecond | Orders of magnitude | + +## Requirements + +```bash +# Core requirements +conda activate ccc-gpu +pip install pandas duckdb numpy tqdm + +# Optional for parquet support +pip install pyarrow +``` + +## Output Structure + +### DuckDB Format (Recommended) ``` -2024-01-15 10:30:15,123 - INFO - Starting CCC 
data processing -2024-01-15 10:30:15,124 - INFO - Found 54 .pkl files to process -2024-01-15 10:30:16,200 - INFO - Processing file: gtex_v8_data_whole_blood-var_pc_log2-all.pkl -2024-01-15 10:35:22,456 - INFO - Successfully processed gtex_v8_data_whole_blood-var_pc_log2-all.pkl +/output_directory/ +├── bladder_ccc.duckdb # Individual tissue databases +├── brain_cortex_ccc.duckdb +├── whole_blood_ccc.duckdb +└── all_tissues_ccc.duckdb # Optional consolidated database ``` -### Output -The script will create individual `.parquet` files for each source file containing only CCC data with multi-indices preserved. Parquet format with snappy compression provides significant space savings compared to .pkl files while maintaining fast read/write performance. +### Database Schema +```sql +-- Individual tissue table +CREATE TABLE ccc_data ( + gene1 VARCHAR NOT NULL, + gene2 VARCHAR NOT NULL, + ccc REAL NOT NULL, + PRIMARY KEY (gene1, gene2) +); + +-- Indexes for fast lookups +CREATE INDEX idx_gene2 ON ccc_data(gene2); +CREATE INDEX idx_ccc ON ccc_data(ccc); +``` -### File Naming Convention -Input: `gtex_v8_data_whole_blood-var_pc_log2-all.pkl` -Output: `gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet` +## Example Workflow -### Example ```bash -# Activate environment and run +# 1. Convert all pickle files to DuckDB conda activate ccc-gpu -python process_ccc_data.py +python process_ccc_to_duckdb.py + +# 2. Test query performance +python ccc_duckdb_query.py /path/to/bladder_ccc.duckdb --stats + +# 3. 
Use in Python scripts +from ccc_duckdb_query import CCCDatabase + +with CCCDatabase("bladder_ccc.duckdb") as db: + # Fast queries for your analysis + ccc = db.get_correlation("gene1", "gene2") +``` + +## Advantages of DuckDB Format -# Run with debug logging for more detailed information -python process_ccc_data.py --debug - -# Expected output structure: -# nbs/03-manuscript/40_prepare_supp_data/ -# └── logs/ -# └── process_ccc_data_YYYYMMDD_HHMMSS.log -# -# /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/ -# ├── gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet -# ├── gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet -# ├── gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet -# └── ... (54 individual .parquet files total) -``` \ No newline at end of file +1. **No Loading Required**: Direct queries without loading entire dataset +2. **Memory Efficient**: Uses memory-mapped IO, minimal RAM footprint +3. **Fast Random Access**: Indexed lookups in microseconds +4. **SQL Support**: Complex queries and aggregations +5. **Better Compression**: Columnar storage with efficient encoding +6. **Concurrent Access**: Multiple readers can query simultaneously +7. **ACID Compliance**: Data integrity guarantees \ No newline at end of file diff --git a/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py b/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py new file mode 100755 index 00000000..38938650 --- /dev/null +++ b/nbs/03-manuscript/40_prepare_supp_data/ccc_duckdb_query.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python3 +""" +Python wrapper for fast CCC correlation queries from DuckDB databases. + +This module provides a simple interface for querying gene pair correlations +from the DuckDB databases created by process_ccc_to_duckdb.py. 
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd

logger = logging.getLogger(__name__)


class CCCDatabase:
    """Read-only wrapper for querying CCC correlation data stored in DuckDB.

    Two database layouts are recognised:

    * "single": no ``tissues`` table is present; all queries run against the
      ``ccc_data`` table.
    * "consolidated": a ``tissues`` table is present; per-tissue queries run
      against the ``ccc_<tissue>`` table and therefore need ``tissue``.

    The instance can be used as a context manager; the connection is closed
    on exit.
    """

    def __init__(self, db_path: Union[str, Path], tissue: Optional[str] = None):
        """
        Open a read-only connection to a DuckDB database.

        Args:
            db_path: Path to DuckDB database file
            tissue: Tissue name (for consolidated database)

        Raises:
            FileNotFoundError: If ``db_path`` does not exist.
            ValueError: If ``tissue`` is given but is not listed in the
                ``tissues`` table of a consolidated database.
        """
        # Imported here, the only place it is used, so importing this module
        # does not require duckdb until a database is actually opened.
        import duckdb

        self.db_path = Path(db_path)
        if not self.db_path.exists():
            raise FileNotFoundError(f"Database not found: {db_path}")

        self.con = duckdb.connect(str(self.db_path), read_only=True)
        self.tissue = tissue
        # Stays None for a consolidated database opened without a tissue, so
        # per-tissue queries can fail with a clear error instead of an
        # AttributeError.
        self.table_name: Optional[str] = None

        try:
            # Detect database type (single tissue or consolidated)
            table_names = [row[0] for row in self.con.execute("SHOW TABLES").fetchall()]

            if "tissues" in table_names:
                self.db_type = "consolidated"
                self.tissues = self._get_available_tissues()

                if tissue:
                    if tissue not in self.tissues:
                        raise ValueError(
                            f"Tissue '{tissue}' not found. Available: {self.tissues}"
                        )
                    # Safe to interpolate into SQL later: the name was just
                    # validated against the database's own tissue list.
                    self.table_name = f"ccc_{tissue}"
                else:
                    logger.info(f"Consolidated database with {len(self.tissues)} tissues")
                    logger.info(f"Available tissues: {', '.join(self.tissues[:5])}...")
            else:
                self.db_type = "single"
                self.table_name = "ccc_data"
                self.tissues = None
        except Exception:
            # Do not leak the open connection when initialisation fails.
            self.con.close()
            raise

    def _get_available_tissues(self) -> List[str]:
        """Get list of available tissues in consolidated database."""
        rows = self.con.execute(
            "SELECT tissue_name FROM tissues ORDER BY tissue_name"
        ).fetchall()
        return [row[0] for row in rows]

    def _require_table(self) -> str:
        """Return the table to query, or raise if no tissue table was selected."""
        if self.table_name is None:
            raise ValueError("Tissue must be specified for consolidated database")
        return self.table_name

    def get_correlation(self, gene1: str, gene2: str) -> Optional[float]:
        """
        Get CCC correlation for a specific gene pair.

        Args:
            gene1: First gene ID
            gene2: Second gene ID

        Returns:
            CCC correlation value or None if not found

        Raises:
            ValueError: If no tissue table is selected (consolidated database
                opened without a tissue).
        """
        table = self._require_table()

        # Try both orientations: a pair is looked up as (gene1, gene2) and
        # as (gene2, gene1).
        query = f"""
            SELECT ccc FROM {table}
            WHERE (gene1 = ? AND gene2 = ?)
               OR (gene1 = ? AND gene2 = ?)
            LIMIT 1
        """

        row = self.con.execute(query, [gene1, gene2, gene2, gene1]).fetchone()
        return row[0] if row else None

    def get_gene_correlations(
        self,
        gene: str,
        min_ccc: Optional[float] = None,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Get all correlations for a specific gene.

        Args:
            gene: Gene ID
            min_ccc: Minimum CCC threshold (optional)
            limit: Maximum number of results (optional)

        Returns:
            DataFrame with columns: gene_pair, ccc. ``gene_pair`` holds the
            *other* gene of each pair; rows are ordered by ``ccc`` descending.

        Raises:
            ValueError: If no tissue table is selected.
        """
        table = self._require_table()

        # All user-supplied values are bound as parameters rather than
        # formatted into the SQL text.
        conditions = ["(gene1 = ? OR gene2 = ?)"]
        params: List = [gene, gene, gene]  # one for the CASE, two for WHERE
        if min_ccc is not None:
            conditions.append("ccc >= ?")
            params.append(min_ccc)

        query = f"""
            SELECT
                CASE
                    WHEN gene1 = ? THEN gene2
                    ELSE gene1
                END as gene_pair,
                ccc
            FROM {table}
            WHERE {' AND '.join(conditions)}
            ORDER BY ccc DESC
        """
        if limit is not None:
            query += " LIMIT ?"
            params.append(limit)

        return self.con.execute(query, params).df()

    def get_top_correlations(
        self,
        threshold: float = 0.9,
        limit: int = 100
    ) -> pd.DataFrame:
        """
        Get top correlations above a threshold.

        Args:
            threshold: Minimum CCC value
            limit: Maximum number of results

        Returns:
            DataFrame with columns: gene1, gene2, ccc

        Raises:
            ValueError: If no tissue table is selected.
        """
        table = self._require_table()

        query = f"""
            SELECT gene1, gene2, ccc
            FROM {table}
            WHERE ccc >= ?
            ORDER BY ccc DESC
            LIMIT ?
        """

        return self.con.execute(query, [threshold, limit]).df()

    def get_batch_correlations(
        self,
        pairs: List[Tuple[str, str]]
    ) -> Dict[Tuple[str, str], Optional[float]]:
        """
        Get correlations for multiple gene pairs efficiently.

        Each requested pair is matched in either orientation, consistently
        with ``get_correlation``.

        Args:
            pairs: List of (gene1, gene2) tuples

        Returns:
            Dictionary mapping every requested (gene1, gene2) tuple, exactly
            as given, to its CCC value, or to None when the pair is not found

        Raises:
            ValueError: If no tissue table is selected.
        """
        table = self._require_table()

        if not pairs:
            return {}

        # Start with "not found" for every requested pair; hits overwrite it.
        found: Dict[Tuple[str, str], Optional[float]] = {
            (g1, g2): None for g1, g2 in pairs
        }

        # OR REPLACE keeps a stale table from an interrupted earlier call
        # from breaking this one.
        self.con.execute(
            "CREATE OR REPLACE TEMPORARY TABLE query_pairs (gene1 VARCHAR, gene2 VARCHAR)"
        )
        try:
            self.con.executemany(
                "INSERT INTO query_pairs VALUES (?, ?)",
                [[g1, g2] for g1, g2 in pairs],
            )

            # Two equi-joins (as-given and swapped orientation) combined with
            # UNION ALL; the result is always keyed by the *queried* pair.
            query = f"""
                SELECT qp.gene1, qp.gene2, c.ccc
                FROM query_pairs qp
                JOIN {table} c
                  ON c.gene1 = qp.gene1 AND c.gene2 = qp.gene2
                UNION ALL
                SELECT qp.gene1, qp.gene2, c.ccc
                FROM query_pairs qp
                JOIN {table} c
                  ON c.gene1 = qp.gene2 AND c.gene2 = qp.gene1
            """
            rows = self.con.execute(query).fetchall()
        finally:
            # Always clean up, even if the insert or the query failed.
            self.con.execute("DROP TABLE IF EXISTS query_pairs")

        for query_gene1, query_gene2, ccc in rows:
            found[(query_gene1, query_gene2)] = ccc

        return found

    def get_cross_tissue_correlation(
        self,
        gene1: str,
        gene2: str
    ) -> pd.DataFrame:
        """
        Get correlation values across all tissues (consolidated database only).

        Queries the ``all_correlations`` relation of the consolidated
        database, matching the pair in either orientation.

        Args:
            gene1: First gene ID
            gene2: Second gene ID

        Returns:
            DataFrame with columns: tissue, ccc (ordered by ccc descending)

        Raises:
            ValueError: If the database is not a consolidated database.
        """
        if self.db_type != "consolidated":
            raise ValueError("Cross-tissue query requires consolidated database")

        query = """
            SELECT tissue, ccc
            FROM all_correlations
            WHERE (gene1 = ? AND gene2 = ?)
               OR (gene1 = ? AND gene2 = ?)
            ORDER BY ccc DESC
        """

        return self.con.execute(query, [gene1, gene2, gene2, gene1]).df()

    def query(self, sql: str, parameters: Optional[List] = None) -> pd.DataFrame:
        """
        Execute custom SQL query on the database.

        Args:
            sql: SQL query string
            parameters: Query parameters (optional)

        Returns:
            Query results as DataFrame
        """
        if parameters:
            return self.con.execute(sql, parameters).df()
        return self.con.execute(sql).df()

    def get_statistics(self) -> Dict:
        """
        Get database statistics.

        Returns:
            Dictionary with database statistics. For a consolidated database
            the values are aggregated from the ``tissues`` table (plus the
            selected tissue's row, if any); for a single-tissue database they
            are computed from the data table. ``database_size_gb`` is always
            included.
        """
        stats: Dict = {}

        if self.db_type == "consolidated":
            summary = self.con.execute("""
                SELECT
                    COUNT(*) as num_tissues,
                    SUM(num_pairs) as total_pairs,
                    MIN(min_ccc) as global_min_ccc,
                    MAX(max_ccc) as global_max_ccc,
                    AVG(mean_ccc) as avg_mean_ccc
                FROM tissues
            """).fetchone()

            stats['type'] = 'consolidated'
            stats['num_tissues'] = summary[0]
            stats['total_pairs'] = summary[1]
            stats['global_min_ccc'] = summary[2]
            stats['global_max_ccc'] = summary[3]
            stats['avg_mean_ccc'] = summary[4]

            if self.tissue:
                tissue_row = self.con.execute("""
                    SELECT num_pairs, min_ccc, max_ccc, mean_ccc
                    FROM tissues
                    WHERE tissue_name = ?
                """, [self.tissue]).fetchone()

                if tissue_row:
                    stats['tissue'] = self.tissue
                    stats['tissue_pairs'] = tissue_row[0]
                    stats['tissue_min_ccc'] = tissue_row[1]
                    stats['tissue_max_ccc'] = tissue_row[2]
                    stats['tissue_mean_ccc'] = tissue_row[3]

        else:
            row = self.con.execute(f"""
                SELECT
                    COUNT(*) as num_pairs,
                    MIN(ccc) as min_ccc,
                    MAX(ccc) as max_ccc,
                    AVG(ccc) as mean_ccc
                FROM {self.table_name}
            """).fetchone()

            stats['type'] = 'single'
            stats['num_pairs'] = row[0]
            stats['min_ccc'] = row[1]
            stats['max_ccc'] = row[2]
            stats['mean_ccc'] = row[3]

        # Database file size on disk, in GiB
        stats['database_size_gb'] = self.db_path.stat().st_size / (1024**3)

        return stats

    def close(self):
        """Close database connection."""
        self.con.close()

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()


def main():
    """Example usage and simple CLI.

    Exactly one query mode runs per invocation, chosen in this order:
    ``--stats``, ``--gene1``/``--gene2``, ``--gene``, ``--top``.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Query CCC correlation database")
    parser.add_argument("database", help="Path to DuckDB database")
    parser.add_argument("--tissue", help="Tissue name (for consolidated database)")
    parser.add_argument("--gene1", help="First gene ID")
    parser.add_argument("--gene2", help="Second gene ID")
    parser.add_argument("--gene", help="Get all correlations for this gene")
    parser.add_argument("--top", type=float, help="Get top correlations above threshold")
    parser.add_argument("--limit", type=int, default=100, help="Limit number of results")
    parser.add_argument("--stats", action="store_true", help="Show database statistics")

    args = parser.parse_args()

    with CCCDatabase(args.database, tissue=args.tissue) as db:

        if args.stats:
            stats = db.get_statistics()
            print("\nDatabase Statistics:")
            for key, value in stats.items():
                if isinstance(value, float):
                    print(f"  {key}: {value:.4f}")
                else:
                    print(f"  {key}: {value}")

        elif args.gene1 and args.gene2:
            # Query specific pair
            ccc = db.get_correlation(args.gene1, args.gene2)
            if ccc is not None:
                print(f"CCC({args.gene1}, {args.gene2}) = {ccc:.6f}")
            else:
                print(f"No correlation found for pair ({args.gene1}, {args.gene2})")

        elif args.gene:
            # Get all correlations for gene
            results = db.get_gene_correlations(args.gene, limit=args.limit)
            print(f"\nTop {len(results)} correlations for {args.gene}:")
            print(results.to_string())

        elif args.top is not None:
            # "is not None" so that an explicit threshold of 0 is honoured.
            results = db.get_top_correlations(threshold=args.top, limit=args.limit)
            print(f"\nTop {len(results)} correlations above {args.top}:")
            print(results.to_string())

        else:
            print("Please specify a query option (--gene1/--gene2, --gene, --top, or --stats)")


if __name__ == "__main__":
    main()
-2025-09-11 22:50:04,102 - DEBUG - First few files: ['gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl', 'gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl', 'gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl', 'gtex_v8_data_artery_aorta-var_pc_log2-all.pkl', 'gtex_v8_data_artery_coronary-var_pc_log2-all.pkl'] -2025-09-11 22:50:04,102 - INFO - Output directory created/verified: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet -2025-09-11 22:50:04,102 - INFO - Starting processing of 54 files -2025-09-11 22:50:04,104 - DEBUG - Processing file 1/54: gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl -2025-09-11 22:50:04,104 - INFO - Processing file: gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl -2025-09-11 22:50:38,091 - DEBUG - Loaded data shape: (1460025703, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 22:50:40,159 - DEBUG - Extracted CCC data shape: (1460025703, 1) -2025-09-11 22:50:40,159 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet -2025-09-11 22:54:34,827 - INFO - Successfully processed gtex_v8_data_adipose_subcutaneous-var_pc_log2-all.pkl -> gtex_v8_data_adipose_subcutaneous-var_pc_log2-all_ccc_only.parquet -2025-09-11 22:54:34,827 - INFO - Size reduction: 19.04 GB -> 6.55 GB (65.6% smaller) -2025-09-11 22:54:36,104 - DEBUG - Processing file 2/54: gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl -2025-09-11 22:54:36,105 - INFO - Processing file: gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl -2025-09-11 22:55:12,030 - DEBUG - Loaded data shape: (1440046611, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 22:55:14,077 - DEBUG - Extracted CCC data shape: (1440046611, 1) -2025-09-11 22:55:14,077 - DEBUG - Saving .parquet file: 
/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet -2025-09-11 22:59:03,657 - INFO - Successfully processed gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all.pkl -> gtex_v8_data_adipose_visceral_omentum-var_pc_log2-all_ccc_only.parquet -2025-09-11 22:59:03,657 - INFO - Size reduction: 18.78 GB -> 6.35 GB (66.2% smaller) -2025-09-11 22:59:04,942 - DEBUG - Processing file 3/54: gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl -2025-09-11 22:59:04,942 - INFO - Processing file: gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl -2025-09-11 22:59:38,946 - DEBUG - Loaded data shape: (1358012670, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 22:59:40,880 - DEBUG - Extracted CCC data shape: (1358012670, 1) -2025-09-11 22:59:40,880 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:03:24,150 - INFO - Successfully processed gtex_v8_data_adrenal_gland-var_pc_log2-all.pkl -> gtex_v8_data_adrenal_gland-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:03:24,150 - INFO - Size reduction: 17.71 GB -> 5.59 GB (68.4% smaller) -2025-09-11 23:03:25,363 - DEBUG - Processing file 4/54: gtex_v8_data_artery_aorta-var_pc_log2-all.pkl -2025-09-11 23:03:25,363 - INFO - Processing file: gtex_v8_data_artery_aorta-var_pc_log2-all.pkl -2025-09-11 23:04:00,118 - DEBUG - Loaded data shape: (1419832116, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:04:02,111 - DEBUG - Extracted CCC data shape: (1419832116, 1) -2025-09-11 23:04:02,112 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_aorta-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:07:50,792 - INFO - Successfully processed gtex_v8_data_artery_aorta-var_pc_log2-all.pkl -> gtex_v8_data_artery_aorta-var_pc_log2-all_ccc_only.parquet -2025-09-11 
23:07:50,793 - INFO - Size reduction: 18.51 GB -> 6.18 GB (66.6% smaller) -2025-09-11 23:07:52,114 - DEBUG - Processing file 5/54: gtex_v8_data_artery_coronary-var_pc_log2-all.pkl -2025-09-11 23:07:52,114 - INFO - Processing file: gtex_v8_data_artery_coronary-var_pc_log2-all.pkl -2025-09-11 23:08:31,245 - DEBUG - Loaded data shape: (1373430255, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:08:33,214 - DEBUG - Extracted CCC data shape: (1373430255, 1) -2025-09-11 23:08:33,214 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_coronary-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:12:11,613 - INFO - Successfully processed gtex_v8_data_artery_coronary-var_pc_log2-all.pkl -> gtex_v8_data_artery_coronary-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:12:11,613 - INFO - Size reduction: 17.91 GB -> 5.74 GB (67.9% smaller) -2025-09-11 23:12:12,970 - DEBUG - Processing file 6/54: gtex_v8_data_artery_tibial-var_pc_log2-all.pkl -2025-09-11 23:12:12,970 - INFO - Processing file: gtex_v8_data_artery_tibial-var_pc_log2-all.pkl -2025-09-11 23:12:48,104 - DEBUG - Loaded data shape: (1454033701, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:12:50,137 - DEBUG - Extracted CCC data shape: (1454033701, 1) -2025-09-11 23:12:50,137 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_artery_tibial-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:16:47,353 - INFO - Successfully processed gtex_v8_data_artery_tibial-var_pc_log2-all.pkl -> gtex_v8_data_artery_tibial-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:16:47,353 - INFO - Size reduction: 18.96 GB -> 6.49 GB (65.8% smaller) -2025-09-11 23:16:48,739 - DEBUG - Processing file 7/54: gtex_v8_data_bladder-var_pc_log2-all.pkl -2025-09-11 23:16:48,739 - INFO - Processing file: gtex_v8_data_bladder-var_pc_log2-all.pkl -2025-09-11 23:17:16,930 - DEBUG - Loaded data shape: (995271420, 
3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:17:18,340 - DEBUG - Extracted CCC data shape: (995271420, 1) -2025-09-11 23:17:18,340 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_bladder-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:19:53,633 - INFO - Successfully processed gtex_v8_data_bladder-var_pc_log2-all.pkl -> gtex_v8_data_bladder-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:19:53,633 - INFO - Size reduction: 12.98 GB -> 3.37 GB (74.0% smaller) -2025-09-11 23:19:54,610 - DEBUG - Processing file 8/54: gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl -2025-09-11 23:19:54,611 - INFO - Processing file: gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl -2025-09-11 23:20:26,794 - DEBUG - Loaded data shape: (1313153128, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:20:28,666 - DEBUG - Extracted CCC data shape: (1313153128, 1) -2025-09-11 23:20:28,666 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_amygdala-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:24:01,118 - INFO - Successfully processed gtex_v8_data_brain_amygdala-var_pc_log2-all.pkl -> gtex_v8_data_brain_amygdala-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:24:01,118 - INFO - Size reduction: 17.12 GB -> 5.14 GB (70.0% smaller) -2025-09-11 23:24:02,286 - DEBUG - Processing file 9/54: gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl -2025-09-11 23:24:02,286 - INFO - Processing file: gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl -2025-09-11 23:24:40,076 - DEBUG - Loaded data shape: (1345637503, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:24:41,958 - DEBUG - Extracted CCC data shape: (1345637503, 1) -2025-09-11 23:24:41,958 - DEBUG - Saving .parquet file: 
/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:28:18,530 - INFO - Successfully processed gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all.pkl -> gtex_v8_data_brain_anterior_cingulate_cortex_ba24-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:28:18,530 - INFO - Size reduction: 17.55 GB -> 5.44 GB (69.0% smaller) -2025-09-11 23:28:19,802 - DEBUG - Processing file 10/54: gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl -2025-09-11 23:28:19,802 - INFO - Processing file: gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl -2025-09-11 23:28:52,599 - DEBUG - Loaded data shape: (1377836265, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:28:54,575 - DEBUG - Extracted CCC data shape: (1377836265, 1) -2025-09-11 23:28:54,576 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:32:34,520 - INFO - Successfully processed gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_caudate_basal_ganglia-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:32:34,520 - INFO - Size reduction: 17.97 GB -> 5.74 GB (68.0% smaller) -2025-09-11 23:32:35,848 - DEBUG - Processing file 11/54: gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl -2025-09-11 23:32:35,848 - INFO - Processing file: gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl -2025-09-11 23:33:09,984 - DEBUG - Loaded data shape: (1357283151, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:33:11,931 - DEBUG - Extracted CCC data shape: (1357283151, 1) -2025-09-11 23:33:11,931 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:36:52,280 - 
INFO - Successfully processed gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all.pkl -> gtex_v8_data_brain_cerebellar_hemisphere-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:36:52,280 - INFO - Size reduction: 17.70 GB -> 5.58 GB (68.4% smaller) -2025-09-11 23:36:53,574 - DEBUG - Processing file 12/54: gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl -2025-09-11 23:36:53,574 - INFO - Processing file: gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl -2025-09-11 23:37:32,215 - DEBUG - Loaded data shape: (1373692320, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:37:34,080 - DEBUG - Extracted CCC data shape: (1373692320, 1) -2025-09-11 23:37:34,080 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cerebellum-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:41:32,142 - INFO - Successfully processed gtex_v8_data_brain_cerebellum-var_pc_log2-all.pkl -> gtex_v8_data_brain_cerebellum-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:41:32,142 - INFO - Size reduction: 17.91 GB -> 5.75 GB (67.9% smaller) -2025-09-11 23:41:33,450 - DEBUG - Processing file 13/54: gtex_v8_data_brain_cortex-var_pc_log2-all.pkl -2025-09-11 23:41:33,450 - INFO - Processing file: gtex_v8_data_brain_cortex-var_pc_log2-all.pkl -2025-09-11 23:42:24,751 - DEBUG - Loaded data shape: (1428371076, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:42:26,765 - DEBUG - Extracted CCC data shape: (1428371076, 1) -2025-09-11 23:42:26,765 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_cortex-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:46:22,020 - INFO - Successfully processed gtex_v8_data_brain_cortex-var_pc_log2-all.pkl -> gtex_v8_data_brain_cortex-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:46:22,020 - INFO - Size reduction: 18.63 GB -> 6.26 GB (66.4% smaller) -2025-09-11 23:46:23,372 - DEBUG - Processing file 14/54: 
gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl -2025-09-11 23:46:23,372 - INFO - Processing file: gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl -2025-09-11 23:47:00,136 - DEBUG - Loaded data shape: (1359576585, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:47:01,992 - DEBUG - Extracted CCC data shape: (1359576585, 1) -2025-09-11 23:47:01,992 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:50:37,217 - INFO - Successfully processed gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all.pkl -> gtex_v8_data_brain_frontal_cortex_ba9-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:50:37,217 - INFO - Size reduction: 17.73 GB -> 5.59 GB (68.5% smaller) -2025-09-11 23:50:38,531 - DEBUG - Processing file 15/54: gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl -2025-09-11 23:50:38,531 - INFO - Processing file: gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl -2025-09-11 23:52:04,330 - DEBUG - Loaded data shape: (1381565895, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-11 23:52:06,311 - DEBUG - Extracted CCC data shape: (1381565895, 1) -2025-09-11 23:52:06,311 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_hippocampus-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:55:49,825 - INFO - Successfully processed gtex_v8_data_brain_hippocampus-var_pc_log2-all.pkl -> gtex_v8_data_brain_hippocampus-var_pc_log2-all_ccc_only.parquet -2025-09-11 23:55:49,825 - INFO - Size reduction: 18.01 GB -> 5.78 GB (67.9% smaller) -2025-09-11 23:55:51,065 - DEBUG - Processing file 16/54: gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl -2025-09-11 23:55:51,065 - INFO - Processing file: gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl -2025-09-11 23:56:27,935 - DEBUG - Loaded data shape: (1371020430, 3), columns: ['ccc', 'pearson', 
'spearman'] -2025-09-11 23:56:29,884 - DEBUG - Extracted CCC data shape: (1371020430, 1) -2025-09-11 23:56:29,884 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_hypothalamus-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:00:09,533 - INFO - Successfully processed gtex_v8_data_brain_hypothalamus-var_pc_log2-all.pkl -> gtex_v8_data_brain_hypothalamus-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:00:09,533 - INFO - Size reduction: 17.88 GB -> 5.67 GB (68.3% smaller) -2025-09-12 00:00:10,797 - DEBUG - Processing file 17/54: gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl -2025-09-12 00:00:10,797 - INFO - Processing file: gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl -2025-09-12 00:00:44,756 - DEBUG - Loaded data shape: (1389198405, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:00:46,719 - DEBUG - Extracted CCC data shape: (1389198405, 1) -2025-09-12 00:00:46,719 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:05:05,403 - INFO - Successfully processed gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_nucleus_accumbens_basal_ganglia-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:05:05,403 - INFO - Size reduction: 18.11 GB -> 5.84 GB (67.7% smaller) -2025-09-12 00:05:06,693 - DEBUG - Processing file 18/54: gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl -2025-09-12 00:05:06,693 - INFO - Processing file: gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl -2025-09-12 00:05:39,317 - DEBUG - Loaded data shape: (1336936195, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:05:41,239 - DEBUG - Extracted CCC data shape: (1336936195, 1) -2025-09-12 00:05:41,239 - DEBUG - Saving .parquet file: 
/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:09:17,133 - INFO - Successfully processed gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all.pkl -> gtex_v8_data_brain_putamen_basal_ganglia-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:09:17,134 - INFO - Size reduction: 17.43 GB -> 5.35 GB (69.3% smaller) -2025-09-12 00:09:18,419 - DEBUG - Processing file 19/54: gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl -2025-09-12 00:09:18,419 - INFO - Processing file: gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl -2025-09-12 00:09:55,385 - DEBUG - Loaded data shape: (1305886065, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:09:57,241 - DEBUG - Extracted CCC data shape: (1305886065, 1) -2025-09-12 00:09:57,241 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:13:24,327 - INFO - Successfully processed gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all.pkl -> gtex_v8_data_brain_spinal_cord_cervical_c1-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:13:24,328 - INFO - Size reduction: 17.03 GB -> 5.07 GB (70.2% smaller) -2025-09-12 00:13:25,591 - DEBUG - Processing file 20/54: gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl -2025-09-12 00:13:25,591 - INFO - Processing file: gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl -2025-09-12 00:13:56,027 - DEBUG - Loaded data shape: (1278940600, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:13:57,877 - DEBUG - Extracted CCC data shape: (1278940600, 1) -2025-09-12 00:13:57,877 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_brain_substantia_nigra-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:17:25,692 - INFO - Successfully processed 
gtex_v8_data_brain_substantia_nigra-var_pc_log2-all.pkl -> gtex_v8_data_brain_substantia_nigra-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:17:25,692 - INFO - Size reduction: 16.68 GB -> 4.82 GB (71.1% smaller) -2025-09-12 00:17:26,928 - DEBUG - Processing file 21/54: gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl -2025-09-12 00:17:26,928 - INFO - Processing file: gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl -2025-09-12 00:18:07,581 - DEBUG - Loaded data shape: (1452847560, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:18:09,592 - DEBUG - Extracted CCC data shape: (1452847560, 1) -2025-09-12 00:18:09,593 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_breast_mammary_tissue-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:22:35,756 - INFO - Successfully processed gtex_v8_data_breast_mammary_tissue-var_pc_log2-all.pkl -> gtex_v8_data_breast_mammary_tissue-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:22:35,756 - INFO - Size reduction: 18.94 GB -> 6.50 GB (65.7% smaller) -2025-09-12 00:22:37,092 - DEBUG - Processing file 22/54: gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl -2025-09-12 00:22:37,092 - INFO - Processing file: gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl -2025-09-12 00:23:16,518 - DEBUG - Loaded data shape: (1401877725, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:23:18,482 - DEBUG - Extracted CCC data shape: (1401877725, 1) -2025-09-12 00:23:18,482 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:27:42,509 - INFO - Successfully processed gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all.pkl -> gtex_v8_data_cells_cultured_fibroblasts-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:27:42,510 - INFO - Size reduction: 18.28 GB -> 6.01 GB (67.1% smaller) -2025-09-12 
00:27:43,822 - DEBUG - Processing file 23/54: gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl -2025-09-12 00:27:43,822 - INFO - Processing file: gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl -2025-09-12 00:28:16,439 - DEBUG - Loaded data shape: (1338539670, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:28:18,322 - DEBUG - Extracted CCC data shape: (1338539670, 1) -2025-09-12 00:28:18,322 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:31:50,501 - INFO - Successfully processed gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all.pkl -> gtex_v8_data_cells_ebvtransformed_lymphocytes-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:31:50,501 - INFO - Size reduction: 17.45 GB -> 5.43 GB (68.9% smaller) -2025-09-12 00:31:51,756 - DEBUG - Processing file 24/54: gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl -2025-09-12 00:31:51,756 - INFO - Processing file: gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl -2025-09-12 00:32:16,338 - DEBUG - Loaded data shape: (871468626, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:32:17,565 - DEBUG - Extracted CCC data shape: (871468626, 1) -2025-09-12 00:32:17,565 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cervix_ectocervix-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:34:32,270 - INFO - Successfully processed gtex_v8_data_cervix_ectocervix-var_pc_log2-all.pkl -> gtex_v8_data_cervix_ectocervix-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:34:32,270 - INFO - Size reduction: 11.36 GB -> 2.56 GB (77.4% smaller) -2025-09-12 00:34:33,139 - DEBUG - Processing file 25/54: gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl -2025-09-12 00:34:33,139 - INFO - Processing file: gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl -2025-09-12 00:34:57,887 
- DEBUG - Loaded data shape: (883533666, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:34:59,115 - DEBUG - Extracted CCC data shape: (883533666, 1) -2025-09-12 00:34:59,116 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_cervix_endocervix-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:37:14,667 - INFO - Successfully processed gtex_v8_data_cervix_endocervix-var_pc_log2-all.pkl -> gtex_v8_data_cervix_endocervix-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:37:14,667 - INFO - Size reduction: 11.52 GB -> 2.70 GB (76.6% smaller) -2025-09-12 00:37:15,482 - DEBUG - Processing file 26/54: gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl -2025-09-12 00:37:15,482 - INFO - Processing file: gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl -2025-09-12 00:38:40,776 - DEBUG - Loaded data shape: (1414189153, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:38:42,695 - DEBUG - Extracted CCC data shape: (1414189153, 1) -2025-09-12 00:38:42,695 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_colon_sigmoid-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:42:32,231 - INFO - Successfully processed gtex_v8_data_colon_sigmoid-var_pc_log2-all.pkl -> gtex_v8_data_colon_sigmoid-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:42:32,231 - INFO - Size reduction: 18.44 GB -> 6.12 GB (66.8% smaller) -2025-09-12 00:42:33,416 - DEBUG - Processing file 27/54: gtex_v8_data_colon_transverse-var_pc_log2-all.pkl -2025-09-12 00:42:33,416 - INFO - Processing file: gtex_v8_data_colon_transverse-var_pc_log2-all.pkl -2025-09-12 00:43:13,557 - DEBUG - Loaded data shape: (1425646503, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:43:15,585 - DEBUG - Extracted CCC data shape: (1425646503, 1) -2025-09-12 00:43:15,585 - DEBUG - Saving .parquet file: 
/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_colon_transverse-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:47:04,970 - INFO - Successfully processed gtex_v8_data_colon_transverse-var_pc_log2-all.pkl -> gtex_v8_data_colon_transverse-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:47:04,971 - INFO - Size reduction: 18.59 GB -> 6.22 GB (66.6% smaller) -2025-09-12 00:47:06,372 - DEBUG - Processing file 28/54: gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl -2025-09-12 00:47:06,372 - INFO - Processing file: gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl -2025-09-12 00:47:40,754 - DEBUG - Loaded data shape: (1407708330, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:47:42,712 - DEBUG - Extracted CCC data shape: (1407708330, 1) -2025-09-12 00:47:42,712 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:51:30,851 - INFO - Successfully processed gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_gastroesophageal_junction-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:51:30,852 - INFO - Size reduction: 18.36 GB -> 6.05 GB (67.0% smaller) -2025-09-12 00:51:32,216 - DEBUG - Processing file 29/54: gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl -2025-09-12 00:51:32,217 - INFO - Processing file: gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl -2025-09-12 00:52:12,189 - DEBUG - Loaded data shape: (1429226380, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:52:14,202 - DEBUG - Extracted CCC data shape: (1429226380, 1) -2025-09-12 00:52:14,202 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_mucosa-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:56:04,136 - INFO - Successfully processed 
gtex_v8_data_esophagus_mucosa-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_mucosa-var_pc_log2-all_ccc_only.parquet -2025-09-12 00:56:04,136 - INFO - Size reduction: 18.64 GB -> 6.26 GB (66.4% smaller) -2025-09-12 00:56:05,503 - DEBUG - Processing file 30/54: gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl -2025-09-12 00:56:05,504 - INFO - Processing file: gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl -2025-09-12 00:56:40,714 - DEBUG - Loaded data shape: (1438705261, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 00:56:42,738 - DEBUG - Extracted CCC data shape: (1438705261, 1) -2025-09-12 00:56:42,738 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_esophagus_muscularis-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:01:04,618 - INFO - Successfully processed gtex_v8_data_esophagus_muscularis-var_pc_log2-all.pkl -> gtex_v8_data_esophagus_muscularis-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:01:04,618 - INFO - Size reduction: 18.76 GB -> 6.34 GB (66.2% smaller) -2025-09-12 01:01:05,944 - DEBUG - Processing file 31/54: gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl -2025-09-12 01:01:05,944 - INFO - Processing file: gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl -2025-09-12 01:01:30,707 - DEBUG - Loaded data shape: (869799486, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:01:31,984 - DEBUG - Extracted CCC data shape: (869799486, 1) -2025-09-12 01:01:31,984 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_fallopian_tube-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:03:48,060 - INFO - Successfully processed gtex_v8_data_fallopian_tube-var_pc_log2-all.pkl -> gtex_v8_data_fallopian_tube-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:03:48,061 - INFO - Size reduction: 11.34 GB -> 2.57 GB (77.4% smaller) -2025-09-12 01:03:48,926 - DEBUG - Processing file 32/54: 
gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl -2025-09-12 01:03:48,926 - INFO - Processing file: gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl -2025-09-12 01:04:24,279 - DEBUG - Loaded data shape: (1416051153, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:04:26,222 - DEBUG - Extracted CCC data shape: (1416051153, 1) -2025-09-12 01:04:26,222 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_heart_atrial_appendage-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:08:16,477 - INFO - Successfully processed gtex_v8_data_heart_atrial_appendage-var_pc_log2-all.pkl -> gtex_v8_data_heart_atrial_appendage-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:08:16,477 - INFO - Size reduction: 18.46 GB -> 6.11 GB (66.9% smaller) -2025-09-12 01:08:17,711 - DEBUG - Processing file 33/54: gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl -2025-09-12 01:08:17,711 - INFO - Processing file: gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl -2025-09-12 01:08:56,782 - DEBUG - Loaded data shape: (1389303828, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:08:58,759 - DEBUG - Extracted CCC data shape: (1389303828, 1) -2025-09-12 01:08:58,759 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_heart_left_ventricle-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:12:40,469 - INFO - Successfully processed gtex_v8_data_heart_left_ventricle-var_pc_log2-all.pkl -> gtex_v8_data_heart_left_ventricle-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:12:40,469 - INFO - Size reduction: 18.12 GB -> 5.84 GB (67.8% smaller) -2025-09-12 01:12:41,779 - DEBUG - Processing file 34/54: gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl -2025-09-12 01:12:41,779 - INFO - Processing file: gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl -2025-09-12 01:13:12,636 - DEBUG - Loaded data shape: (1231692528, 3), columns: ['ccc', 'pearson', 'spearman'] 
-2025-09-12 01:13:14,333 - DEBUG - Extracted CCC data shape: (1231692528, 1) -2025-09-12 01:13:14,333 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_kidney_cortex-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:16:29,445 - INFO - Successfully processed gtex_v8_data_kidney_cortex-var_pc_log2-all.pkl -> gtex_v8_data_kidney_cortex-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:16:29,445 - INFO - Size reduction: 16.06 GB -> 4.38 GB (72.7% smaller) -2025-09-12 01:16:30,622 - DEBUG - Processing file 35/54: gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl -2025-09-12 01:16:30,622 - INFO - Processing file: gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl -2025-09-12 01:16:48,096 - DEBUG - Loaded data shape: (692459505, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:16:49,091 - DEBUG - Extracted CCC data shape: (692459505, 1) -2025-09-12 01:16:49,091 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_kidney_medulla-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:18:36,495 - INFO - Successfully processed gtex_v8_data_kidney_medulla-var_pc_log2-all.pkl -> gtex_v8_data_kidney_medulla-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:18:36,495 - INFO - Size reduction: 9.03 GB -> 1.43 GB (84.1% smaller) -2025-09-12 01:18:37,172 - DEBUG - Processing file 36/54: gtex_v8_data_liver-var_pc_log2-all.pkl -2025-09-12 01:18:37,172 - INFO - Processing file: gtex_v8_data_liver-var_pc_log2-all.pkl -2025-09-12 01:19:14,213 - DEBUG - Loaded data shape: (1313153128, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:19:16,077 - DEBUG - Extracted CCC data shape: (1313153128, 1) -2025-09-12 01:19:16,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_liver-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:22:44,977 - INFO - Successfully processed gtex_v8_data_liver-var_pc_log2-all.pkl 
-> gtex_v8_data_liver-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:22:44,977 - INFO - Size reduction: 17.12 GB -> 5.16 GB (69.9% smaller) -2025-09-12 01:22:46,127 - DEBUG - Processing file 37/54: gtex_v8_data_lung-var_pc_log2-all.pkl -2025-09-12 01:22:46,127 - INFO - Processing file: gtex_v8_data_lung-var_pc_log2-all.pkl -2025-09-12 01:23:21,938 - DEBUG - Loaded data shape: (1461917628, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:23:24,049 - DEBUG - Extracted CCC data shape: (1461917628, 1) -2025-09-12 01:23:24,049 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_lung-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:27:20,003 - INFO - Successfully processed gtex_v8_data_lung-var_pc_log2-all.pkl -> gtex_v8_data_lung-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:27:20,003 - INFO - Size reduction: 19.06 GB -> 6.58 GB (65.5% smaller) -2025-09-12 01:27:21,442 - DEBUG - Processing file 38/54: gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl -2025-09-12 01:27:21,442 - INFO - Processing file: gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl -2025-09-12 01:27:54,922 - DEBUG - Loaded data shape: (1331409003, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:27:56,838 - DEBUG - Extracted CCC data shape: (1331409003, 1) -2025-09-12 01:27:56,838 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_minor_salivary_gland-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:31:28,578 - INFO - Successfully processed gtex_v8_data_minor_salivary_gland-var_pc_log2-all.pkl -> gtex_v8_data_minor_salivary_gland-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:31:28,578 - INFO - Size reduction: 17.36 GB -> 5.35 GB (69.2% smaller) -2025-09-12 01:31:29,896 - DEBUG - Processing file 39/54: gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl -2025-09-12 01:31:29,896 - INFO - Processing file: gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl 
-2025-09-12 01:32:06,348 - DEBUG - Loaded data shape: (1460025703, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:32:08,350 - DEBUG - Extracted CCC data shape: (1460025703, 1) -2025-09-12 01:32:08,350 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_muscle_skeletal-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:36:04,226 - INFO - Successfully processed gtex_v8_data_muscle_skeletal-var_pc_log2-all.pkl -> gtex_v8_data_muscle_skeletal-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:36:04,226 - INFO - Size reduction: 19.04 GB -> 6.54 GB (65.7% smaller) -2025-09-12 01:36:05,600 - DEBUG - Processing file 40/54: gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl -2025-09-12 01:36:05,601 - INFO - Processing file: gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl -2025-09-12 01:36:42,425 - DEBUG - Loaded data shape: (1472643585, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:36:44,497 - DEBUG - Extracted CCC data shape: (1472643585, 1) -2025-09-12 01:36:44,497 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_nerve_tibial-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:40:42,722 - INFO - Successfully processed gtex_v8_data_nerve_tibial-var_pc_log2-all.pkl -> gtex_v8_data_nerve_tibial-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:40:42,722 - INFO - Size reduction: 19.20 GB -> 6.69 GB (65.2% smaller) -2025-09-12 01:40:44,106 - DEBUG - Processing file 41/54: gtex_v8_data_ovary-var_pc_log2-all.pkl -2025-09-12 01:40:44,106 - INFO - Processing file: gtex_v8_data_ovary-var_pc_log2-all.pkl -2025-09-12 01:41:18,148 - DEBUG - Loaded data shape: (1353222276, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:41:20,048 - DEBUG - Extracted CCC data shape: (1353222276, 1) -2025-09-12 01:41:20,049 - DEBUG - Saving .parquet file: 
/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_ovary-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:45:04,730 - INFO - Successfully processed gtex_v8_data_ovary-var_pc_log2-all.pkl -> gtex_v8_data_ovary-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:45:04,730 - INFO - Size reduction: 17.65 GB -> 5.56 GB (68.5% smaller) -2025-09-12 01:45:06,016 - DEBUG - Processing file 42/54: gtex_v8_data_pancreas-var_pc_log2-all.pkl -2025-09-12 01:45:06,016 - INFO - Processing file: gtex_v8_data_pancreas-var_pc_log2-all.pkl -2025-09-12 01:46:30,520 - DEBUG - Loaded data shape: (1369711630, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:46:32,480 - DEBUG - Extracted CCC data shape: (1369711630, 1) -2025-09-12 01:46:32,480 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_pancreas-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:50:13,619 - INFO - Successfully processed gtex_v8_data_pancreas-var_pc_log2-all.pkl -> gtex_v8_data_pancreas-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:50:13,619 - INFO - Size reduction: 17.86 GB -> 5.68 GB (68.2% smaller) -2025-09-12 01:50:14,880 - DEBUG - Processing file 43/54: gtex_v8_data_pituitary-var_pc_log2-all.pkl -2025-09-12 01:50:14,880 - INFO - Processing file: gtex_v8_data_pituitary-var_pc_log2-all.pkl -2025-09-12 01:50:55,006 - DEBUG - Loaded data shape: (1418660011, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:50:57,042 - DEBUG - Extracted CCC data shape: (1418660011, 1) -2025-09-12 01:50:57,042 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_pituitary-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:54:43,653 - INFO - Successfully processed gtex_v8_data_pituitary-var_pc_log2-all.pkl -> gtex_v8_data_pituitary-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:54:43,653 - INFO - Size reduction: 18.50 GB -> 6.17 GB (66.6% smaller) -2025-09-12 
01:54:44,998 - DEBUG - Processing file 44/54: gtex_v8_data_prostate-var_pc_log2-all.pkl -2025-09-12 01:54:44,998 - INFO - Processing file: gtex_v8_data_prostate-var_pc_log2-all.pkl -2025-09-12 01:55:19,883 - DEBUG - Loaded data shape: (1395161076, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:55:21,840 - DEBUG - Extracted CCC data shape: (1395161076, 1) -2025-09-12 01:55:21,840 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_prostate-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:59:09,359 - INFO - Successfully processed gtex_v8_data_prostate-var_pc_log2-all.pkl -> gtex_v8_data_prostate-var_pc_log2-all_ccc_only.parquet -2025-09-12 01:59:09,359 - INFO - Size reduction: 18.19 GB -> 5.96 GB (67.2% smaller) -2025-09-12 01:59:10,703 - DEBUG - Processing file 45/54: gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl -2025-09-12 01:59:10,703 - INFO - Processing file: gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl -2025-09-12 01:59:51,917 - DEBUG - Loaded data shape: (1458621066, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 01:59:53,937 - DEBUG - Extracted CCC data shape: (1458621066, 1) -2025-09-12 01:59:53,937 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:03:48,454 - INFO - Successfully processed gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all.pkl -> gtex_v8_data_skin_not_sun_exposed_suprapubic-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:03:48,454 - INFO - Size reduction: 19.02 GB -> 6.55 GB (65.6% smaller) -2025-09-12 02:03:49,884 - DEBUG - Processing file 46/54: gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl -2025-09-12 02:03:49,884 - INFO - Processing file: gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl -2025-09-12 02:05:18,157 - DEBUG - Loaded data shape: 
(1473566328, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:05:20,326 - DEBUG - Extracted CCC data shape: (1473566328, 1) -2025-09-12 02:05:20,327 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:09:19,847 - INFO - Successfully processed gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl -> gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:09:19,847 - INFO - Size reduction: 19.21 GB -> 6.69 GB (65.2% smaller) -2025-09-12 02:09:21,270 - DEBUG - Processing file 47/54: gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl -2025-09-12 02:09:21,270 - INFO - Processing file: gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl -2025-09-12 02:09:59,324 - DEBUG - Loaded data shape: (1353014190, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:10:01,233 - DEBUG - Extracted CCC data shape: (1353014190, 1) -2025-09-12 02:10:01,233 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:13:38,031 - INFO - Successfully processed gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all.pkl -> gtex_v8_data_small_intestine_terminal_ileum-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:13:38,032 - INFO - Size reduction: 17.64 GB -> 5.54 GB (68.6% smaller) -2025-09-12 02:13:39,310 - DEBUG - Processing file 48/54: gtex_v8_data_spleen-var_pc_log2-all.pkl -2025-09-12 02:13:39,310 - INFO - Processing file: gtex_v8_data_spleen-var_pc_log2-all.pkl -2025-09-12 02:14:13,490 - DEBUG - Loaded data shape: (1367095905, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:14:15,383 - DEBUG - Extracted CCC data shape: (1367095905, 1) -2025-09-12 02:14:15,383 - DEBUG - Saving .parquet file: 
/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_spleen-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:17:52,891 - INFO - Successfully processed gtex_v8_data_spleen-var_pc_log2-all.pkl -> gtex_v8_data_spleen-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:17:52,891 - INFO - Size reduction: 17.83 GB -> 5.68 GB (68.1% smaller) -2025-09-12 02:17:54,199 - DEBUG - Processing file 49/54: gtex_v8_data_stomach-var_pc_log2-all.pkl -2025-09-12 02:17:54,199 - INFO - Processing file: gtex_v8_data_stomach-var_pc_log2-all.pkl -2025-09-12 02:18:29,137 - DEBUG - Loaded data shape: (1402248403, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:18:31,077 - DEBUG - Extracted CCC data shape: (1402248403, 1) -2025-09-12 02:18:31,077 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_stomach-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:23:07,195 - INFO - Successfully processed gtex_v8_data_stomach-var_pc_log2-all.pkl -> gtex_v8_data_stomach-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:23:07,195 - INFO - Size reduction: 18.28 GB -> 5.99 GB (67.2% smaller) -2025-09-12 02:23:08,518 - DEBUG - Processing file 50/54: gtex_v8_data_testis-var_pc_log2-all.pkl -2025-09-12 02:23:08,519 - INFO - Processing file: gtex_v8_data_testis-var_pc_log2-all.pkl -2025-09-12 02:23:47,944 - DEBUG - Loaded data shape: (1502917725, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:23:50,000 - DEBUG - Extracted CCC data shape: (1502917725, 1) -2025-09-12 02:23:50,000 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_testis-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:27:48,966 - INFO - Successfully processed gtex_v8_data_testis-var_pc_log2-all.pkl -> gtex_v8_data_testis-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:27:48,966 - INFO - Size reduction: 19.60 GB -> 7.02 GB (64.2% smaller) -2025-09-12 02:27:50,385 - DEBUG - 
Processing file 51/54: gtex_v8_data_thyroid-var_pc_log2-all.pkl -2025-09-12 02:27:50,385 - INFO - Processing file: gtex_v8_data_thyroid-var_pc_log2-all.pkl -2025-09-12 02:28:27,204 - DEBUG - Loaded data shape: (1472317980, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:28:29,203 - DEBUG - Extracted CCC data shape: (1472317980, 1) -2025-09-12 02:28:29,204 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_thyroid-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:32:22,715 - INFO - Successfully processed gtex_v8_data_thyroid-var_pc_log2-all.pkl -> gtex_v8_data_thyroid-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:32:22,715 - INFO - Size reduction: 19.20 GB -> 6.68 GB (65.2% smaller) -2025-09-12 02:32:24,123 - DEBUG - Processing file 52/54: gtex_v8_data_uterus-var_pc_log2-all.pkl -2025-09-12 02:32:24,123 - INFO - Processing file: gtex_v8_data_uterus-var_pc_log2-all.pkl -2025-09-12 02:32:56,654 - DEBUG - Loaded data shape: (1308289128, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:32:58,515 - DEBUG - Extracted CCC data shape: (1308289128, 1) -2025-09-12 02:32:58,515 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_uterus-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:36:26,336 - INFO - Successfully processed gtex_v8_data_uterus-var_pc_log2-all.pkl -> gtex_v8_data_uterus-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:36:26,338 - INFO - Size reduction: 17.06 GB -> 5.16 GB (69.8% smaller) -2025-09-12 02:36:27,576 - DEBUG - Processing file 53/54: gtex_v8_data_vagina-var_pc_log2-all.pkl -2025-09-12 02:36:27,576 - INFO - Processing file: gtex_v8_data_vagina-var_pc_log2-all.pkl -2025-09-12 02:37:09,453 - DEBUG - Loaded data shape: (1328623926, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:37:11,361 - DEBUG - Extracted CCC data shape: (1328623926, 1) -2025-09-12 02:37:11,361 - DEBUG - Saving .parquet file: 
/mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_vagina-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:40:41,189 - INFO - Successfully processed gtex_v8_data_vagina-var_pc_log2-all.pkl -> gtex_v8_data_vagina-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:40:41,189 - INFO - Size reduction: 17.32 GB -> 5.33 GB (69.2% smaller) -2025-09-12 02:40:42,400 - DEBUG - Processing file 54/54: gtex_v8_data_whole_blood-var_pc_log2-all.pkl -2025-09-12 02:40:42,400 - INFO - Processing file: gtex_v8_data_whole_blood-var_pc_log2-all.pkl -2025-09-12 02:41:17,012 - DEBUG - Loaded data shape: (1420258456, 3), columns: ['ccc', 'pearson', 'spearman'] -2025-09-12 02:41:19,005 - DEBUG - Extracted CCC data shape: (1420258456, 1) -2025-09-12 02:41:19,005 - DEBUG - Saving .parquet file: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet/gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:46:00,216 - INFO - Successfully processed gtex_v8_data_whole_blood-var_pc_log2-all.pkl -> gtex_v8_data_whole_blood-var_pc_log2-all_ccc_only.parquet -2025-09-12 02:46:00,216 - INFO - Size reduction: 18.52 GB -> 6.12 GB (66.9% smaller) -2025-09-12 02:46:01,553 - INFO - File processing completed in 3:55:57.451227 -2025-09-12 02:46:01,553 - INFO - Processing complete! 
Successfully processed: 54/54 files -2025-09-12 02:46:01,553 - INFO - Total execution time: 3:55:57.451335 -2025-09-12 02:46:01,553 - INFO - Output directory: /mnt/data/proj_data/ccc-gpu/data/gtex/ccc_similarity_matrices_parquet -2025-09-12 02:46:01,553 - INFO - Log file: /home/haoyu/_database/projs/ccc-gpu/nbs/03-manuscript/40_prepare_supp_data/logs/process_ccc_data_20250911_225004.log diff --git a/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py b/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py new file mode 100755 index 00000000..bb729d39 --- /dev/null +++ b/nbs/03-manuscript/40_prepare_supp_data/process_ccc_to_duckdb.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python3 +""" +Convert GTEx CCC correlation data from pickle format to DuckDB format for efficient storage and fast queries. + +This script processes all .pkl files containing gene correlation data and creates optimized +DuckDB databases that provide: +- Fast random access to gene pairs (sub-millisecond queries) +- Significantly reduced storage size +- SQL query capabilities +- Minimal memory usage for queries +""" + +import argparse +import logging +import sys +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import gc + +import pandas as pd +import duckdb +import numpy as np +from tqdm import tqdm + + +def setup_logging(debug: bool = False) -> str: + """Set up logging with timestamped log file.""" + script_dir = Path(__file__).parent + logs_dir = script_dir / "logs" + logs_dir.mkdir(exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_filename = logs_dir / f"process_ccc_to_duckdb_{timestamp}.log" + + logging.basicConfig( + level=logging.DEBUG if debug else logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(log_filename), + logging.StreamHandler(sys.stdout) + ] + ) + + logger = logging.getLogger(__name__) + logger.info(f"Starting CCC to DuckDB 
conversion") + logger.info(f"Log file: {log_filename}") + + return str(log_filename) + + +def convert_pickle_to_duckdb( + pkl_file: Path, + output_dir: Path, + single_db: bool = False, + db_con: Optional[duckdb.DuckDBPyConnection] = None, + chunk_size: int = 10_000_000 +) -> Dict: + """ + Convert a single pickle file to DuckDB format. + + Args: + pkl_file: Path to input pickle file + output_dir: Directory for output database files + single_db: If True, append to single database (db_con must be provided) + db_con: Existing DuckDB connection (if single_db is True) + chunk_size: Number of rows to process at once + + Returns: + Dictionary with conversion statistics + """ + logger = logging.getLogger(__name__) + stats = {} + start_time = datetime.now() + + # Get tissue name from filename + tissue_name = pkl_file.stem.replace('gtex_v8_data_', '').replace('-var_pc_log2-all', '') + stats['tissue'] = tissue_name + stats['input_file'] = pkl_file.name + + logger.info(f"Processing: {pkl_file.name}") + + try: + # Load pickle file + logger.info(f"Loading pickle file...") + load_start = datetime.now() + df = pd.read_pickle(pkl_file) + load_time = (datetime.now() - load_start).total_seconds() + + stats['input_rows'] = len(df) + stats['input_size_gb'] = pkl_file.stat().st_size / (1024**3) + logger.info(f"Loaded {len(df):,} rows in {load_time:.1f}s ({stats['input_size_gb']:.2f} GB)") + + # Extract only CCC column and reset index + logger.info("Preparing data...") + df_ccc = df[['ccc']].reset_index() + df_ccc.columns = ['gene1', 'gene2', 'ccc'] + + # Convert to appropriate types + df_ccc['ccc'] = df_ccc['ccc'].astype('float32') + df_ccc['gene1'] = df_ccc['gene1'].astype(str) + df_ccc['gene2'] = df_ccc['gene2'].astype(str) + + # Clean up original dataframe to free memory + del df + gc.collect() + + # Create or connect to database + if single_db: + con = db_con + table_name = f"ccc_{tissue_name}" + else: + db_file = output_dir / f"{tissue_name}_ccc.duckdb" + con = 
duckdb.connect(str(db_file)) + table_name = "ccc_data" + + # Create table + logger.info(f"Creating table {table_name}...") + + if single_db: + # For single database, include tissue in the table + con.execute(f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + gene1 VARCHAR NOT NULL, + gene2 VARCHAR NOT NULL, + ccc REAL NOT NULL, + PRIMARY KEY (gene1, gene2) + ) + """) + else: + con.execute(f""" + CREATE TABLE {table_name} ( + gene1 VARCHAR NOT NULL, + gene2 VARCHAR NOT NULL, + ccc REAL NOT NULL, + PRIMARY KEY (gene1, gene2) + ) + """) + + # Insert data efficiently + logger.info(f"Inserting {len(df_ccc):,} rows...") + insert_start = datetime.now() + + # Use DuckDB's register for bulk insert + con.register('df_temp', df_ccc) + con.execute(f"INSERT INTO {table_name} SELECT * FROM df_temp") + con.unregister('df_temp') + + insert_time = (datetime.now() - insert_start).total_seconds() + stats['insert_time'] = insert_time + logger.info(f"Data inserted in {insert_time:.1f}s") + + # Clean up dataframe + del df_ccc + gc.collect() + + # Create indexes for faster lookups + logger.info("Creating indexes...") + index_start = datetime.now() + + # Create index on gene2 for reverse lookups + con.execute(f"CREATE INDEX idx_{table_name}_gene2 ON {table_name}(gene2)") + + # Create index on ccc for range queries + con.execute(f"CREATE INDEX idx_{table_name}_ccc ON {table_name}(ccc)") + + # Analyze table for query optimization + con.execute(f"ANALYZE {table_name}") + + index_time = (datetime.now() - index_start).total_seconds() + stats['index_time'] = index_time + + # Get final statistics + result = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone() + stats['output_rows'] = result[0] + + if not single_db: + # Close connection and get file size + con.close() + stats['output_size_gb'] = (output_dir / f"{tissue_name}_ccc.duckdb").stat().st_size / (1024**3) + + stats['total_time'] = (datetime.now() - start_time).total_seconds() + + if 'output_size_gb' in stats: + 
stats['compression_ratio'] = stats['input_size_gb'] / stats['output_size_gb'] + logger.info(f"Completed: {stats['input_size_gb']:.2f} GB -> {stats['output_size_gb']:.2f} GB " + f"(compression: {stats['compression_ratio']:.1f}x)") + + logger.info(f"Total time: {stats['total_time']:.1f}s") + + except Exception as e: + logger.error(f"Error processing {pkl_file.name}: {e}") + stats['error'] = str(e) + if not single_db and 'con' in locals(): + con.close() + + return stats + + +def create_consolidated_database( + pkl_files: List[Path], + output_dir: Path +) -> Dict: + """ + Create a single consolidated DuckDB database with all tissues. + + Args: + pkl_files: List of pickle files to process + output_dir: Directory for output database + + Returns: + Dictionary with overall statistics + """ + logger = logging.getLogger(__name__) + + db_file = output_dir / "all_tissues_ccc.duckdb" + logger.info(f"Creating consolidated database: {db_file}") + + con = duckdb.connect(str(db_file)) + all_stats = [] + + try: + # Create master table for tissue metadata + con.execute(""" + CREATE TABLE tissues ( + tissue_id INTEGER PRIMARY KEY, + tissue_name VARCHAR UNIQUE NOT NULL, + num_pairs BIGINT, + min_ccc REAL, + max_ccc REAL, + mean_ccc REAL + ) + """) + + tissue_id = 1 + + for pkl_file in tqdm(pkl_files, desc="Processing tissues"): + stats = convert_pickle_to_duckdb( + pkl_file=pkl_file, + output_dir=output_dir, + single_db=True, + db_con=con + ) + + if 'error' not in stats: + # Add tissue metadata + tissue_name = stats['tissue'] + table_name = f"ccc_{tissue_name}" + + tissue_stats = con.execute(f""" + SELECT + COUNT(*) as num_pairs, + MIN(ccc) as min_ccc, + MAX(ccc) as max_ccc, + AVG(ccc) as mean_ccc + FROM {table_name} + """).fetchone() + + con.execute(""" + INSERT INTO tissues (tissue_id, tissue_name, num_pairs, min_ccc, max_ccc, mean_ccc) + VALUES (?, ?, ?, ?, ?, ?) 
+ """, [tissue_id, tissue_name, *tissue_stats]) + + tissue_id += 1 + + all_stats.append(stats) + + # Create a view for easy cross-tissue queries + logger.info("Creating cross-tissue query views...") + + # Get list of all tissue tables + tissue_tables = con.execute(""" + SELECT 'ccc_' || tissue_name as table_name, tissue_name + FROM tissues + """).fetchall() + + # Create UNION ALL view for searching across all tissues + union_parts = [] + for table_name, tissue_name in tissue_tables: + union_parts.append(f""" + SELECT '{tissue_name}' as tissue, gene1, gene2, ccc + FROM {table_name} + """) + + if union_parts: + union_query = " UNION ALL ".join(union_parts) + con.execute(f""" + CREATE VIEW all_correlations AS + {union_query} + """) + + logger.info("Created all_correlations view for cross-tissue queries") + + # Optimize database + logger.info("Optimizing database...") + con.execute("PRAGMA optimize") + + # Get final database size + con.close() + + db_size = db_file.stat().st_size / (1024**3) + logger.info(f"Consolidated database size: {db_size:.2f} GB") + + return { + 'database': str(db_file), + 'tissues_processed': len([s for s in all_stats if 'error' not in s]), + 'tissues_failed': len([s for s in all_stats if 'error' in s]), + 'total_size_gb': db_size, + 'stats': all_stats + } + + except Exception as e: + logger.error(f"Error creating consolidated database: {e}") + con.close() + raise + + +def main(): + parser = argparse.ArgumentParser( + description="Convert GTEx CCC data from pickle to DuckDB format" + ) + parser.add_argument( + "--source-dir", + type=str, + default="/mnt/data/proj_data/ccc-gpu/data/gtex/similarity_matrices/all", + help="Source directory containing .pkl files" + ) + parser.add_argument( + "--output-dir", + type=str, + default="/mnt/data/proj_data/ccc-gpu/manuscript_data/supplementary_data/ccc_duckdb", + help="Output directory for DuckDB files" + ) + parser.add_argument( + "--single-db", + action="store_true", + help="Create a single consolidated 
database instead of one per tissue" + ) + parser.add_argument( + "--tissues", + nargs="+", + help="Specific tissues to process (default: all)" + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be processed without doing it" + ) + parser.add_argument( + "--debug", + action="store_true", + help="Enable debug logging" + ) + + args = parser.parse_args() + + # Setup logging + log_file = setup_logging(debug=args.debug) + logger = logging.getLogger(__name__) + + # Convert paths + source_dir = Path(args.source_dir) + output_dir = Path(args.output_dir) + + logger.info(f"Configuration:") + logger.info(f" Source: {source_dir}") + logger.info(f" Output: {output_dir}") + logger.info(f" Single DB: {args.single_db}") + + # Check source directory + if not source_dir.exists(): + logger.error(f"Source directory not found: {source_dir}") + sys.exit(1) + + # Get list of pickle files + pkl_files = sorted(source_dir.glob("*.pkl")) + + # Filter by specific tissues if requested + if args.tissues: + filtered = [] + for tissue in args.tissues: + matching = [f for f in pkl_files if tissue in f.name] + filtered.extend(matching) + pkl_files = filtered + + if not pkl_files: + logger.error("No pickle files found to process") + sys.exit(1) + + logger.info(f"Found {len(pkl_files)} files to process") + + if args.dry_run: + print("\nFiles that would be processed:") + for f in pkl_files: + size_gb = f.stat().st_size / (1024**3) + print(f" {f.name} ({size_gb:.2f} GB)") + print(f"\nOutput would be written to: {output_dir}") + return + + # Create output directory + output_dir.mkdir(parents=True, exist_ok=True) + + # Process files + start_time = datetime.now() + + if args.single_db: + # Create single consolidated database + results = create_consolidated_database(pkl_files, output_dir) + + print(f"\n{'='*60}") + print("PROCESSING COMPLETE") + print(f"{'='*60}") + print(f"Database: {results['database']}") + print(f"Tissues processed: {results['tissues_processed']}") + 
print(f"Tissues failed: {results['tissues_failed']}") + print(f"Total size: {results['total_size_gb']:.2f} GB") + + else: + # Create individual databases + all_stats = [] + + for pkl_file in tqdm(pkl_files, desc="Processing files"): + stats = convert_pickle_to_duckdb( + pkl_file=pkl_file, + output_dir=output_dir, + single_db=False + ) + all_stats.append(stats) + + # Summary + successful = [s for s in all_stats if 'error' not in s] + failed = [s for s in all_stats if 'error' in s] + + print(f"\n{'='*60}") + print("PROCESSING COMPLETE") + print(f"{'='*60}") + print(f"Files processed: {len(successful)}/{len(pkl_files)}") + + if successful: + total_input = sum(s['input_size_gb'] for s in successful) + total_output = sum(s.get('output_size_gb', 0) for s in successful) + avg_compression = total_input / total_output if total_output > 0 else 0 + + print(f"Total input size: {total_input:.2f} GB") + print(f"Total output size: {total_output:.2f} GB") + print(f"Average compression: {avg_compression:.1f}x") + + if failed: + print(f"\nFailed files ({len(failed)}):") + for s in failed: + print(f" {s['input_file']}: {s['error']}") + + total_time = (datetime.now() - start_time).total_seconds() + print(f"\nTotal processing time: {total_time/60:.1f} minutes") + print(f"Log file: {log_file}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb b/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb new file mode 100644 index 00000000..68f2b129 --- /dev/null +++ b/nbs/99-tutorials/05-walkthrough-with-gtex-data.ipynb @@ -0,0 +1,4056 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Description\n", + "This notebook demonstrates:\n", + "\n", + "1. how to compute coefficients values\n", + "2. how to correlate gene expression data with categorical metadata\n", + "\n", + "using CCC GPU with public data from GTEx v8." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please follow the instructions in the [README](../../README.md), section \"Quick Install with pip\" to install CCC-GPU with a conda environment `ccc-gpu-env`.\n", + "\n", + "Then activate the environment and start the jupyter notebook server in order to run this notebook.\n", + "\n", + "```bash\n", + "conda activate ccc-gpu-env\n", + "pip install notebook\n", + "jupyter notebook\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import pandas as pd\n", + "import urllib.request\n", + "from tqdm import tqdm\n", + "from pathlib import Path\n", + "\n", + "from ccc.utils import simplify_string\n", + "from ccc import conf" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Set this path to the directory where you want to save the intermediate data and results\n", + "ANALYSIS_DIR = Path(\"/mnt/data/proj_data/ccc-gpu/data/tutorial\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Fetching and Preprocessing\n", + "This section downloads:\n", + "1. the public GTEx v8 gene TPMs data (https://www.gtexportal.org/home/downloads/adult-gtex/bulk_tissue_expression)\n", + "2. the GTEx sample attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)\n", + "3. the GTEx subject attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)\n", + "\n", + "and perform preprocessing to prepare the data for the analysis." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download GTEx v8 gene expression data and split by tissue" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gtex_all_sample_ids_with_expr_data already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz\n", + "gtex_sample_attrs already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt\n", + "Downloading gtex_subject_attrs to /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt\n", + "Download completed!\n" + ] + } + ], + "source": [ + "# Create analysis directory if it doesn't exist\n", + "os.makedirs(ANALYSIS_DIR, exist_ok=True)\n", + "\n", + "# Define files to download\n", + "files_to_download = {\n", + " \"gtex_all_sample_ids_with_expr_data\": \"https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz\",\n", + " \"gtex_sample_attrs\": \"https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt\",\n", + " \"gtex_subject_attrs\": \"https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt\"\n", + "}\n", + "\n", + "# Dictionary to store file paths\n", + "file_paths = {}\n", + "\n", + "# Download files\n", + "for var_name, url in files_to_download.items():\n", + " filename = Path(url).name\n", + " file_path = Path(ANALYSIS_DIR) / filename\n", + " file_paths[var_name] = file_path\n", + " \n", + " if not file_path.exists():\n", + " print(f\"Downloading {var_name} to {file_path}\")\n", + " urllib.request.urlretrieve(url, file_path)\n", + " print(\"Download completed!\")\n", + " else:\n", + " print(f\"{var_name} already exists at 
{file_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GTEx sample attributes shape: (22951, 63)\n", + "GTEx sample attributes columns: Index(['SAMPID', 'SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'SMRIN', 'SMTS', 'SMTSD',\n", + " 'SMUBRID', 'SMTSISCH', 'SMTSPAX', 'SMNABTCH', 'SMNABTCHT', 'SMNABTCHD',\n", + " 'SMGEBTCH', 'SMGEBTCHD', 'SMGEBTCHT', 'SMAFRZE', 'SMGTC', 'SME2MPRT',\n", + " 'SMCHMPRS', 'SMNTRART', 'SMNUMGPS', 'SMMAPRT', 'SMEXNCRT', 'SM550NRM',\n", + " 'SMGNSDTC', 'SMUNMPRT', 'SM350NRM', 'SMRDLGTH', 'SMMNCPB', 'SME1MMRT',\n", + " 'SMSFLGTH', 'SMESTLBS', 'SMMPPD', 'SMNTERRT', 'SMRRNANM', 'SMRDTTL',\n", + " 'SMVQCFL', 'SMMNCV', 'SMTRSCPT', 'SMMPPDPR', 'SMCGLGTH', 'SMGAPPCT',\n", + " 'SMUNPDRD', 'SMNTRNRT', 'SMMPUNRT', 'SMEXPEFF', 'SMMPPDUN', 'SME2MMRT',\n", + " 'SME2ANTI', 'SMALTALG', 'SME2SNSE', 'SMMFLGTH', 'SME1ANTI', 'SMSPLTRD',\n", + " 'SMBSMMRT', 'SME1SNSE', 'SME1PCTS', 'SMRRNART', 'SME1MPRT', 'SMNUM5CD',\n", + " 'SMDPMPRT', 'SME2PCTS'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "gtex_sample_attrs = pd.read_csv(file_paths[\"gtex_sample_attrs\"], sep=\"\\t\")\n", + "print(f\"GTEx sample attributes shape: {gtex_sample_attrs.shape}\")\n", + "print(f\"GTEx sample attributes columns: {gtex_sample_attrs.columns}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "55\n", + "['Whole Blood' 'Brain - Frontal Cortex (BA9)' 'Adipose - Subcutaneous'\n", + " 'Muscle - Skeletal' 'Artery - Tibial' 'Artery - Coronary'\n", + " 'Heart - Atrial Appendage' 'Adipose - Visceral (Omentum)' 'Ovary'\n", + " 'Uterus' 'Vagina' 'Breast - Mammary Tissue'\n", + " 'Skin - Not Sun Exposed (Suprapubic)' 'Minor Salivary Gland'\n", + " 'Brain - Cortex' 'Adrenal Gland' 'Thyroid' 'Lung' 'Spleen' 'Pancreas'\n", + " 'Esophagus - Muscularis' 'Esophagus - 
Mucosa'\n", + " 'Esophagus - Gastroesophageal Junction' 'Stomach' 'Colon - Sigmoid'\n", + " 'Small Intestine - Terminal Ileum' 'Colon - Transverse' 'Prostate'\n", + " 'Testis' 'Skin - Sun Exposed (Lower leg)' 'Nerve - Tibial'\n", + " 'Heart - Left Ventricle' 'Pituitary' 'Brain - Cerebellum'\n", + " 'Cells - Cultured fibroblasts' 'Artery - Aorta'\n", + " 'Cells - EBV-transformed lymphocytes' 'Brain - Cerebellar Hemisphere'\n", + " 'Brain - Caudate (basal ganglia)'\n", + " 'Brain - Nucleus accumbens (basal ganglia)'\n", + " 'Brain - Putamen (basal ganglia)' 'Brain - Hypothalamus'\n", + " 'Brain - Spinal cord (cervical c-1)' 'Liver' 'Brain - Hippocampus'\n", + " 'Brain - Anterior cingulate cortex (BA24)' 'Brain - Substantia nigra'\n", + " 'Kidney - Cortex' 'Brain - Amygdala' 'Cervix - Ectocervix'\n", + " 'Fallopian Tube' 'Cervix - Endocervix' 'Bladder' 'Kidney - Medulla'\n", + " 'Cells - Leukemia cell line (CML)']\n" + ] + } + ], + "source": [ + "# Get tissue names\n", + "gtex_tissues = gtex_sample_attrs[\"SMTSD\"].unique()\n", + "print(len(gtex_tissues))\n", + "print(gtex_tissues)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get sample IDs for each tissue" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of samples with expression data: 17382\n", + "Sample IDs with expression data: ['GTEX-1HFI7-2426-SM-B2LXV', 'GTEX-11TTK-0226-SM-5N9EC', 'GTEX-11UD2-1226-SM-5EQMI', 'GTEX-X4EO-0006-SM-3P5ZF', 'GTEX-13O21-0326-SM-5J1N9', 'GTEX-XBED-1526-SM-4AT5W', 'GTEX-13NZ8-0011-R8b-SM-5KM48', 'GTEX-1H3O1-0005-SM-ACKV8', 'GTEX-13JVG-0011-R5a-SM-5MR4O', 'GTEX-1F88F-1126-SM-7MKHL']\n" + ] + } + ], + "source": [ + "# first, get all sample IDs with expression data\n", + "gtex_all_sample_ids_with_expr_data = set(\n", + " pd.read_csv(\n", + " file_paths[\"gtex_all_sample_ids_with_expr_data\"],\n", + " sep=\"\\t\",\n", + " skiprows=2,\n", 
+ " nrows=1,\n", + " usecols=lambda x: x not in (\"Name\", \"Description\"),\n", + " ).columns\n", + ")\n", + "\n", + "print(f\"Number of samples with expression data: {len(gtex_all_sample_ids_with_expr_data)}\")\n", + "print(f\"Sample IDs with expression data: {list(gtex_all_sample_ids_with_expr_data)[:10]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# get sample IDs by tissue\n", + "sample_ids_by_tissue = {\n", + " tissue_name: sorted(\n", + " list(\n", + " gtex_all_sample_ids_with_expr_data.intersection(\n", + " set(\n", + " gtex_sample_attrs[gtex_sample_attrs[\"SMTSD\"] == tissue_name][\n", + " \"SAMPID\"\n", + " ].tolist()\n", + " )\n", + " )\n", + " )\n", + " )\n", + " for tissue_name in gtex_tissues\n", + "}\n", + "\n", + "assert len(gtex_tissues) == len(sample_ids_by_tissue)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['GTEX-111YS-0006-SM-5NQBE',\n", + " 'GTEX-1122O-0005-SM-5O99J',\n", + " 'GTEX-1128S-0005-SM-5P9HI',\n", + " 'GTEX-113IC-0006-SM-5NQ9C',\n", + " 'GTEX-113JC-0006-SM-5O997',\n", + " 'GTEX-117XS-0005-SM-5PNU6',\n", + " 'GTEX-117YW-0005-SM-5NQ8Z',\n", + " 'GTEX-1192W-0005-SM-5NQBQ',\n", + " 'GTEX-1192X-0005-SM-5NQC3',\n", + " 'GTEX-11DXW-0006-SM-5NQ7Y']" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_ids_by_tissue[\"Whole Blood\"][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure all IDs are unique\n", + "assert all(\n", + " [\n", + " len(sample_ids_by_tissue[tissue_name])\n", + " == len(set(sample_ids_by_tissue[tissue_name]))\n", + " for tissue_name in sample_ids_by_tissue.keys()\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"#### Show sample size by tissue" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tissuesample_size
3Muscle - Skeletal803
0Whole Blood755
29Skin - Sun Exposed (Lower leg)701
4Artery - Tibial663
2Adipose - Subcutaneous663
16Thyroid653
30Nerve - Tibial619
12Skin - Not Sun Exposed (Suprapubic)604
17Lung578
21Esophagus - Mucosa555
7Adipose - Visceral (Omentum)541
20Esophagus - Muscularis515
34Cells - Cultured fibroblasts504
11Breast - Mammary Tissue459
31Heart - Left Ventricle432
35Artery - Aorta432
6Heart - Atrial Appendage429
26Colon - Transverse406
22Esophagus - Gastroesophageal Junction375
24Colon - Sigmoid373
28Testis361
23Stomach359
19Pancreas328
32Pituitary283
15Adrenal Gland258
14Brain - Cortex255
38Brain - Caudate (basal ganglia)246
39Brain - Nucleus accumbens (basal ganglia)246
27Prostate245
18Spleen241
33Brain - Cerebellum241
5Artery - Coronary240
43Liver226
37Brain - Cerebellar Hemisphere215
1Brain - Frontal Cortex (BA9)209
40Brain - Putamen (basal ganglia)205
41Brain - Hypothalamus202
44Brain - Hippocampus197
25Small Intestine - Terminal Ileum187
8Ovary180
45Brain - Anterior cingulate cortex (BA24)176
36Cells - EBV-transformed lymphocytes174
13Minor Salivary Gland162
42Brain - Spinal cord (cervical c-1)159
10Vagina156
48Brain - Amygdala152
9Uterus142
46Brain - Substantia nigra139
47Kidney - Cortex85
52Bladder21
51Cervix - Endocervix10
50Fallopian Tube9
49Cervix - Ectocervix9
53Kidney - Medulla4
54Cells - Leukemia cell line (CML)0
\n", + "
" + ], + "text/plain": [ + " tissue sample_size\n", + "3 Muscle - Skeletal 803\n", + "0 Whole Blood 755\n", + "29 Skin - Sun Exposed (Lower leg) 701\n", + "4 Artery - Tibial 663\n", + "2 Adipose - Subcutaneous 663\n", + "16 Thyroid 653\n", + "30 Nerve - Tibial 619\n", + "12 Skin - Not Sun Exposed (Suprapubic) 604\n", + "17 Lung 578\n", + "21 Esophagus - Mucosa 555\n", + "7 Adipose - Visceral (Omentum) 541\n", + "20 Esophagus - Muscularis 515\n", + "34 Cells - Cultured fibroblasts 504\n", + "11 Breast - Mammary Tissue 459\n", + "31 Heart - Left Ventricle 432\n", + "35 Artery - Aorta 432\n", + "6 Heart - Atrial Appendage 429\n", + "26 Colon - Transverse 406\n", + "22 Esophagus - Gastroesophageal Junction 375\n", + "24 Colon - Sigmoid 373\n", + "28 Testis 361\n", + "23 Stomach 359\n", + "19 Pancreas 328\n", + "32 Pituitary 283\n", + "15 Adrenal Gland 258\n", + "14 Brain - Cortex 255\n", + "38 Brain - Caudate (basal ganglia) 246\n", + "39 Brain - Nucleus accumbens (basal ganglia) 246\n", + "27 Prostate 245\n", + "18 Spleen 241\n", + "33 Brain - Cerebellum 241\n", + "5 Artery - Coronary 240\n", + "43 Liver 226\n", + "37 Brain - Cerebellar Hemisphere 215\n", + "1 Brain - Frontal Cortex (BA9) 209\n", + "40 Brain - Putamen (basal ganglia) 205\n", + "41 Brain - Hypothalamus 202\n", + "44 Brain - Hippocampus 197\n", + "25 Small Intestine - Terminal Ileum 187\n", + "8 Ovary 180\n", + "45 Brain - Anterior cingulate cortex (BA24) 176\n", + "36 Cells - EBV-transformed lymphocytes 174\n", + "13 Minor Salivary Gland 162\n", + "42 Brain - Spinal cord (cervical c-1) 159\n", + "10 Vagina 156\n", + "48 Brain - Amygdala 152\n", + "9 Uterus 142\n", + "46 Brain - Substantia nigra 139\n", + "47 Kidney - Cortex 85\n", + "52 Bladder 21\n", + "51 Cervix - Endocervix 10\n", + "50 Fallopian Tube 9\n", + "49 Cervix - Ectocervix 9\n", + "53 Kidney - Medulla 4\n", + "54 Cells - Leukemia cell line (CML) 0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + 
"tissue_sample_size = pd.DataFrame(\n", + " [{\"tissue\": k, \"sample_size\": len(v)} for k, v in sample_ids_by_tissue.items()]\n", + ")\n", + "\n", + "tissue_sample_size = tissue_sample_size.sort_values(\"sample_size\", ascending=False)\n", + "display(tissue_sample_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# Simple validations\n", + "_tmp = tissue_sample_size.set_index(\"tissue\").squeeze()\n", + "assert _tmp.loc[\"Muscle - Skeletal\"] == 803\n", + "assert _tmp.loc[\"Whole Blood\"] == 755\n", + "assert _tmp.loc[\"Skin - Not Sun Exposed (Suprapubic)\"] == 604\n", + "assert _tmp.loc[\"Kidney - Medulla\"] == 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These numbers match those you can find here: https://gtexportal.org/home/tissueSummaryPage#sampleCountsPerTissue" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split expression data by tissue" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cells - Leukemia cell line (CML): 100%|█████████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 4357.51it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping Muscle - Skeletal - file already exists\n", + "Skipping Whole Blood - file already exists\n", + "Skipping Skin - Sun Exposed (Lower leg) - file already exists\n", + "Skipping Artery - Tibial - file already exists\n", + "Skipping Adipose - Subcutaneous - file already exists\n", + "Skipping Thyroid - file already exists\n", + "Skipping Nerve - Tibial - file already exists\n", + "Skipping Skin - Not Sun Exposed (Suprapubic) - file already exists\n", + "Skipping Lung - file already exists\n", + "Skipping Esophagus - Mucosa - file already exists\n", + "Skipping Adipose - Visceral (Omentum) - file 
already exists\n", + "Skipping Esophagus - Muscularis - file already exists\n", + "Skipping Cells - Cultured fibroblasts - file already exists\n", + "Skipping Breast - Mammary Tissue - file already exists\n", + "Skipping Heart - Left Ventricle - file already exists\n", + "Skipping Artery - Aorta - file already exists\n", + "Skipping Heart - Atrial Appendage - file already exists\n", + "Skipping Colon - Transverse - file already exists\n", + "Skipping Esophagus - Gastroesophageal Junction - file already exists\n", + "Skipping Colon - Sigmoid - file already exists\n", + "Skipping Testis - file already exists\n", + "Skipping Stomach - file already exists\n", + "Skipping Pancreas - file already exists\n", + "Skipping Pituitary - file already exists\n", + "Skipping Adrenal Gland - file already exists\n", + "Skipping Brain - Cortex - file already exists\n", + "Skipping Brain - Caudate (basal ganglia) - file already exists\n", + "Skipping Brain - Nucleus accumbens (basal ganglia) - file already exists\n", + "Skipping Prostate - file already exists\n", + "Skipping Spleen - file already exists\n", + "Skipping Brain - Cerebellum - file already exists\n", + "Skipping Artery - Coronary - file already exists\n", + "Skipping Liver - file already exists\n", + "Skipping Brain - Cerebellar Hemisphere - file already exists\n", + "Skipping Brain - Frontal Cortex (BA9) - file already exists\n", + "Skipping Brain - Putamen (basal ganglia) - file already exists\n", + "Skipping Brain - Hypothalamus - file already exists\n", + "Skipping Brain - Hippocampus - file already exists\n", + "Skipping Small Intestine - Terminal Ileum - file already exists\n", + "Skipping Ovary - file already exists\n", + "Skipping Brain - Anterior cingulate cortex (BA24) - file already exists\n", + "Skipping Cells - EBV-transformed lymphocytes - file already exists\n", + "Skipping Minor Salivary Gland - file already exists\n", + "Skipping Brain - Spinal cord (cervical c-1) - file already exists\n", + "Skipping 
Vagina - file already exists\n", + "Skipping Brain - Amygdala - file already exists\n", + "Skipping Uterus - file already exists\n", + "Skipping Brain - Substantia nigra - file already exists\n", + "Skipping Kidney - Cortex - file already exists\n", + "Skipping Bladder - file already exists\n", + "Skipping Cervix - Endocervix - file already exists\n", + "Skipping Fallopian Tube - file already exists\n", + "Skipping Cervix - Ectocervix - file already exists\n", + "Skipping Kidney - Medulla - file already exists\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "TISSUE_DATA_DIR = ANALYSIS_DIR / \"data_by_tissue\"\n", + "TISSUE_DATA_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "pbar = tqdm(tissue_sample_size[\"tissue\"])\n", + "\n", + "gene_id_symbol_map_tuples = set()\n", + "\n", + "for tissue_name in pbar:\n", + " pbar.set_description(tissue_name)\n", + "\n", + " tissue_ids = sample_ids_by_tissue[tissue_name]\n", + " if len(tissue_ids) == 0:\n", + " continue\n", + "\n", + " # Generate output filename\n", + " tissue_name_simple = simplify_string(simplify_string(tissue_name.lower()))\n", + " output_file = TISSUE_DATA_DIR / f\"gtex_v8_data_{tissue_name_simple}.pkl\"\n", + " output_gene_mappings = ANALYSIS_DIR / \"gtex_gene_id_symbol_mappings.pkl\"\n", + " \n", + " # Skip if file already exists\n", + " if output_file.exists() and output_gene_mappings.exists():\n", + " print(f\"Skipping {tissue_name} - file already exists\")\n", + " continue\n", + "\n", + " try:\n", + " tissue_data = pd.read_csv(\n", + " file_paths[\"gtex_all_sample_ids_with_expr_data\"],\n", + " sep=\"\\t\",\n", + " skiprows=2,\n", + " usecols=[\"Name\", \"Description\"] + tissue_ids,\n", + " )\n", + "\n", + " tissue_data = tissue_data.rename(\n", + " columns={\n", + " \"Name\": \"gene_ens_id\",\n", + " \"Description\": \"gene_symbol\",\n", + " }\n", + " )\n", + "\n", + " # Validate data before processing\n", + " if 
tissue_data.empty:\n", + " print(f\"Warning: No data found for {tissue_name}\")\n", + " continue\n", + "\n", + " # add gene id / gene symbol to mapping variable\n", + " gene_id_symbol_map_tuples.update(\n", + " tissue_data[[\"gene_ens_id\", \"gene_symbol\"]].itertuples(index=False)\n", + " )\n", + "\n", + " tissue_data = tissue_data.drop(columns=[\"gene_symbol\"]).set_index(\"gene_ens_id\")\n", + "\n", + " # Data quality checks\n", + " assert not tissue_data.isna().any().any(), f\"NaN values found in {tissue_name}\"\n", + " assert tissue_data.index.is_unique, f\"Non-unique gene IDs in {tissue_name}\"\n", + " assert tissue_data.columns.is_unique, f\"Non-unique sample IDs in {tissue_name}\"\n", + "\n", + " # save\n", + " tissue_data.to_pickle(path=output_file)\n", + " \n", + " except Exception as e:\n", + " print(f\"Error processing {tissue_name}: {str(e)}\")\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# Simple validations\n", + "_tmp = pd.read_pickle(TISSUE_DATA_DIR / \"gtex_v8_data_brain_cerebellar_hemisphere.pkl\")\n", + "\n", + "assert \"GTEX-11DXY-0011-R11a-SM-DNZZN\" in _tmp.columns\n", + "assert \"GTEX-WL46-0011-R11A-SM-3MJFT\" in _tmp.columns\n", + "assert \"GTEX-ZF28-0011-R11a-SM-4WWEI\" in _tmp.columns\n", + "\n", + "_v = _tmp.loc[\"ENSG00000223972.5\", \"GTEX-11DXY-0011-R11a-SM-DNZZN\"]\n", + "assert _v == 0.04045, _v\n", + "_v = _tmp.loc[\"ENSG00000278267.1\", \"GTEX-11DXY-0011-R11a-SM-DNZZN\"]\n", + "assert _v == 0.0, _v\n", + "\n", + "_v = _tmp.loc[\"ENSG00000233327.10\", \"GTEX-WL46-0011-R11A-SM-3MJFT\"]\n", + "assert _v == 146.4000, _v\n", + "_v = _tmp.loc[\"ENSG00000237118.2\", \"GTEX-WL46-0011-R11A-SM-3MJFT\"]\n", + "assert _v == 0.3357, _v\n", + "\n", + "_v = _tmp.loc[\"ENSG00000233327.10\", \"GTEX-ZF28-0011-R11a-SM-4WWEI\"]\n", + "assert _v == 30.7200, _v\n", + "_v = _tmp.loc[\"ENSG00000186907.7\", \"GTEX-ZF28-0011-R11a-SM-4WWEI\"]\n", + "assert _v == 0.94720, 
_v" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save gene mappings" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded existing gene mappings from /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n", + "gene_mappings.shape: (56200, 2)\n", + " gene_ens_id gene_symbol\n", + "0 ENSG00000144278.14 GALNT13\n", + "1 ENSG00000260976.1 LINC01633\n", + "2 ENSG00000186660.14 ZFP91\n", + "3 ENSG00000123560.13 PLP1\n", + "4 ENSG00000227371.1 RP11-3L10.2\n" + ] + } + ], + "source": [ + "output_gene_mappings = ANALYSIS_DIR / \"gtex_gene_id_symbol_mappings.pkl\"\n", + "\n", + "if output_gene_mappings.exists():\n", + " gene_mappings = pd.read_pickle(output_gene_mappings)\n", + " print(f\"Loaded existing gene mappings from {output_gene_mappings}\")\n", + "else:\n", + " gene_mappings = pd.DataFrame(gene_id_symbol_map_tuples)\n", + " gene_mappings.to_pickle(output_gene_mappings)\n", + " print(f\"Created and saved gene mappings to {output_gene_mappings}\")\n", + "\n", + "print(f\"gene_mappings.shape: {gene_mappings.shape}\")\n", + "print(gene_mappings.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# Simple validations\n", + "# no null\n", + "assert gene_mappings.dropna(how=\"any\").shape == gene_mappings.shape\n", + "# no duplicates\n", + "assert gene_mappings.drop_duplicates().shape == gene_mappings.shape\n", + "\n", + "_tmp = gene_mappings.set_index(\"gene_ens_id\").squeeze()\n", + "assert _tmp.loc[\"ENSG00000223972.5\"] == \"DDX11L1\"\n", + "assert _tmp.loc[\"ENSG00000243485.5\"] == \"MIR1302-2HG\"\n", + "assert _tmp.loc[\"ENSG00000274059.1\"] == \"5S_rRNA\" # repeated gene\n", + "assert _tmp.loc[\"ENSG00000275305.1\"] == \"5S_rRNA\" # repeated gene" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute 
correlation coefficients" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We provide a command-line tool for computing CCC, Spearman, and Pearson correlations between two genes in a given tissue.\n", + "\n", + "```bash\n", + "usage: compute_single_gene_pair_correlations_cli.py [-h] [--tissue TISSUE] [--data-dir DATA_DIR] [--gene-mapping GENE_MAPPING] [--list-tissues] [--show-genes TISSUE] [--n-genes N_GENES] [--debug] [genes ...]\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 11:33:38,498 - root] INFO: Loading tissue data from: /mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue/gtex_v8_data_whole_blood.pkl\n", + "[2025-09-25 11:33:38,644 - root] INFO: Tissue data shape: (56200, 755)\n", + "[2025-09-25 11:33:38,644 - root] INFO: Loading gene mapping from: /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n", + "[2025-09-25 11:33:38,649 - root] INFO: Loaded 56200 gene mappings\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Tissue: whole_blood ===\n", + "Total genes: 56,200\n", + "Total samples: 755\n", + "\n", + "First 20 genes:\n", + "------------------------------------------------------------\n", + "# Gene Symbol Ensembl ID \n", + "------------------------------------------------------------\n", + "1 DDX11L1 ENSG00000223972.5 \n", + "2 WASH7P ENSG00000227232.5 \n", + "3 MIR6859-1 ENSG00000278267.1 \n", + "4 MIR1302-2HG ENSG00000243485.5 \n", + "5 FAM138A ENSG00000237613.2 \n", + "6 OR4G4P ENSG00000268020.3 \n", + "7 OR4G11P ENSG00000240361.1 \n", + "8 OR4F5 ENSG00000186092.4 \n", + "9 RP11-34P13.7 ENSG00000238009.6 \n", + "10 CICP27 ENSG00000233750.3 \n", + "11 RP11-34P13.15 ENSG00000268903.1 \n", + "12 RP11-34P13.16 ENSG00000269981.1 \n", + "13 RP11-34P13.14 ENSG00000239906.1 \n", + "14 RP11-34P13.13 ENSG00000241860.6 \n", + "15 
RNU6-1100P ENSG00000222623.1 \n", + "16 RP11-34P13.9 ENSG00000241599.1 \n", + "17 ABC7-43046700E7.1 ENSG00000279928.2 \n", + "18 RP11-34P13.18 ENSG00000279457.4 \n", + "19 MIR6859-2 ENSG00000273874.1 \n", + "20 AP006222.2 ENSG00000228463.9 \n", + "... and 56,180 more genes\n", + "\n" + ] + } + ], + "source": [ + "# Make sure you start the notebook from the ROOT directory of the project\n", + "\n", + "# Preview genes in a tissue\n", + "%run ./nbs/common/compute_single_gene_pair_correlations_cli.py --show-genes whole_blood --data-dir {TISSUE_DATA_DIR} --gene-mapping {ANALYSIS_DIR}/gtex_gene_id_symbol_mappings.pkl" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 11:33:38,676 - root] INFO: Loading gene mapping from: /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl\n", + "[2025-09-25 11:33:38,681 - root] INFO: Loaded 56200 gene mappings\n", + "[2025-09-25 11:33:38,686 - root] INFO: Loading tissue data from: /mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue/gtex_v8_data_whole_blood.pkl\n", + "[2025-09-25 11:33:38,824 - root] INFO: Tissue data shape: (56200, 755)\n", + "[2025-09-25 11:33:38,827 - root] INFO: Computing correlations for 755 samples\n", + "[2025-09-25 11:33:38,832 - root] INFO: Computing CCC correlation...\n", + "[2025-09-25 11:33:38,857 - root] INFO: Computing Pearson correlation...\n", + "[2025-09-25 11:33:38,871 - root] INFO: Computing Spearman correlation...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "GENE PAIR CORRELATION RESULTS\n", + "============================================================\n", + "Gene 1: DDX11L1 (ENSG00000223972.5)\n", + "Gene 2: WASH7P (ENSG00000227232.5)\n", + "Tissue: whole_blood\n", + "Samples: 755\n", + 
"------------------------------------------------------------\n", + " CCC: 0.005060\n", + " PEARSON: 0.063041\n", + " SPEARMAN: 0.040069\n", + "============================================================\n", + "\n" + ] + } + ], + "source": [ + "# Compute CCC, Spearman, and Pearson correlations between two genes in a given tissue\n", + "%run ./nbs/common/compute_single_gene_pair_correlations_cli.py DDX11L1 WASH7P --tissue whole_blood --data-dir {TISSUE_DATA_DIR} --gene-mapping {ANALYSIS_DIR}/gtex_gene_id_symbol_mappings.pkl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Metadata Correlation\n", + "We will compute the correlation between the gene expression and the metadata for each tissue. Metadata is downloaded from: https://www.gtexportal.org/home/downloads/adult-gtex/metadata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(22951, 62)\n" + ] + } + ], + "source": [ + "# Load GTEx samples info\n", + "gtex_samples = pd.read_csv(file_paths[\"gtex_sample_attrs\"], sep=\"\\t\", index_col=\"SAMPID\")\n", + "print(gtex_samples.shape)\n", + "assert gtex_samples.index.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(980, 4)\n" + ] + } + ], + "source": [ + "# Load GTEx subject attributes\n", + "gtex_phenotypes = pd.read_csv(file_paths[\"gtex_subject_attrs\"], sep=\"\\t\")\n", + "print(gtex_phenotypes.shape)\n", + "assert gtex_phenotypes.index.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['GTEX-1117F-0003-SM-58Q7G', 'GTEX-1117F-0003-SM-5DWSB', 'GTEX-1117F-0003-SM-6WBT7', 
'GTEX-1117F-0011-R10a-SM-AHZ7F', 'GTEX-1117F-0011-R10b-SM-CYKQ8']\n" + ] + } + ], + "source": [ + "# Get GTEx sample metadata\n", + "gtex_samples_ids = gtex_samples.index.to_list()\n", + "print(gtex_samples_ids[:5])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 GTEX-1117F-0003-SM-58Q7G\n", + "1 GTEX-1117F-0003-SM-5DWSB\n", + "2 GTEX-1117F-0003-SM-6WBT7\n", + "3 GTEX-1117F-0011-R10a-SM-AHZ7F\n", + "4 GTEX-1117F-0011-R10b-SM-CYKQ8\n", + " ... \n", + "22946 K-562-SM-E9EZC\n", + "22947 K-562-SM-E9EZI\n", + "22948 K-562-SM-E9EZO\n", + "22949 K-562-SM-E9EZT\n", + "22950 K-562-SM-E9EZZ\n", + "Name: SAMPID, Length: 22951, dtype: object" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtex_samples_ids = pd.Series(gtex_samples_ids).rename(\"SAMPID\")\n", + "gtex_samples_ids" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 GTEX-1117F\n", + "1 GTEX-1117F\n", + "2 GTEX-1117F\n", + "3 GTEX-1117F\n", + "4 GTEX-1117F\n", + " ... \n", + "22946 K-562\n", + "22947 K-562\n", + "22948 K-562\n", + "22949 K-562\n", + "22950 K-562\n", + "Name: SUBJID, Length: 22951, dtype: object" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtex_subjects_ids = gtex_samples_ids.str.extract(\n", + " r\"([\\w\\d]+\\-[\\w\\d]+)\", flags=re.IGNORECASE, expand=True\n", + ")[0].rename(\"SUBJID\")\n", + "\n", + "gtex_subjects_ids" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SAMPIDSUBJID
0GTEX-1117F-0003-SM-58Q7GGTEX-1117F
1GTEX-1117F-0003-SM-5DWSBGTEX-1117F
2GTEX-1117F-0003-SM-6WBT7GTEX-1117F
3GTEX-1117F-0011-R10a-SM-AHZ7FGTEX-1117F
4GTEX-1117F-0011-R10b-SM-CYKQ8GTEX-1117F
.........
22946K-562-SM-E9EZCK-562
22947K-562-SM-E9EZIK-562
22948K-562-SM-E9EZOK-562
22949K-562-SM-E9EZTK-562
22950K-562-SM-E9EZZK-562
\n", + "

22951 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " SAMPID SUBJID\n", + "0 GTEX-1117F-0003-SM-58Q7G GTEX-1117F\n", + "1 GTEX-1117F-0003-SM-5DWSB GTEX-1117F\n", + "2 GTEX-1117F-0003-SM-6WBT7 GTEX-1117F\n", + "3 GTEX-1117F-0011-R10a-SM-AHZ7F GTEX-1117F\n", + "4 GTEX-1117F-0011-R10b-SM-CYKQ8 GTEX-1117F\n", + "... ... ...\n", + "22946 K-562-SM-E9EZC K-562\n", + "22947 K-562-SM-E9EZI K-562\n", + "22948 K-562-SM-E9EZO K-562\n", + "22949 K-562-SM-E9EZT K-562\n", + "22950 K-562-SM-E9EZZ K-562\n", + "\n", + "[22951 rows x 2 columns]" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtex_metadata = pd.concat([gtex_samples_ids, gtex_subjects_ids], axis=1)\n", + "gtex_metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUBJIDSEXAGEDTHHRDY
0GTEX-1117F260-694.0
1GTEX-111CU150-590.0
2GTEX-111FC160-691.0
3GTEX-111VG160-693.0
4GTEX-111YS160-690.0
...............
975GTEX-ZYY3260-694.0
976GTEX-ZZ64120-290.0
977GTEX-ZZPT150-594.0
978GTEX-ZZPU250-590.0
979K-562250-59NaN
\n", + "

980 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " SUBJID SEX AGE DTHHRDY\n", + "0 GTEX-1117F 2 60-69 4.0\n", + "1 GTEX-111CU 1 50-59 0.0\n", + "2 GTEX-111FC 1 60-69 1.0\n", + "3 GTEX-111VG 1 60-69 3.0\n", + "4 GTEX-111YS 1 60-69 0.0\n", + ".. ... ... ... ...\n", + "975 GTEX-ZYY3 2 60-69 4.0\n", + "976 GTEX-ZZ64 1 20-29 0.0\n", + "977 GTEX-ZZPT 1 50-59 4.0\n", + "978 GTEX-ZZPU 2 50-59 0.0\n", + "979 K-562 2 50-59 NaN\n", + "\n", + "[980 rows x 4 columns]" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtex_phenotypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUBJIDSEXAGEDTHHRDY
SAMPID
GTEX-1117F-0003-SM-58Q7GGTEX-1117F260-694.0
GTEX-1117F-0003-SM-5DWSBGTEX-1117F260-694.0
GTEX-1117F-0003-SM-6WBT7GTEX-1117F260-694.0
GTEX-1117F-0011-R10a-SM-AHZ7FGTEX-1117F260-694.0
GTEX-1117F-0011-R10b-SM-CYKQ8GTEX-1117F260-694.0
...............
K-562-SM-E9EZCK-562250-59NaN
K-562-SM-E9EZIK-562250-59NaN
K-562-SM-E9EZOK-562250-59NaN
K-562-SM-E9EZTK-562250-59NaN
K-562-SM-E9EZZK-562250-59NaN
\n", + "

22951 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " SUBJID SEX AGE DTHHRDY\n", + "SAMPID \n", + "GTEX-1117F-0003-SM-58Q7G GTEX-1117F 2 60-69 4.0\n", + "GTEX-1117F-0003-SM-5DWSB GTEX-1117F 2 60-69 4.0\n", + "GTEX-1117F-0003-SM-6WBT7 GTEX-1117F 2 60-69 4.0\n", + "GTEX-1117F-0011-R10a-SM-AHZ7F GTEX-1117F 2 60-69 4.0\n", + "GTEX-1117F-0011-R10b-SM-CYKQ8 GTEX-1117F 2 60-69 4.0\n", + "... ... ... ... ...\n", + "K-562-SM-E9EZC K-562 2 50-59 NaN\n", + "K-562-SM-E9EZI K-562 2 50-59 NaN\n", + "K-562-SM-E9EZO K-562 2 50-59 NaN\n", + "K-562-SM-E9EZT K-562 2 50-59 NaN\n", + "K-562-SM-E9EZZ K-562 2 50-59 NaN\n", + "\n", + "[22951 rows x 4 columns]" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtex_metadata = pd.merge(gtex_metadata, gtex_phenotypes).set_index(\"SAMPID\")\n", + "gtex_metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUBJIDSEXAGEDTHHRDYSMATSSCRSMCENTERSMPTHNTSSMRINSMTSSMTSD...SME1ANTISMSPLTRDSMBSMMRTSME1SNSESME1PCTSSMRRNARTSME1MPRTSMNUM5CDSMDPMPRTSME2PCTS
SAMPID
GTEX-1117F-0003-SM-58Q7GGTEX-1117FFemale60-694.0NaNB1NaNNaNBloodWhole Blood...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GTEX-1117F-0003-SM-5DWSBGTEX-1117FFemale60-694.0NaNB1NaNNaNBloodWhole Blood...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GTEX-1117F-0003-SM-6WBT7GTEX-1117FFemale60-694.0NaNB1NaNNaNBloodWhole Blood...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GTEX-1117F-0011-R10a-SM-AHZ7FGTEX-1117FFemale60-694.0NaNB1, A1NaNNaNBrainBrain - Frontal Cortex (BA9)...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GTEX-1117F-0011-R10b-SM-CYKQ8GTEX-1117FFemale60-694.0NaNB1, A1NaN7.2BrainBrain - Frontal Cortex (BA9)...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 66 columns

\n", + "
" + ], + "text/plain": [ + " SUBJID SEX AGE DTHHRDY SMATSSCR \\\n", + "SAMPID \n", + "GTEX-1117F-0003-SM-58Q7G GTEX-1117F Female 60-69 4.0 NaN \n", + "GTEX-1117F-0003-SM-5DWSB GTEX-1117F Female 60-69 4.0 NaN \n", + "GTEX-1117F-0003-SM-6WBT7 GTEX-1117F Female 60-69 4.0 NaN \n", + "GTEX-1117F-0011-R10a-SM-AHZ7F GTEX-1117F Female 60-69 4.0 NaN \n", + "GTEX-1117F-0011-R10b-SM-CYKQ8 GTEX-1117F Female 60-69 4.0 NaN \n", + "\n", + " SMCENTER SMPTHNTS SMRIN SMTS \\\n", + "SAMPID \n", + "GTEX-1117F-0003-SM-58Q7G B1 NaN NaN Blood \n", + "GTEX-1117F-0003-SM-5DWSB B1 NaN NaN Blood \n", + "GTEX-1117F-0003-SM-6WBT7 B1 NaN NaN Blood \n", + "GTEX-1117F-0011-R10a-SM-AHZ7F B1, A1 NaN NaN Brain \n", + "GTEX-1117F-0011-R10b-SM-CYKQ8 B1, A1 NaN 7.2 Brain \n", + "\n", + " SMTSD ... SME1ANTI \\\n", + "SAMPID ... \n", + "GTEX-1117F-0003-SM-58Q7G Whole Blood ... NaN \n", + "GTEX-1117F-0003-SM-5DWSB Whole Blood ... NaN \n", + "GTEX-1117F-0003-SM-6WBT7 Whole Blood ... NaN \n", + "GTEX-1117F-0011-R10a-SM-AHZ7F Brain - Frontal Cortex (BA9) ... NaN \n", + "GTEX-1117F-0011-R10b-SM-CYKQ8 Brain - Frontal Cortex (BA9) ... 
NaN \n", + "\n", + " SMSPLTRD SMBSMMRT SME1SNSE SME1PCTS SMRRNART \\\n", + "SAMPID \n", + "GTEX-1117F-0003-SM-58Q7G NaN NaN NaN NaN NaN \n", + "GTEX-1117F-0003-SM-5DWSB NaN NaN NaN NaN NaN \n", + "GTEX-1117F-0003-SM-6WBT7 NaN NaN NaN NaN NaN \n", + "GTEX-1117F-0011-R10a-SM-AHZ7F NaN NaN NaN NaN NaN \n", + "GTEX-1117F-0011-R10b-SM-CYKQ8 NaN NaN NaN NaN NaN \n", + "\n", + " SME1MPRT SMNUM5CD SMDPMPRT SME2PCTS \n", + "SAMPID \n", + "GTEX-1117F-0003-SM-58Q7G NaN NaN NaN NaN \n", + "GTEX-1117F-0003-SM-5DWSB NaN NaN NaN NaN \n", + "GTEX-1117F-0003-SM-6WBT7 NaN NaN NaN NaN \n", + "GTEX-1117F-0011-R10a-SM-AHZ7F NaN NaN NaN NaN \n", + "GTEX-1117F-0011-R10b-SM-CYKQ8 NaN NaN NaN NaN \n", + "\n", + "[5 rows x 66 columns]" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtex_metadata = pd.merge(gtex_metadata, gtex_samples, left_index=True, right_index=True)\n", + "\n", + "gtex_metadata = gtex_metadata.replace(\n", + " {\n", + " \"SEX\": {\n", + " 1: \"Male\",\n", + " 2: \"Female\",\n", + " }\n", + " }\n", + ")\n", + "\n", + "gtex_metadata = gtex_metadata.sort_index()\n", + "\n", + "gtex_metadata.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Simple validations\n", + "assert not gtex_metadata[\"SUBJID\"].isna().any()\n", + "\n", + "assert not gtex_metadata[\"SMTS\"].isna().any()\n", + "assert not gtex_metadata[\"SMTSD\"].isna().any()\n", + "\n", + "assert not gtex_metadata[\"SEX\"].isna().any()\n", + "assert gtex_metadata[\"SEX\"].unique().shape[0] == 2\n", + "assert set(gtex_metadata[\"SEX\"].unique()) == {\"Female\", \"Male\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save metadata\n", + "gtex_metadatadata_filename = ANALYSIS_DIR / \"gtex_v8-sample_metadata.pkl\"\n", + "gtex_metadata.to_pickle(gtex_metadatadata_filename)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### Metadata correlation\n", + "We also provide a command-line tool `nbs/common/metadata_corr_cli.py` for computing the correlation between the gene expression and the metadata for each tissue.\n", + "\n", + "```bash\n", + "usage: metadata_corr_cli.py [-h] [--expr-data-dir EXPR_DATA_DIR] [--include [INCLUDE ...]] [--exclude [EXCLUDE ...]] [--permutations PERMUTATIONS]\n", + " [--n-jobs N_JOBS] [--list-metadata-columns] [--list-tissues] [--output-dir OUTPUT_DIR] [--quiet] [--no-csv-output]\n", + " [--no-individual-logs] [--data-dir DATA_DIR]\n", + " gene_symbols [gene_symbols ...]\n", + "\n", + "Analyze gene expression correlations with metadata using CCC across multiple tissues\n", + "\n", + "positional arguments:\n", + " gene_symbols Gene symbol(s) to analyze (e.g., RASSF2 TP53 BRCA1)\n", + "\n", + "options:\n", + " -h, --help show this help message and exit\n", + " --expr-data-dir EXPR_DATA_DIR\n", + " Directory containing expression data files (default: /pividori_lab/haoyu_projects/ccc-gpu/data/gtex/gene_selection/all)\n", + " --include [INCLUDE ...]\n", + " Include only tissues matching these patterns (fuzzy match on tissue name) (default: None)\n", + " --exclude [EXCLUDE ...]\n", + " Exclude tissues matching these patterns (fuzzy match on tissue name) (default: None)\n", + " --permutations PERMUTATIONS\n", + " Number of permutations for p-value calculation (default: 100000)\n", + " --n-jobs N_JOBS Number of parallel jobs for computation (default: 4)\n", + " --list-metadata-columns\n", + " List available metadata columns and exit (default: False)\n", + " --list-tissues List available tissue files and exit (default: False)\n", + " --output-dir OUTPUT_DIR\n", + " Directory to save output files (default: current directory) (default: .)\n", + " --quiet Reduce output verbosity for batch processing (default: False)\n", + " --no-csv-output Skip CSV file generation (only create pickle files) (default: False)\n", + " --no-individual-logs 
Skip individual tissue log files (only keep summary logs) (default: False)\n", + " --data-dir DATA_DIR Directory containing GTEx data files (metadata and gene mappings) (default: /pividori_lab/haoyu_projects/ccc-gpu/data/gtex)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "METADATA_CORRELATIONS_RESULT_DIR = ANALYSIS_DIR / \"metadata_correlations\"\n", + "os.makedirs(METADATA_CORRELATIONS_RESULT_DIR, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:17,840 - summary] INFO: Output directory: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations\n", + "[2025-09-25 13:05:17,840 - summary] INFO: Summary log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n", + "[2025-09-25 13:05:17,840 - summary] INFO: Summary tables file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n", + "[2025-09-25 13:05:17,840 - summary] INFO: Gene symbols to analyze: RASSF2, CYTIP\n", + "[2025-09-25 13:05:17,857 - summary] INFO: \n", + "====================================================================================================\n", + "[2025-09-25 13:05:17,858 - summary] INFO: PROCESSING GENE 1/2: RASSF2\n", + "[2025-09-25 13:05:17,858 - summary] INFO: ====================================================================================================\n", + "[2025-09-25 13:05:17,858 - summary] INFO: \n", + "[1/1] Starting processing for RASSF2 in whole_blood...\n", + "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: \n", + "============================================================\n", + "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: Processing tissue: whole_blood\n", + "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] 
INFO: File: gtex_v8_data_whole_blood.pkl\n", + "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n", + "[2025-09-25 13:05:17,859 - tissue_RASSF2_whole_blood] INFO: ============================================================\n", + "[2025-09-25 13:05:17,860 - tissue_RASSF2_whole_blood] INFO: Loading expression data...\n", + "[2025-09-25 13:05:18,013 - tissue_RASSF2_whole_blood] INFO: Expression data shape: (56200, 755)\n", + "[2025-09-25 13:05:18,016 - tissue_RASSF2_whole_blood] INFO: Gene ID for RASSF2: ENSG00000101265.15\n", + "[2025-09-25 13:05:18,019 - tissue_RASSF2_whole_blood] INFO: Number of samples: 755\n", + "[2025-09-25 13:05:18,021 - tissue_RASSF2_whole_blood] INFO: Common samples: 755\n", + "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Computing CCC between RASSF2 expression and all metadata columns...\n", + "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Using 100000 permutations and 4 jobs\n", + "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Processing 66 metadata columns...\n", + "[2025-09-25 13:05:18,023 - tissue_RASSF2_whole_blood] INFO: Processing column 1/66: SUBJID\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output directory: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations\n", + "Summary log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n", + "Summary tables file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n", + "Gene symbols to analyze: RASSF2, CYTIP\n", + "Found 1 expression files to process:\n", + " whole_blood: gtex_v8_data_whole_blood.pkl\n", + "Loading metadata and gene mapping files...\n", + "Loaded metadata: (22951, 66)\n", + "Loaded gene mapping: (56200, 2)\n", + "\n", + 
"====================================================================================================\n", + "PROCESSING GENE 1/2: RASSF2\n", + "====================================================================================================\n", + "\n", + "[1/1] Starting processing for RASSF2 in whole_blood...\n", + "\n", + "============================================================\n", + "Processing tissue: whole_blood\n", + "File: gtex_v8_data_whole_blood.pkl\n", + "Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n", + "============================================================\n", + "Loading expression data...\n", + "Expression data shape: (56200, 755)\n", + "Gene ID for RASSF2: ENSG00000101265.15\n", + "Number of samples: 755\n", + "Common samples: 755\n", + "Computing CCC between RASSF2 expression and all metadata columns...\n", + "Using 100000 permutations and 4 jobs\n", + "Processing 66 metadata columns...\n", + "Processing column 1/66: SUBJID\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:18,143 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000000, p-value: 1.00e+00\n", + "[2025-09-25 13:05:18,144 - tissue_RASSF2_whole_blood] INFO: Processing column 2/66: SEX\n", + "[2025-09-25 13:05:18,217 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007134, p-value: 1.23e-02\n", + "[2025-09-25 13:05:18,217 - tissue_RASSF2_whole_blood] INFO: Processing column 3/66: AGE\n", + "[2025-09-25 13:05:18,291 - tissue_RASSF2_whole_blood] INFO: CCC: 0.039824, p-value: 1.00e-05\n", + "[2025-09-25 13:05:18,291 - tissue_RASSF2_whole_blood] INFO: Processing column 4/66: DTHHRDY\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.000000, p-value: 1.00e+00\n", + "Processing column 2/66: SEX\n", + " CCC: 0.007134, p-value: 1.23e-02\n", + "Processing column 3/66: AGE\n", + " CCC: 0.039824, p-value: 1.00e-05\n", + "Processing column 4/66: DTHHRDY\n" + ] + }, + { 
+ "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:18,547 - tissue_RASSF2_whole_blood] INFO: CCC: 0.464582, p-value: 1.00e-05\n", + "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Processing column 5/66: SMATSSCR\n", + "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Skipping SMATSSCR: all values are NaN\n", + "[2025-09-25 13:05:18,548 - tissue_RASSF2_whole_blood] INFO: Processing column 6/66: SMCENTER\n", + "[2025-09-25 13:05:18,618 - tissue_RASSF2_whole_blood] INFO: CCC: 0.108148, p-value: 1.00e-05\n", + "[2025-09-25 13:05:18,618 - tissue_RASSF2_whole_blood] INFO: Processing column 7/66: SMPTHNTS\n", + "[2025-09-25 13:05:18,619 - tissue_RASSF2_whole_blood] INFO: Skipping SMPTHNTS: all values are NaN\n", + "[2025-09-25 13:05:18,619 - tissue_RASSF2_whole_blood] INFO: Processing column 8/66: SMRIN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.464582, p-value: 1.00e-05\n", + "Processing column 5/66: SMATSSCR\n", + " Skipping SMATSSCR: all values are NaN\n", + "Processing column 6/66: SMCENTER\n", + " CCC: 0.108148, p-value: 1.00e-05\n", + "Processing column 7/66: SMPTHNTS\n", + " Skipping SMPTHNTS: all values are NaN\n", + "Processing column 8/66: SMRIN\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:18,872 - tissue_RASSF2_whole_blood] INFO: CCC: 0.048847, p-value: 1.00e-05\n", + "[2025-09-25 13:05:18,872 - tissue_RASSF2_whole_blood] INFO: Processing column 9/66: SMTS\n", + "[2025-09-25 13:05:18,873 - tissue_RASSF2_whole_blood] INFO: Skipping SMTS: only 1 unique value(s)\n", + "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Processing column 10/66: SMTSD\n", + "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Skipping SMTSD: only 1 unique value(s)\n", + "[2025-09-25 13:05:18,874 - tissue_RASSF2_whole_blood] INFO: Processing column 11/66: SMUBRID\n", + "[2025-09-25 13:05:18,875 - 
tissue_RASSF2_whole_blood] INFO: Skipping SMUBRID: only 1 unique value(s)\n", + "[2025-09-25 13:05:18,875 - tissue_RASSF2_whole_blood] INFO: Processing column 12/66: SMTSISCH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.048847, p-value: 1.00e-05\n", + "Processing column 9/66: SMTS\n", + " Skipping SMTS: only 1 unique value(s)\n", + "Processing column 10/66: SMTSD\n", + " Skipping SMTSD: only 1 unique value(s)\n", + "Processing column 11/66: SMUBRID\n", + " Skipping SMUBRID: only 1 unique value(s)\n", + "Processing column 12/66: SMTSISCH\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:19,129 - tissue_RASSF2_whole_blood] INFO: CCC: 0.528125, p-value: 1.00e-05\n", + "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Processing column 13/66: SMTSPAX\n", + "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Skipping SMTSPAX: all values are NaN\n", + "[2025-09-25 13:05:19,130 - tissue_RASSF2_whole_blood] INFO: Processing column 14/66: SMNABTCH\n", + "[2025-09-25 13:05:19,194 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000884, p-value: 1.00e-05\n", + "[2025-09-25 13:05:19,195 - tissue_RASSF2_whole_blood] INFO: Processing column 15/66: SMNABTCHT\n", + "[2025-09-25 13:05:19,196 - tissue_RASSF2_whole_blood] INFO: Skipping SMNABTCHT: only 1 unique value(s)\n", + "[2025-09-25 13:05:19,196 - tissue_RASSF2_whole_blood] INFO: Processing column 16/66: SMNABTCHD\n", + "[2025-09-25 13:05:19,259 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000900, p-value: 1.00e-05\n", + "[2025-09-25 13:05:19,259 - tissue_RASSF2_whole_blood] INFO: Processing column 17/66: SMGEBTCH\n", + "[2025-09-25 13:05:19,316 - tissue_RASSF2_whole_blood] INFO: CCC: 0.003663, p-value: 1.00e-05\n", + "[2025-09-25 13:05:19,316 - tissue_RASSF2_whole_blood] INFO: Processing column 18/66: SMGEBTCHD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.528125, p-value: 1.00e-05\n", + 
"Processing column 13/66: SMTSPAX\n", + " Skipping SMTSPAX: all values are NaN\n", + "Processing column 14/66: SMNABTCH\n", + " CCC: 0.000884, p-value: 1.00e-05\n", + "Processing column 15/66: SMNABTCHT\n", + " Skipping SMNABTCHT: only 1 unique value(s)\n", + "Processing column 16/66: SMNABTCHD\n", + " CCC: 0.000900, p-value: 1.00e-05\n", + "Processing column 17/66: SMGEBTCH\n", + " CCC: 0.003663, p-value: 1.00e-05\n", + "Processing column 18/66: SMGEBTCHD\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:19,374 - tissue_RASSF2_whole_blood] INFO: CCC: 0.005827, p-value: 1.00e-05\n", + "[2025-09-25 13:05:19,374 - tissue_RASSF2_whole_blood] INFO: Processing column 19/66: SMGEBTCHT\n", + "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO: Skipping SMGEBTCHT: only 1 unique value(s)\n", + "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO: Processing column 20/66: SMAFRZE\n", + "[2025-09-25 13:05:19,375 - tissue_RASSF2_whole_blood] INFO: Skipping SMAFRZE: only 1 unique value(s)\n", + "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Processing column 21/66: SMGTC\n", + "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Skipping SMGTC: all values are NaN\n", + "[2025-09-25 13:05:19,376 - tissue_RASSF2_whole_blood] INFO: Processing column 22/66: SME2MPRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.005827, p-value: 1.00e-05\n", + "Processing column 19/66: SMGEBTCHT\n", + " Skipping SMGEBTCHT: only 1 unique value(s)\n", + "Processing column 20/66: SMAFRZE\n", + " Skipping SMAFRZE: only 1 unique value(s)\n", + "Processing column 21/66: SMGTC\n", + " Skipping SMGTC: all values are NaN\n", + "Processing column 22/66: SME2MPRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:19,629 - tissue_RASSF2_whole_blood] INFO: CCC: 0.172974, p-value: 1.00e-05\n", + "[2025-09-25 13:05:19,629 - 
tissue_RASSF2_whole_blood] INFO: Processing column 23/66: SMCHMPRS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.172974, p-value: 1.00e-05\n", + "Processing column 23/66: SMCHMPRS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:19,882 - tissue_RASSF2_whole_blood] INFO: CCC: 0.143365, p-value: 1.00e-05\n", + "[2025-09-25 13:05:19,882 - tissue_RASSF2_whole_blood] INFO: Processing column 24/66: SMNTRART\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.143365, p-value: 1.00e-05\n", + "Processing column 24/66: SMNTRART\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:20,136 - tissue_RASSF2_whole_blood] INFO: CCC: 0.243071, p-value: 1.00e-05\n", + "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Processing column 25/66: SMNUMGPS\n", + "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Skipping SMNUMGPS: all values are NaN\n", + "[2025-09-25 13:05:20,137 - tissue_RASSF2_whole_blood] INFO: Processing column 26/66: SMMAPRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.243071, p-value: 1.00e-05\n", + "Processing column 25/66: SMNUMGPS\n", + " Skipping SMNUMGPS: all values are NaN\n", + "Processing column 26/66: SMMAPRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:20,392 - tissue_RASSF2_whole_blood] INFO: CCC: 0.168576, p-value: 1.00e-05\n", + "[2025-09-25 13:05:20,393 - tissue_RASSF2_whole_blood] INFO: Processing column 27/66: SMEXNCRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.168576, p-value: 1.00e-05\n", + "Processing column 27/66: SMEXNCRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:20,646 - tissue_RASSF2_whole_blood] INFO: CCC: 0.040140, p-value: 1.00e-05\n", + "[2025-09-25 13:05:20,647 - 
tissue_RASSF2_whole_blood] INFO: Processing column 28/66: SM550NRM\n", + "[2025-09-25 13:05:20,647 - tissue_RASSF2_whole_blood] INFO: Skipping SM550NRM: all values are NaN\n", + "[2025-09-25 13:05:20,648 - tissue_RASSF2_whole_blood] INFO: Processing column 29/66: SMGNSDTC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.040140, p-value: 1.00e-05\n", + "Processing column 28/66: SM550NRM\n", + " Skipping SM550NRM: all values are NaN\n", + "Processing column 29/66: SMGNSDTC\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:20,902 - tissue_RASSF2_whole_blood] INFO: CCC: 0.043013, p-value: 1.00e-05\n", + "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Processing column 30/66: SMUNMPRT\n", + "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Skipping SMUNMPRT: only 1 unique value(s)\n", + "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Processing column 31/66: SM350NRM\n", + "[2025-09-25 13:05:20,903 - tissue_RASSF2_whole_blood] INFO: Skipping SM350NRM: all values are NaN\n", + "[2025-09-25 13:05:20,904 - tissue_RASSF2_whole_blood] INFO: Processing column 32/66: SMRDLGTH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.043013, p-value: 1.00e-05\n", + "Processing column 30/66: SMUNMPRT\n", + " Skipping SMUNMPRT: only 1 unique value(s)\n", + "Processing column 31/66: SM350NRM\n", + " Skipping SM350NRM: all values are NaN\n", + "Processing column 32/66: SMRDLGTH\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:21,156 - tissue_RASSF2_whole_blood] INFO: CCC: 0.000028, p-value: 1.73e-01\n", + "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Processing column 33/66: SMMNCPB\n", + "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Skipping SMMNCPB: all values are NaN\n", + "[2025-09-25 13:05:21,157 - tissue_RASSF2_whole_blood] INFO: Processing column 34/66: 
SME1MMRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.000028, p-value: 1.73e-01\n", + "Processing column 33/66: SMMNCPB\n", + " Skipping SMMNCPB: all values are NaN\n", + "Processing column 34/66: SME1MMRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:21,411 - tissue_RASSF2_whole_blood] INFO: CCC: 0.018125, p-value: 1.40e-04\n", + "[2025-09-25 13:05:21,412 - tissue_RASSF2_whole_blood] INFO: Processing column 35/66: SMSFLGTH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.018125, p-value: 1.40e-04\n", + "Processing column 35/66: SMSFLGTH\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:21,665 - tissue_RASSF2_whole_blood] INFO: CCC: 0.047258, p-value: 1.00e-05\n", + "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Processing column 36/66: SMESTLBS\n", + "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Skipping SMESTLBS: only 1 unique value(s)\n", + "[2025-09-25 13:05:21,666 - tissue_RASSF2_whole_blood] INFO: Processing column 37/66: SMMPPD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.047258, p-value: 1.00e-05\n", + "Processing column 36/66: SMESTLBS\n", + " Skipping SMESTLBS: only 1 unique value(s)\n", + "Processing column 37/66: SMMPPD\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:21,921 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007761, p-value: 3.43e-02\n", + "[2025-09-25 13:05:21,921 - tissue_RASSF2_whole_blood] INFO: Processing column 38/66: SMNTERRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007761, p-value: 3.43e-02\n", + "Processing column 38/66: SMNTERRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:22,175 - tissue_RASSF2_whole_blood] INFO: CCC: 0.250997, p-value: 1.00e-05\n", + 
"[2025-09-25 13:05:22,175 - tissue_RASSF2_whole_blood] INFO: Processing column 39/66: SMRRNANM\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.250997, p-value: 1.00e-05\n", + "Processing column 39/66: SMRRNANM\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:22,430 - tissue_RASSF2_whole_blood] INFO: CCC: 0.036631, p-value: 1.00e-05\n", + "[2025-09-25 13:05:22,430 - tissue_RASSF2_whole_blood] INFO: Processing column 40/66: SMRDTTL\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.036631, p-value: 1.00e-05\n", + "Processing column 40/66: SMRDTTL\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:22,686 - tissue_RASSF2_whole_blood] INFO: CCC: 0.010388, p-value: 6.63e-03\n", + "[2025-09-25 13:05:22,686 - tissue_RASSF2_whole_blood] INFO: Processing column 41/66: SMVQCFL\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.010388, p-value: 6.63e-03\n", + "Processing column 41/66: SMVQCFL\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:22,941 - tissue_RASSF2_whole_blood] INFO: CCC: 0.001442, p-value: 9.24e-01\n", + "[2025-09-25 13:05:22,942 - tissue_RASSF2_whole_blood] INFO: Processing column 42/66: SMMNCV\n", + "[2025-09-25 13:05:22,943 - tissue_RASSF2_whole_blood] INFO: Skipping SMMNCV: all values are NaN\n", + "[2025-09-25 13:05:22,943 - tissue_RASSF2_whole_blood] INFO: Processing column 43/66: SMTRSCPT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.001442, p-value: 9.24e-01\n", + "Processing column 42/66: SMMNCV\n", + " Skipping SMMNCV: all values are NaN\n", + "Processing column 43/66: SMTRSCPT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:23,199 - tissue_RASSF2_whole_blood] INFO: CCC: 0.042714, p-value: 1.00e-05\n", + "[2025-09-25 
13:05:23,199 - tissue_RASSF2_whole_blood] INFO: Processing column 44/66: SMMPPDPR\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.042714, p-value: 1.00e-05\n", + "Processing column 44/66: SMMPPDPR\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:23,453 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007761, p-value: 3.48e-02\n", + "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Processing column 45/66: SMCGLGTH\n", + "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Skipping SMCGLGTH: all values are NaN\n", + "[2025-09-25 13:05:23,454 - tissue_RASSF2_whole_blood] INFO: Processing column 46/66: SMGAPPCT\n", + "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Skipping SMGAPPCT: all values are NaN\n", + "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Processing column 47/66: SMUNPDRD\n", + "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Skipping SMUNPDRD: only 1 unique value(s)\n", + "[2025-09-25 13:05:23,455 - tissue_RASSF2_whole_blood] INFO: Processing column 48/66: SMNTRNRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007761, p-value: 3.48e-02\n", + "Processing column 45/66: SMCGLGTH\n", + " Skipping SMCGLGTH: all values are NaN\n", + "Processing column 46/66: SMGAPPCT\n", + " Skipping SMGAPPCT: all values are NaN\n", + "Processing column 47/66: SMUNPDRD\n", + " Skipping SMUNPDRD: only 1 unique value(s)\n", + "Processing column 48/66: SMNTRNRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:23,710 - tissue_RASSF2_whole_blood] INFO: CCC: 0.202936, p-value: 1.00e-05\n", + "[2025-09-25 13:05:23,710 - tissue_RASSF2_whole_blood] INFO: Processing column 49/66: SMMPUNRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.202936, p-value: 1.00e-05\n", + "Processing column 49/66: SMMPUNRT\n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:23,964 - tissue_RASSF2_whole_blood] INFO: CCC: 0.168576, p-value: 1.00e-05\n", + "[2025-09-25 13:05:23,964 - tissue_RASSF2_whole_blood] INFO: Processing column 50/66: SMEXPEFF\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.168576, p-value: 1.00e-05\n", + "Processing column 50/66: SMEXPEFF\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:24,219 - tissue_RASSF2_whole_blood] INFO: CCC: 0.059931, p-value: 1.00e-05\n", + "[2025-09-25 13:05:24,219 - tissue_RASSF2_whole_blood] INFO: Processing column 51/66: SMMPPDUN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.059931, p-value: 1.00e-05\n", + "Processing column 51/66: SMMPPDUN\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:24,474 - tissue_RASSF2_whole_blood] INFO: CCC: 0.007761, p-value: 3.43e-02\n", + "[2025-09-25 13:05:24,474 - tissue_RASSF2_whole_blood] INFO: Processing column 52/66: SME2MMRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007761, p-value: 3.43e-02\n", + "Processing column 52/66: SME2MMRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:24,730 - tissue_RASSF2_whole_blood] INFO: CCC: 0.003990, p-value: 4.05e-01\n", + "[2025-09-25 13:05:24,731 - tissue_RASSF2_whole_blood] INFO: Processing column 53/66: SME2ANTI\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.003990, p-value: 4.05e-01\n", + "Processing column 53/66: SME2ANTI\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:24,987 - tissue_RASSF2_whole_blood] INFO: CCC: 0.020742, p-value: 1.10e-04\n", + "[2025-09-25 13:05:24,988 - tissue_RASSF2_whole_blood] INFO: Processing column 54/66: SMALTALG\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + " CCC: 0.020742, p-value: 1.10e-04\n", + "Processing column 54/66: SMALTALG\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:25,242 - tissue_RASSF2_whole_blood] INFO: CCC: 0.177009, p-value: 1.00e-05\n", + "[2025-09-25 13:05:25,242 - tissue_RASSF2_whole_blood] INFO: Processing column 55/66: SME2SNSE\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.177009, p-value: 1.00e-05\n", + "Processing column 55/66: SME2SNSE\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:25,496 - tissue_RASSF2_whole_blood] INFO: CCC: 0.019048, p-value: 1.80e-04\n", + "[2025-09-25 13:05:25,497 - tissue_RASSF2_whole_blood] INFO: Processing column 56/66: SMMFLGTH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.019048, p-value: 1.80e-04\n", + "Processing column 56/66: SMMFLGTH\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:25,751 - tissue_RASSF2_whole_blood] INFO: CCC: 0.019296, p-value: 1.50e-04\n", + "[2025-09-25 13:05:25,751 - tissue_RASSF2_whole_blood] INFO: Processing column 57/66: SME1ANTI\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.019296, p-value: 1.50e-04\n", + "Processing column 57/66: SME1ANTI\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:26,007 - tissue_RASSF2_whole_blood] INFO: CCC: 0.021058, p-value: 1.20e-04\n", + "[2025-09-25 13:05:26,008 - tissue_RASSF2_whole_blood] INFO: Processing column 58/66: SMSPLTRD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.021058, p-value: 1.20e-04\n", + "Processing column 58/66: SMSPLTRD\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:26,263 - tissue_RASSF2_whole_blood] INFO: CCC: 0.057786, p-value: 1.00e-05\n", + "[2025-09-25 
13:05:26,264 - tissue_RASSF2_whole_blood] INFO: Processing column 59/66: SMBSMMRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.057786, p-value: 1.00e-05\n", + "Processing column 59/66: SMBSMMRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:26,518 - tissue_RASSF2_whole_blood] INFO: CCC: 0.005333, p-value: 1.83e-01\n", + "[2025-09-25 13:05:26,518 - tissue_RASSF2_whole_blood] INFO: Processing column 60/66: SME1SNSE\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.005333, p-value: 1.83e-01\n", + "Processing column 60/66: SME1SNSE\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:26,773 - tissue_RASSF2_whole_blood] INFO: CCC: 0.022008, p-value: 5.00e-05\n", + "[2025-09-25 13:05:26,773 - tissue_RASSF2_whole_blood] INFO: Processing column 61/66: SME1PCTS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.022008, p-value: 5.00e-05\n", + "Processing column 61/66: SME1PCTS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:27,030 - tissue_RASSF2_whole_blood] INFO: CCC: 0.032073, p-value: 2.00e-05\n", + "[2025-09-25 13:05:27,030 - tissue_RASSF2_whole_blood] INFO: Processing column 62/66: SMRRNART\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.032073, p-value: 2.00e-05\n", + "Processing column 62/66: SMRRNART\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:27,285 - tissue_RASSF2_whole_blood] INFO: CCC: 0.048437, p-value: 1.00e-05\n", + "[2025-09-25 13:05:27,286 - tissue_RASSF2_whole_blood] INFO: Processing column 63/66: SME1MPRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.048437, p-value: 1.00e-05\n", + "Processing column 63/66: SME1MPRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO: CCC: 0.181940, p-value: 1.00e-05\n", + "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO: Processing column 64/66: SMNUM5CD\n", + "[2025-09-25 13:05:27,541 - tissue_RASSF2_whole_blood] INFO: Skipping SMNUM5CD: all values are NaN\n", + "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Processing column 65/66: SMDPMPRT\n", + "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Skipping SMDPMPRT: only 1 unique value(s)\n", + "[2025-09-25 13:05:27,542 - tissue_RASSF2_whole_blood] INFO: Processing column 66/66: SME2PCTS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.181940, p-value: 1.00e-05\n", + "Processing column 64/66: SMNUM5CD\n", + " Skipping SMNUM5CD: all values are NaN\n", + "Processing column 65/66: SMDPMPRT\n", + " Skipping SMDPMPRT: only 1 unique value(s)\n", + "Processing column 66/66: SME2PCTS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:27,796 - tissue_RASSF2_whole_blood] INFO: CCC: 0.029344, p-value: 1.00e-05\n", + "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: \n", + "Completed processing whole_blood:\n", + "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: Total metadata columns: 66\n", + "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: Successful analyses: 44\n", + "[2025-09-25 13:05:27,798 - tissue_RASSF2_whole_blood] INFO: Skipped/Failed: 22\n", + "[2025-09-25 13:05:27,821 - summary] INFO: Results for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood_correlation_results.pkl\n", + "[2025-09-25 13:05:27,821 - summary] INFO: Log file for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n", + "[2025-09-25 13:05:27,822 - summary] INFO: Runtime for RASSF2 in whole_blood: 9.96 seconds (0.17 
minutes)\n", + "[2025-09-25 13:05:27,823 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:27,823 - summary] INFO: COMBINED RESULTS SUMMARY\n", + "[2025-09-25 13:05:27,823 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:27,824 - summary] INFO: Gene Symbol: RASSF2\n", + "[2025-09-25 13:05:27,824 - summary] INFO: Gene ID: ENSG00000101265.15\n", + "[2025-09-25 13:05:27,824 - summary] INFO: Permutations: 100,000\n", + "[2025-09-25 13:05:27,824 - summary] INFO: Tissues processed: 1\n", + "[2025-09-25 13:05:27,825 - summary] INFO: Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.pkl\n", + "[2025-09-25 13:05:27,825 - summary] INFO: Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.csv\n", + "[2025-09-25 13:05:27,826 - summary] INFO: \n", + "Total successful analyses across all tissues: 44\n", + "[2025-09-25 13:05:27,826 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:27,826 - summary] INFO: TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n", + "[2025-09-25 13:05:27,826 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:27,827 - summary] INFO: Tissue Metadata Column CCC Value P-value Significance \n", + "[2025-09-25 13:05:27,827 - summary] INFO: ------------------------------------------------------------------------------------------\n", + "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood SMTSISCH 0.528125 1.00e-05 *** \n", + "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood DTHHRDY 0.464582 1.00e-05 *** \n", + "[2025-09-25 13:05:27,828 - summary] INFO: whole_blood SMNTERRT 
0.250997 1.00e-05 *** \n", + "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SMNTRART 0.243071 1.00e-05 *** \n", + "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n", + "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SME1MPRT 0.181940 1.00e-05 *** \n", + "[2025-09-25 13:05:27,829 - summary] INFO: whole_blood SMALTALG 0.177009 1.00e-05 *** \n", + "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SME2MPRT 0.172974 1.00e-05 *** \n", + "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n", + "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SMMAPRT 0.168576 1.00e-05 *** \n", + "[2025-09-25 13:05:27,830 - summary] INFO: whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n", + "[2025-09-25 13:05:27,831 - summary] INFO: whole_blood SMCENTER 0.108148 1.00e-05 *** \n", + "[2025-09-25 13:05:27,831 - summary] INFO: whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n", + "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n", + "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood SMRIN 0.048847 1.00e-05 *** \n", + "[2025-09-25 13:05:27,832 - summary] INFO: whole_blood SMRRNART 0.048437 1.00e-05 *** \n", + "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n", + "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n", + "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n", + "[2025-09-25 13:05:27,833 - summary] INFO: whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n", + "[2025-09-25 13:05:27,834 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:27,834 - summary] INFO: SUMMARY BY TISSUE\n", + "[2025-09-25 13:05:27,834 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:27,834 - summary] INFO: Tissue N 
Samples Successful Mean |CCC| Max |CCC| \n", + "[2025-09-25 13:05:27,835 - summary] INFO: ----------------------------------------------------------------------\n", + "[2025-09-25 13:05:27,835 - summary] INFO: whole_blood 755 44 0.079987 0.528125 \n", + "[2025-09-25 13:05:27,835 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:27,835 - summary] INFO: RUNTIME SUMMARY\n", + "[2025-09-25 13:05:27,836 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:27,836 - summary] INFO: Total runtime: 9.96 seconds (0.17 minutes)\n", + "[2025-09-25 13:05:27,836 - summary] INFO: Average runtime per tissue: 9.96 seconds\n", + "[2025-09-25 13:05:27,836 - summary] INFO: \n", + "Runtime by tissue:\n", + "[2025-09-25 13:05:27,837 - summary] INFO: Tissue Runtime (sec) Runtime (min) Status \n", + "[2025-09-25 13:05:27,837 - summary] INFO: ----------------------------------------------------------------------\n", + "[2025-09-25 13:05:27,837 - summary] INFO: whole_blood 9.96 0.17 Success \n", + "[2025-09-25 13:05:27,837 - summary] INFO: \n", + "Fastest: whole_blood (9.96 seconds)\n", + "[2025-09-25 13:05:27,837 - summary] INFO: Slowest: whole_blood (9.96 seconds)\n", + "[2025-09-25 13:05:27,838 - summary] INFO: Speed ratio: 1.0x\n", + "[2025-09-25 13:05:27,838 - summary] INFO: Runtime for RASSF2: 9.96 seconds (0.17 minutes)\n", + "[2025-09-25 13:05:27,838 - summary] INFO: \n", + "====================================================================================================\n", + "[2025-09-25 13:05:27,838 - summary] INFO: PROCESSING GENE 2/2: CYTIP\n", + "[2025-09-25 13:05:27,838 - summary] INFO: ====================================================================================================\n", + "[2025-09-25 13:05:27,839 - summary] INFO: \n", + "[1/1] Starting processing for CYTIP in whole_blood...\n", + "[2025-09-25 13:05:27,840 
- tissue_CYTIP_whole_blood] INFO: \n", + "============================================================\n", + "[2025-09-25 13:05:27,840 - tissue_CYTIP_whole_blood] INFO: Processing tissue: whole_blood\n", + "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: File: gtex_v8_data_whole_blood.pkl\n", + "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n", + "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: ============================================================\n", + "[2025-09-25 13:05:27,841 - tissue_CYTIP_whole_blood] INFO: Loading expression data...\n", + "[2025-09-25 13:05:27,981 - tissue_CYTIP_whole_blood] INFO: Expression data shape: (56200, 755)\n", + "[2025-09-25 13:05:27,984 - tissue_CYTIP_whole_blood] INFO: Gene ID for CYTIP: ENSG00000115165.9\n", + "[2025-09-25 13:05:27,986 - tissue_CYTIP_whole_blood] INFO: Number of samples: 755\n", + "[2025-09-25 13:05:27,987 - tissue_CYTIP_whole_blood] INFO: Common samples: 755\n", + "[2025-09-25 13:05:27,988 - tissue_CYTIP_whole_blood] INFO: Computing CCC between CYTIP expression and all metadata columns...\n", + "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Using 100000 permutations and 4 jobs\n", + "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Processing 66 metadata columns...\n", + "[2025-09-25 13:05:27,989 - tissue_CYTIP_whole_blood] INFO: Processing column 1/66: SUBJID\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.029344, p-value: 1.00e-05\n", + "\n", + "Completed processing whole_blood:\n", + " Total metadata columns: 66\n", + " Successful analyses: 44\n", + " Skipped/Failed: 22\n", + "Results for RASSF2 in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood_correlation_results.pkl\n", + "Log file for RASSF2 in whole_blood saved to: 
/mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n", + "Runtime for RASSF2 in whole_blood: 9.96 seconds (0.17 minutes)\n", + "\n", + "================================================================================\n", + "COMBINED RESULTS SUMMARY\n", + "================================================================================\n", + "Gene Symbol: RASSF2\n", + "Gene ID: ENSG00000101265.15\n", + "Permutations: 100,000\n", + "Tissues processed: 1\n", + "Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.pkl\n", + "Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_all_tissues_correlation_results.csv\n", + "\n", + "Total successful analyses across all tissues: 44\n", + "\n", + "================================================================================\n", + "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n", + "================================================================================\n", + "Tissue Metadata Column CCC Value P-value Significance \n", + "------------------------------------------------------------------------------------------\n", + "whole_blood SMTSISCH 0.528125 1.00e-05 *** \n", + "whole_blood DTHHRDY 0.464582 1.00e-05 *** \n", + "whole_blood SMNTERRT 0.250997 1.00e-05 *** \n", + "whole_blood SMNTRART 0.243071 1.00e-05 *** \n", + "whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n", + "whole_blood SME1MPRT 0.181940 1.00e-05 *** \n", + "whole_blood SMALTALG 0.177009 1.00e-05 *** \n", + "whole_blood SME2MPRT 0.172974 1.00e-05 *** \n", + "whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n", + "whole_blood SMMAPRT 0.168576 1.00e-05 *** \n", + "whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n", + "whole_blood SMCENTER 0.108148 1.00e-05 *** \n", + "whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n", + "whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n", + "whole_blood SMRIN 
0.048847 1.00e-05 *** \n", + "whole_blood SMRRNART 0.048437 1.00e-05 *** \n", + "whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n", + "whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n", + "whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n", + "whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n", + "\n", + "================================================================================\n", + "SUMMARY BY TISSUE\n", + "================================================================================\n", + "Tissue N Samples Successful Mean |CCC| Max |CCC| \n", + "----------------------------------------------------------------------\n", + "whole_blood 755 44 0.079987 0.528125 \n", + "\n", + "================================================================================\n", + "RUNTIME SUMMARY\n", + "================================================================================\n", + "Total runtime: 9.96 seconds (0.17 minutes)\n", + "Average runtime per tissue: 9.96 seconds\n", + "\n", + "Runtime by tissue:\n", + "Tissue Runtime (sec) Runtime (min) Status \n", + "----------------------------------------------------------------------\n", + "whole_blood 9.96 0.17 Success \n", + "\n", + "Fastest: whole_blood (9.96 seconds)\n", + "Slowest: whole_blood (9.96 seconds)\n", + "Speed ratio: 1.0x\n", + "Runtime for RASSF2: 9.96 seconds (0.17 minutes)\n", + "\n", + "====================================================================================================\n", + "PROCESSING GENE 2/2: CYTIP\n", + "====================================================================================================\n", + "\n", + "[1/1] Starting processing for CYTIP in whole_blood...\n", + "\n", + "============================================================\n", + "Processing tissue: whole_blood\n", + "File: gtex_v8_data_whole_blood.pkl\n", + "Log file: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n", + 
"============================================================\n", + "Loading expression data...\n", + "Expression data shape: (56200, 755)\n", + "Gene ID for CYTIP: ENSG00000115165.9\n", + "Number of samples: 755\n", + "Common samples: 755\n", + "Computing CCC between CYTIP expression and all metadata columns...\n", + "Using 100000 permutations and 4 jobs\n", + "Processing 66 metadata columns...\n", + "Processing column 1/66: SUBJID\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:28,086 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000000, p-value: 1.00e+00\n", + "[2025-09-25 13:05:28,086 - tissue_CYTIP_whole_blood] INFO: Processing column 2/66: SEX\n", + "[2025-09-25 13:05:28,156 - tissue_CYTIP_whole_blood] INFO: CCC: 0.001409, p-value: 3.98e-01\n", + "[2025-09-25 13:05:28,157 - tissue_CYTIP_whole_blood] INFO: Processing column 3/66: AGE\n", + "[2025-09-25 13:05:28,228 - tissue_CYTIP_whole_blood] INFO: CCC: 0.018997, p-value: 1.00e-05\n", + "[2025-09-25 13:05:28,228 - tissue_CYTIP_whole_blood] INFO: Processing column 4/66: DTHHRDY\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.000000, p-value: 1.00e+00\n", + "Processing column 2/66: SEX\n", + " CCC: 0.001409, p-value: 3.98e-01\n", + "Processing column 3/66: AGE\n", + " CCC: 0.018997, p-value: 1.00e-05\n", + "Processing column 4/66: DTHHRDY\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:28,481 - tissue_CYTIP_whole_blood] INFO: CCC: 0.184226, p-value: 1.00e-05\n", + "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Processing column 5/66: SMATSSCR\n", + "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Skipping SMATSSCR: all values are NaN\n", + "[2025-09-25 13:05:28,482 - tissue_CYTIP_whole_blood] INFO: Processing column 6/66: SMCENTER\n", + "[2025-09-25 13:05:28,551 - tissue_CYTIP_whole_blood] INFO: CCC: 0.084684, p-value: 1.00e-05\n", + "[2025-09-25 13:05:28,552 - 
tissue_CYTIP_whole_blood] INFO: Processing column 7/66: SMPTHNTS\n", + "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO: Skipping SMPTHNTS: all values are NaN\n", + "[2025-09-25 13:05:28,552 - tissue_CYTIP_whole_blood] INFO: Processing column 8/66: SMRIN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.184226, p-value: 1.00e-05\n", + "Processing column 5/66: SMATSSCR\n", + " Skipping SMATSSCR: all values are NaN\n", + "Processing column 6/66: SMCENTER\n", + " CCC: 0.084684, p-value: 1.00e-05\n", + "Processing column 7/66: SMPTHNTS\n", + " Skipping SMPTHNTS: all values are NaN\n", + "Processing column 8/66: SMRIN\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:28,806 - tissue_CYTIP_whole_blood] INFO: CCC: 0.003196, p-value: 5.68e-01\n", + "[2025-09-25 13:05:28,806 - tissue_CYTIP_whole_blood] INFO: Processing column 9/66: SMTS\n", + "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO: Skipping SMTS: only 1 unique value(s)\n", + "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO: Processing column 10/66: SMTSD\n", + "[2025-09-25 13:05:28,807 - tissue_CYTIP_whole_blood] INFO: Skipping SMTSD: only 1 unique value(s)\n", + "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Processing column 11/66: SMUBRID\n", + "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Skipping SMUBRID: only 1 unique value(s)\n", + "[2025-09-25 13:05:28,808 - tissue_CYTIP_whole_blood] INFO: Processing column 12/66: SMTSISCH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.003196, p-value: 5.68e-01\n", + "Processing column 9/66: SMTS\n", + " Skipping SMTS: only 1 unique value(s)\n", + "Processing column 10/66: SMTSD\n", + " Skipping SMTSD: only 1 unique value(s)\n", + "Processing column 11/66: SMUBRID\n", + " Skipping SMUBRID: only 1 unique value(s)\n", + "Processing column 12/66: SMTSISCH\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "[2025-09-25 13:05:29,062 - tissue_CYTIP_whole_blood] INFO: CCC: 0.215092, p-value: 1.00e-05\n", + "[2025-09-25 13:05:29,062 - tissue_CYTIP_whole_blood] INFO: Processing column 13/66: SMTSPAX\n", + "[2025-09-25 13:05:29,063 - tissue_CYTIP_whole_blood] INFO: Skipping SMTSPAX: all values are NaN\n", + "[2025-09-25 13:05:29,063 - tissue_CYTIP_whole_blood] INFO: Processing column 14/66: SMNABTCH\n", + "[2025-09-25 13:05:29,128 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000304, p-value: 1.00e-05\n", + "[2025-09-25 13:05:29,128 - tissue_CYTIP_whole_blood] INFO: Processing column 15/66: SMNABTCHT\n", + "[2025-09-25 13:05:29,129 - tissue_CYTIP_whole_blood] INFO: Skipping SMNABTCHT: only 1 unique value(s)\n", + "[2025-09-25 13:05:29,129 - tissue_CYTIP_whole_blood] INFO: Processing column 16/66: SMNABTCHD\n", + "[2025-09-25 13:05:29,197 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000256, p-value: 1.00e-05\n", + "[2025-09-25 13:05:29,197 - tissue_CYTIP_whole_blood] INFO: Processing column 17/66: SMGEBTCH\n", + "[2025-09-25 13:05:29,258 - tissue_CYTIP_whole_blood] INFO: CCC: 0.001533, p-value: 1.00e-05\n", + "[2025-09-25 13:05:29,258 - tissue_CYTIP_whole_blood] INFO: Processing column 18/66: SMGEBTCHD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.215092, p-value: 1.00e-05\n", + "Processing column 13/66: SMTSPAX\n", + " Skipping SMTSPAX: all values are NaN\n", + "Processing column 14/66: SMNABTCH\n", + " CCC: 0.000304, p-value: 1.00e-05\n", + "Processing column 15/66: SMNABTCHT\n", + " Skipping SMNABTCHT: only 1 unique value(s)\n", + "Processing column 16/66: SMNABTCHD\n", + " CCC: 0.000256, p-value: 1.00e-05\n", + "Processing column 17/66: SMGEBTCH\n", + " CCC: 0.001533, p-value: 1.00e-05\n", + "Processing column 18/66: SMGEBTCHD\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:29,317 - tissue_CYTIP_whole_blood] INFO: CCC: 0.002104, p-value: 1.00e-05\n", + 
"[2025-09-25 13:05:29,318 - tissue_CYTIP_whole_blood] INFO: Processing column 19/66: SMGEBTCHT\n", + "[2025-09-25 13:05:29,318 - tissue_CYTIP_whole_blood] INFO: Skipping SMGEBTCHT: only 1 unique value(s)\n", + "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Processing column 20/66: SMAFRZE\n", + "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Skipping SMAFRZE: only 1 unique value(s)\n", + "[2025-09-25 13:05:29,319 - tissue_CYTIP_whole_blood] INFO: Processing column 21/66: SMGTC\n", + "[2025-09-25 13:05:29,320 - tissue_CYTIP_whole_blood] INFO: Skipping SMGTC: all values are NaN\n", + "[2025-09-25 13:05:29,320 - tissue_CYTIP_whole_blood] INFO: Processing column 22/66: SME2MPRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.002104, p-value: 1.00e-05\n", + "Processing column 19/66: SMGEBTCHT\n", + " Skipping SMGEBTCHT: only 1 unique value(s)\n", + "Processing column 20/66: SMAFRZE\n", + " Skipping SMAFRZE: only 1 unique value(s)\n", + "Processing column 21/66: SMGTC\n", + " Skipping SMGTC: all values are NaN\n", + "Processing column 22/66: SME2MPRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:29,573 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021744, p-value: 5.00e-05\n", + "[2025-09-25 13:05:29,574 - tissue_CYTIP_whole_blood] INFO: Processing column 23/66: SMCHMPRS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.021744, p-value: 5.00e-05\n", + "Processing column 23/66: SMCHMPRS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:29,828 - tissue_CYTIP_whole_blood] INFO: CCC: 0.015946, p-value: 4.50e-04\n", + "[2025-09-25 13:05:29,828 - tissue_CYTIP_whole_blood] INFO: Processing column 24/66: SMNTRART\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.015946, p-value: 4.50e-04\n", + "Processing column 24/66: SMNTRART\n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:30,082 - tissue_CYTIP_whole_blood] INFO: CCC: 0.024407, p-value: 3.00e-05\n", + "[2025-09-25 13:05:30,083 - tissue_CYTIP_whole_blood] INFO: Processing column 25/66: SMNUMGPS\n", + "[2025-09-25 13:05:30,083 - tissue_CYTIP_whole_blood] INFO: Skipping SMNUMGPS: all values are NaN\n", + "[2025-09-25 13:05:30,084 - tissue_CYTIP_whole_blood] INFO: Processing column 26/66: SMMAPRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.024407, p-value: 3.00e-05\n", + "Processing column 25/66: SMNUMGPS\n", + " Skipping SMNUMGPS: all values are NaN\n", + "Processing column 26/66: SMMAPRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:30,338 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021052, p-value: 7.00e-05\n", + "[2025-09-25 13:05:30,339 - tissue_CYTIP_whole_blood] INFO: Processing column 27/66: SMEXNCRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.021052, p-value: 7.00e-05\n", + "Processing column 27/66: SMEXNCRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO: CCC: 0.126241, p-value: 1.00e-05\n", + "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO: Processing column 28/66: SM550NRM\n", + "[2025-09-25 13:05:30,593 - tissue_CYTIP_whole_blood] INFO: Skipping SM550NRM: all values are NaN\n", + "[2025-09-25 13:05:30,594 - tissue_CYTIP_whole_blood] INFO: Processing column 29/66: SMGNSDTC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.126241, p-value: 1.00e-05\n", + "Processing column 28/66: SM550NRM\n", + " Skipping SM550NRM: all values are NaN\n", + "Processing column 29/66: SMGNSDTC\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:30,847 - tissue_CYTIP_whole_blood] INFO: CCC: 0.050841, p-value: 1.00e-05\n", + 
"[2025-09-25 13:05:30,847 - tissue_CYTIP_whole_blood] INFO: Processing column 30/66: SMUNMPRT\n", + "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO: Skipping SMUNMPRT: only 1 unique value(s)\n", + "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO: Processing column 31/66: SM350NRM\n", + "[2025-09-25 13:05:30,848 - tissue_CYTIP_whole_blood] INFO: Skipping SM350NRM: all values are NaN\n", + "[2025-09-25 13:05:30,849 - tissue_CYTIP_whole_blood] INFO: Processing column 32/66: SMRDLGTH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.050841, p-value: 1.00e-05\n", + "Processing column 30/66: SMUNMPRT\n", + " Skipping SMUNMPRT: only 1 unique value(s)\n", + "Processing column 31/66: SM350NRM\n", + " Skipping SM350NRM: all values are NaN\n", + "Processing column 32/66: SMRDLGTH\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:31,103 - tissue_CYTIP_whole_blood] INFO: CCC: 0.000003, p-value: 9.42e-01\n", + "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Processing column 33/66: SMMNCPB\n", + "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Skipping SMMNCPB: all values are NaN\n", + "[2025-09-25 13:05:31,104 - tissue_CYTIP_whole_blood] INFO: Processing column 34/66: SME1MMRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.000003, p-value: 9.42e-01\n", + "Processing column 33/66: SMMNCPB\n", + " Skipping SMMNCPB: all values are NaN\n", + "Processing column 34/66: SME1MMRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:31,359 - tissue_CYTIP_whole_blood] INFO: CCC: 0.005089, p-value: 2.16e-01\n", + "[2025-09-25 13:05:31,359 - tissue_CYTIP_whole_blood] INFO: Processing column 35/66: SMSFLGTH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.005089, p-value: 2.16e-01\n", + "Processing column 35/66: SMSFLGTH\n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:31,614 - tissue_CYTIP_whole_blood] INFO: CCC: 0.015322, p-value: 7.10e-04\n", + "[2025-09-25 13:05:31,615 - tissue_CYTIP_whole_blood] INFO: Processing column 36/66: SMESTLBS\n", + "[2025-09-25 13:05:31,615 - tissue_CYTIP_whole_blood] INFO: Skipping SMESTLBS: only 1 unique value(s)\n", + "[2025-09-25 13:05:31,616 - tissue_CYTIP_whole_blood] INFO: Processing column 37/66: SMMPPD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.015322, p-value: 7.10e-04\n", + "Processing column 36/66: SMESTLBS\n", + " Skipping SMESTLBS: only 1 unique value(s)\n", + "Processing column 37/66: SMMPPD\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:31,870 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007147, p-value: 5.32e-02\n", + "[2025-09-25 13:05:31,870 - tissue_CYTIP_whole_blood] INFO: Processing column 38/66: SMNTERRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007147, p-value: 5.32e-02\n", + "Processing column 38/66: SMNTERRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:32,125 - tissue_CYTIP_whole_blood] INFO: CCC: 0.023433, p-value: 4.00e-05\n", + "[2025-09-25 13:05:32,126 - tissue_CYTIP_whole_blood] INFO: Processing column 39/66: SMRRNANM\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.023433, p-value: 4.00e-05\n", + "Processing column 39/66: SMRRNANM\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:32,379 - tissue_CYTIP_whole_blood] INFO: CCC: 0.005677, p-value: 1.45e-01\n", + "[2025-09-25 13:05:32,379 - tissue_CYTIP_whole_blood] INFO: Processing column 40/66: SMRDTTL\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.005677, p-value: 1.45e-01\n", + "Processing column 40/66: SMRDTTL\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "[2025-09-25 13:05:32,632 - tissue_CYTIP_whole_blood] INFO: CCC: 0.008033, p-value: 2.92e-02\n", + "[2025-09-25 13:05:32,633 - tissue_CYTIP_whole_blood] INFO: Processing column 41/66: SMVQCFL\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.008033, p-value: 2.92e-02\n", + "Processing column 41/66: SMVQCFL\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:32,887 - tissue_CYTIP_whole_blood] INFO: CCC: 0.003136, p-value: 6.00e-01\n", + "[2025-09-25 13:05:32,888 - tissue_CYTIP_whole_blood] INFO: Processing column 42/66: SMMNCV\n", + "[2025-09-25 13:05:32,889 - tissue_CYTIP_whole_blood] INFO: Skipping SMMNCV: all values are NaN\n", + "[2025-09-25 13:05:32,889 - tissue_CYTIP_whole_blood] INFO: Processing column 43/66: SMTRSCPT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.003136, p-value: 6.00e-01\n", + "Processing column 42/66: SMMNCV\n", + " Skipping SMMNCV: all values are NaN\n", + "Processing column 43/66: SMTRSCPT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:33,142 - tissue_CYTIP_whole_blood] INFO: CCC: 0.051533, p-value: 1.00e-05\n", + "[2025-09-25 13:05:33,143 - tissue_CYTIP_whole_blood] INFO: Processing column 44/66: SMMPPDPR\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.051533, p-value: 1.00e-05\n", + "Processing column 44/66: SMMPPDPR\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO: CCC: 0.005880, p-value: 1.29e-01\n", + "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO: Processing column 45/66: SMCGLGTH\n", + "[2025-09-25 13:05:33,397 - tissue_CYTIP_whole_blood] INFO: Skipping SMCGLGTH: all values are NaN\n", + "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Processing column 46/66: SMGAPPCT\n", + 
"[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Skipping SMGAPPCT: all values are NaN\n", + "[2025-09-25 13:05:33,398 - tissue_CYTIP_whole_blood] INFO: Processing column 47/66: SMUNPDRD\n", + "[2025-09-25 13:05:33,399 - tissue_CYTIP_whole_blood] INFO: Skipping SMUNPDRD: only 1 unique value(s)\n", + "[2025-09-25 13:05:33,399 - tissue_CYTIP_whole_blood] INFO: Processing column 48/66: SMNTRNRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.005880, p-value: 1.29e-01\n", + "Processing column 45/66: SMCGLGTH\n", + " Skipping SMCGLGTH: all values are NaN\n", + "Processing column 46/66: SMGAPPCT\n", + " Skipping SMGAPPCT: all values are NaN\n", + "Processing column 47/66: SMUNPDRD\n", + " Skipping SMUNPDRD: only 1 unique value(s)\n", + "Processing column 48/66: SMNTRNRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:33,653 - tissue_CYTIP_whole_blood] INFO: CCC: 0.261762, p-value: 1.00e-05\n", + "[2025-09-25 13:05:33,653 - tissue_CYTIP_whole_blood] INFO: Processing column 49/66: SMMPUNRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.261762, p-value: 1.00e-05\n", + "Processing column 49/66: SMMPUNRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:33,907 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021052, p-value: 7.00e-05\n", + "[2025-09-25 13:05:33,907 - tissue_CYTIP_whole_blood] INFO: Processing column 50/66: SMEXPEFF\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.021052, p-value: 7.00e-05\n", + "Processing column 50/66: SMEXPEFF\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:34,163 - tissue_CYTIP_whole_blood] INFO: CCC: 0.086945, p-value: 1.00e-05\n", + "[2025-09-25 13:05:34,163 - tissue_CYTIP_whole_blood] INFO: Processing column 51/66: SMMPPDUN\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + " CCC: 0.086945, p-value: 1.00e-05\n", + "Processing column 51/66: SMMPPDUN\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:34,419 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007147, p-value: 5.32e-02\n", + "[2025-09-25 13:05:34,419 - tissue_CYTIP_whole_blood] INFO: Processing column 52/66: SME2MMRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007147, p-value: 5.32e-02\n", + "Processing column 52/66: SME2MMRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:34,674 - tissue_CYTIP_whole_blood] INFO: CCC: 0.004187, p-value: 3.68e-01\n", + "[2025-09-25 13:05:34,675 - tissue_CYTIP_whole_blood] INFO: Processing column 53/66: SME2ANTI\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.004187, p-value: 3.68e-01\n", + "Processing column 53/66: SME2ANTI\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:34,929 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007334, p-value: 4.67e-02\n", + "[2025-09-25 13:05:34,930 - tissue_CYTIP_whole_blood] INFO: Processing column 54/66: SMALTALG\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007334, p-value: 4.67e-02\n", + "Processing column 54/66: SMALTALG\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:35,186 - tissue_CYTIP_whole_blood] INFO: CCC: 0.038381, p-value: 1.00e-05\n", + "[2025-09-25 13:05:35,187 - tissue_CYTIP_whole_blood] INFO: Processing column 55/66: SME2SNSE\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.038381, p-value: 1.00e-05\n", + "Processing column 55/66: SME2SNSE\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:35,441 - tissue_CYTIP_whole_blood] INFO: CCC: 0.006734, p-value: 7.08e-02\n", + "[2025-09-25 13:05:35,442 - 
tissue_CYTIP_whole_blood] INFO: Processing column 56/66: SMMFLGTH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.006734, p-value: 7.08e-02\n", + "Processing column 56/66: SMMFLGTH\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:35,696 - tissue_CYTIP_whole_blood] INFO: CCC: 0.010863, p-value: 4.63e-03\n", + "[2025-09-25 13:05:35,696 - tissue_CYTIP_whole_blood] INFO: Processing column 57/66: SME1ANTI\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.010863, p-value: 4.63e-03\n", + "Processing column 57/66: SME1ANTI\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:35,950 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007210, p-value: 5.04e-02\n", + "[2025-09-25 13:05:35,951 - tissue_CYTIP_whole_blood] INFO: Processing column 58/66: SMSPLTRD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007210, p-value: 5.04e-02\n", + "Processing column 58/66: SMSPLTRD\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:36,208 - tissue_CYTIP_whole_blood] INFO: CCC: 0.030117, p-value: 1.00e-05\n", + "[2025-09-25 13:05:36,208 - tissue_CYTIP_whole_blood] INFO: Processing column 59/66: SMBSMMRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.030117, p-value: 1.00e-05\n", + "Processing column 59/66: SMBSMMRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:36,464 - tissue_CYTIP_whole_blood] INFO: CCC: 0.004293, p-value: 3.48e-01\n", + "[2025-09-25 13:05:36,465 - tissue_CYTIP_whole_blood] INFO: Processing column 60/66: SME1SNSE\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.004293, p-value: 3.48e-01\n", + "Processing column 60/66: SME1SNSE\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"[2025-09-25 13:05:36,719 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007285, p-value: 4.87e-02\n", + "[2025-09-25 13:05:36,719 - tissue_CYTIP_whole_blood] INFO: Processing column 61/66: SME1PCTS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007285, p-value: 4.87e-02\n", + "Processing column 61/66: SME1PCTS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:36,973 - tissue_CYTIP_whole_blood] INFO: CCC: 0.004663, p-value: 2.81e-01\n", + "[2025-09-25 13:05:36,973 - tissue_CYTIP_whole_blood] INFO: Processing column 62/66: SMRRNART\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.004663, p-value: 2.81e-01\n", + "Processing column 62/66: SMRRNART\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:37,228 - tissue_CYTIP_whole_blood] INFO: CCC: 0.013729, p-value: 1.10e-03\n", + "[2025-09-25 13:05:37,229 - tissue_CYTIP_whole_blood] INFO: Processing column 63/66: SME1MPRT\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.013729, p-value: 1.10e-03\n", + "Processing column 63/66: SME1MPRT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:37,483 - tissue_CYTIP_whole_blood] INFO: CCC: 0.021952, p-value: 4.00e-05\n", + "[2025-09-25 13:05:37,483 - tissue_CYTIP_whole_blood] INFO: Processing column 64/66: SMNUM5CD\n", + "[2025-09-25 13:05:37,484 - tissue_CYTIP_whole_blood] INFO: Skipping SMNUM5CD: all values are NaN\n", + "[2025-09-25 13:05:37,484 - tissue_CYTIP_whole_blood] INFO: Processing column 65/66: SMDPMPRT\n", + "[2025-09-25 13:05:37,485 - tissue_CYTIP_whole_blood] INFO: Skipping SMDPMPRT: only 1 unique value(s)\n", + "[2025-09-25 13:05:37,485 - tissue_CYTIP_whole_blood] INFO: Processing column 66/66: SME2PCTS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.021952, p-value: 4.00e-05\n", + 
"Processing column 64/66: SMNUM5CD\n", + " Skipping SMNUM5CD: all values are NaN\n", + "Processing column 65/66: SMDPMPRT\n", + " Skipping SMDPMPRT: only 1 unique value(s)\n", + "Processing column 66/66: SME2PCTS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-09-25 13:05:37,739 - tissue_CYTIP_whole_blood] INFO: CCC: 0.007812, p-value: 3.38e-02\n", + "[2025-09-25 13:05:37,740 - tissue_CYTIP_whole_blood] INFO: \n", + "Completed processing whole_blood:\n", + "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO: Total metadata columns: 66\n", + "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO: Successful analyses: 44\n", + "[2025-09-25 13:05:37,741 - tissue_CYTIP_whole_blood] INFO: Skipped/Failed: 22\n", + "[2025-09-25 13:05:37,765 - summary] INFO: Results for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood_correlation_results.pkl\n", + "[2025-09-25 13:05:37,765 - summary] INFO: Log file for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n", + "[2025-09-25 13:05:37,766 - summary] INFO: Runtime for CYTIP in whole_blood: 9.93 seconds (0.17 minutes)\n", + "[2025-09-25 13:05:37,767 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:37,767 - summary] INFO: COMBINED RESULTS SUMMARY\n", + "[2025-09-25 13:05:37,767 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:37,768 - summary] INFO: Gene Symbol: CYTIP\n", + "[2025-09-25 13:05:37,768 - summary] INFO: Gene ID: ENSG00000115165.9\n", + "[2025-09-25 13:05:37,768 - summary] INFO: Permutations: 100,000\n", + "[2025-09-25 13:05:37,768 - summary] INFO: Tissues processed: 1\n", + "[2025-09-25 13:05:37,769 - summary] INFO: Combined results saved to: 
/mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.pkl\n", + "[2025-09-25 13:05:37,769 - summary] INFO: Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.csv\n", + "[2025-09-25 13:05:37,769 - summary] INFO: \n", + "Total successful analyses across all tissues: 44\n", + "[2025-09-25 13:05:37,769 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:37,770 - summary] INFO: TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n", + "[2025-09-25 13:05:37,771 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:37,771 - summary] INFO: Tissue Metadata Column CCC Value P-value Significance \n", + "[2025-09-25 13:05:37,771 - summary] INFO: ------------------------------------------------------------------------------------------\n", + "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n", + "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood SMTSISCH 0.215092 1.00e-05 *** \n", + "[2025-09-25 13:05:37,772 - summary] INFO: whole_blood DTHHRDY 0.184226 1.00e-05 *** \n", + "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n", + "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n", + "[2025-09-25 13:05:37,773 - summary] INFO: whole_blood SMCENTER 0.084684 1.00e-05 *** \n", + "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n", + "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n", + "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMALTALG 0.038381 1.00e-05 *** \n", + "[2025-09-25 13:05:37,774 - summary] INFO: whole_blood SMSPLTRD 0.030117 1.00e-05 *** \n", + "[2025-09-25 13:05:37,775 - summary] INFO: 
whole_blood SMNTRART 0.024407 3.00e-05 *** \n", + "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood SMNTERRT 0.023433 4.00e-05 *** \n", + "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood SME1MPRT 0.021952 4.00e-05 *** \n", + "[2025-09-25 13:05:37,775 - summary] INFO: whole_blood SME2MPRT 0.021744 5.00e-05 *** \n", + "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood SMMAPRT 0.021052 7.00e-05 *** \n", + "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood SMMPUNRT 0.021052 7.00e-05 *** \n", + "[2025-09-25 13:05:37,776 - summary] INFO: whole_blood AGE 0.018997 1.00e-05 *** \n", + "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood SMCHMPRS 0.015946 4.50e-04 *** \n", + "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood SMSFLGTH 0.015322 7.10e-04 *** \n", + "[2025-09-25 13:05:37,777 - summary] INFO: whole_blood SMRRNART 0.013729 1.10e-03 ** \n", + "[2025-09-25 13:05:37,777 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:37,778 - summary] INFO: SUMMARY BY TISSUE\n", + "[2025-09-25 13:05:37,778 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:37,778 - summary] INFO: Tissue N Samples Successful Mean |CCC| Max |CCC| \n", + "[2025-09-25 13:05:37,778 - summary] INFO: ----------------------------------------------------------------------\n", + "[2025-09-25 13:05:37,779 - summary] INFO: whole_blood 755 44 0.032699 0.261762 \n", + "[2025-09-25 13:05:37,779 - summary] INFO: \n", + "================================================================================\n", + "[2025-09-25 13:05:37,780 - summary] INFO: RUNTIME SUMMARY\n", + "[2025-09-25 13:05:37,780 - summary] INFO: ================================================================================\n", + "[2025-09-25 13:05:37,780 - summary] INFO: Total runtime: 9.93 seconds (0.17 minutes)\n", + "[2025-09-25 13:05:37,780 - summary] INFO: Average 
runtime per tissue: 9.93 seconds\n", + "[2025-09-25 13:05:37,780 - summary] INFO: \n", + "Runtime by tissue:\n", + "[2025-09-25 13:05:37,781 - summary] INFO: Tissue Runtime (sec) Runtime (min) Status \n", + "[2025-09-25 13:05:37,781 - summary] INFO: ----------------------------------------------------------------------\n", + "[2025-09-25 13:05:37,781 - summary] INFO: whole_blood 9.93 0.17 Success \n", + "[2025-09-25 13:05:37,781 - summary] INFO: \n", + "Fastest: whole_blood (9.93 seconds)\n", + "[2025-09-25 13:05:37,781 - summary] INFO: Slowest: whole_blood (9.93 seconds)\n", + "[2025-09-25 13:05:37,781 - summary] INFO: Speed ratio: 1.0x\n", + "[2025-09-25 13:05:37,782 - summary] INFO: Runtime for CYTIP: 9.93 seconds (0.17 minutes)\n", + "[2025-09-25 13:05:37,782 - summary] INFO: \n", + "====================================================================================================\n", + "[2025-09-25 13:05:37,782 - summary] INFO: OVERALL RESULTS SUMMARY\n", + "[2025-09-25 13:05:37,782 - summary] INFO: ====================================================================================================\n", + "[2025-09-25 13:05:37,783 - summary] INFO: Gene symbols processed: RASSF2, CYTIP\n", + "[2025-09-25 13:05:37,783 - summary] INFO: Total genes: 2\n", + "[2025-09-25 13:05:37,783 - summary] INFO: Permutations: 100,000\n", + "[2025-09-25 13:05:37,783 - summary] INFO: Tissues per gene: 1\n", + "[2025-09-25 13:05:37,784 - summary] INFO: All genes combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.pkl\n", + "[2025-09-25 13:05:37,785 - summary] INFO: All genes combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.csv\n", + "[2025-09-25 13:05:37,785 - summary] INFO: \n", + "Log files created:\n", + "[2025-09-25 13:05:37,785 - summary] INFO: RASSF2 - whole_blood: 
/mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n", + "[2025-09-25 13:05:37,786 - summary] INFO: CYTIP - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n", + "[2025-09-25 13:05:37,786 - summary] INFO: \n", + "Total successful analyses across all genes and tissues: 88\n", + "[2025-09-25 13:05:37,786 - summary] INFO: \n", + "====================================================================================================\n", + "[2025-09-25 13:05:37,787 - summary] INFO: TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)\n", + "[2025-09-25 13:05:37,787 - summary] INFO: ====================================================================================================\n", + "[2025-09-25 13:05:37,788 - summary] INFO: Gene Tissue Metadata Column CCC Value P-value Significance \n", + "[2025-09-25 13:05:37,788 - summary] INFO: --------------------------------------------------------------------------------------------------------------\n", + "[2025-09-25 13:05:37,788 - summary] INFO: RASSF2 whole_blood SMTSISCH 0.528125 1.00e-05 *** \n", + "[2025-09-25 13:05:37,788 - summary] INFO: RASSF2 whole_blood DTHHRDY 0.464582 1.00e-05 *** \n", + "[2025-09-25 13:05:37,789 - summary] INFO: CYTIP whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n", + "[2025-09-25 13:05:37,789 - summary] INFO: RASSF2 whole_blood SMNTERRT 0.250997 1.00e-05 *** \n", + "[2025-09-25 13:05:37,789 - summary] INFO: RASSF2 whole_blood SMNTRART 0.243071 1.00e-05 *** \n", + "[2025-09-25 13:05:37,789 - summary] INFO: CYTIP whole_blood SMTSISCH 0.215092 1.00e-05 *** \n", + "[2025-09-25 13:05:37,790 - summary] INFO: RASSF2 whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n", + "[2025-09-25 13:05:37,790 - summary] INFO: CYTIP whole_blood DTHHRDY 0.184226 1.00e-05 *** \n", + "[2025-09-25 13:05:37,790 - summary] INFO: RASSF2 whole_blood SME1MPRT 0.181940 1.00e-05 *** \n", + "[2025-09-25 13:05:37,791 - summary] INFO: 
RASSF2 whole_blood SMALTALG 0.177009 1.00e-05 *** \n", + "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SME2MPRT 0.172974 1.00e-05 *** \n", + "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n", + "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SMMAPRT 0.168576 1.00e-05 *** \n", + "[2025-09-25 13:05:37,791 - summary] INFO: RASSF2 whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n", + "[2025-09-25 13:05:37,792 - summary] INFO: CYTIP whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n", + "[2025-09-25 13:05:37,792 - summary] INFO: RASSF2 whole_blood SMCENTER 0.108148 1.00e-05 *** \n", + "[2025-09-25 13:05:37,792 - summary] INFO: CYTIP whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n", + "[2025-09-25 13:05:37,793 - summary] INFO: CYTIP whole_blood SMCENTER 0.084684 1.00e-05 *** \n", + "[2025-09-25 13:05:37,793 - summary] INFO: RASSF2 whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n", + "[2025-09-25 13:05:37,793 - summary] INFO: RASSF2 whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n", + "[2025-09-25 13:05:37,793 - summary] INFO: CYTIP whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n", + "[2025-09-25 13:05:37,794 - summary] INFO: CYTIP whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n", + "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2 whole_blood SMRIN 0.048847 1.00e-05 *** \n", + "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2 whole_blood SMRRNART 0.048437 1.00e-05 *** \n", + "[2025-09-25 13:05:37,794 - summary] INFO: RASSF2 whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n", + "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n", + "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n", + "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n", + "[2025-09-25 13:05:37,795 - summary] INFO: RASSF2 whole_blood AGE 0.039824 1.00e-05 *** \n", + "[2025-09-25 13:05:37,796 - summary] INFO: CYTIP 
whole_blood SMALTALG 0.038381 1.00e-05 *** \n", + "[2025-09-25 13:05:37,796 - summary] INFO: \n", + "====================================================================================================\n", + "[2025-09-25 13:05:37,796 - summary] INFO: SUMMARY BY GENE\n", + "[2025-09-25 13:05:37,796 - summary] INFO: ====================================================================================================\n", + "[2025-09-25 13:05:37,797 - summary] INFO: \n", + "Gene: RASSF2 (ID: ENSG00000101265.15)\n", + "[2025-09-25 13:05:37,797 - summary] INFO: Tissues processed: 1\n", + "[2025-09-25 13:05:37,797 - summary] INFO: Successful analyses: 44\n", + "[2025-09-25 13:05:37,797 - summary] INFO: Mean |CCC|: 0.079987\n", + "[2025-09-25 13:05:37,797 - summary] INFO: Max |CCC|: 0.528125\n", + "[2025-09-25 13:05:37,798 - summary] INFO: Top correlation: SMTSISCH in whole_blood (CCC: 0.528125, p: 1.00e-05)\n", + "[2025-09-25 13:05:37,798 - summary] INFO: Runtime: 9.96 seconds (0.17 minutes)\n", + "[2025-09-25 13:05:37,799 - summary] INFO: \n", + "Gene: CYTIP (ID: ENSG00000115165.9)\n", + "[2025-09-25 13:05:37,799 - summary] INFO: Tissues processed: 1\n", + "[2025-09-25 13:05:37,799 - summary] INFO: Successful analyses: 44\n", + "[2025-09-25 13:05:37,800 - summary] INFO: Mean |CCC|: 0.032699\n", + "[2025-09-25 13:05:37,800 - summary] INFO: Max |CCC|: 0.261762\n", + "[2025-09-25 13:05:37,800 - summary] INFO: Top correlation: SMNTRNRT in whole_blood (CCC: 0.261762, p: 1.00e-05)\n", + "[2025-09-25 13:05:37,801 - summary] INFO: Runtime: 9.93 seconds (0.17 minutes)\n", + "[2025-09-25 13:05:37,801 - summary] INFO: \n", + "====================================================================================================\n", + "[2025-09-25 13:05:37,801 - summary] INFO: SUMMARY BY TISSUE (across all genes)\n", + "[2025-09-25 13:05:37,801 - summary] INFO: ====================================================================================================\n", + "[2025-09-25 
13:05:37,801 - summary] INFO: Tissue N Genes Successful Mean |CCC| Max |CCC| \n", + "[2025-09-25 13:05:37,801 - summary] INFO: ---------------------------------------------------------------------------\n", + "[2025-09-25 13:05:37,802 - summary] INFO: whole_blood 2 88 0.056343 0.528125 \n", + "[2025-09-25 13:05:37,802 - summary] INFO: \n", + "====================================================================================================\n", + "[2025-09-25 13:05:37,803 - summary] INFO: RUNTIME SUMMARY\n", + "[2025-09-25 13:05:37,803 - summary] INFO: ====================================================================================================\n", + "[2025-09-25 13:05:37,803 - summary] INFO: Total runtime: 19.92 seconds (0.33 minutes)\n", + "[2025-09-25 13:05:37,803 - summary] INFO: Average runtime per gene: 9.96 seconds\n", + "[2025-09-25 13:05:37,803 - summary] INFO: Total gene-tissue combinations: 2\n", + "[2025-09-25 13:05:37,803 - summary] INFO: \n", + "Runtime by gene:\n", + "[2025-09-25 13:05:37,803 - summary] INFO: Gene Runtime (sec) Runtime (min) Tissues Successful \n", + "[2025-09-25 13:05:37,804 - summary] INFO: ---------------------------------------------------------------------------\n", + "[2025-09-25 13:05:37,804 - summary] INFO: RASSF2 9.96 0.17 1 1 \n", + "[2025-09-25 13:05:37,804 - summary] INFO: CYTIP 9.93 0.17 1 1 \n", + "[2025-09-25 13:05:37,804 - summary] INFO: \n", + "Average runtime by tissue (across all genes):\n", + "[2025-09-25 13:05:37,805 - summary] INFO: Tissue Avg Runtime (sec) Avg Runtime (min) N Runs Min Max \n", + "[2025-09-25 13:05:37,805 - summary] INFO: -----------------------------------------------------------------------------------------------\n", + "[2025-09-25 13:05:37,805 - summary] INFO: whole_blood 9.94 0.17 2 9.93 9.96 \n", + "[2025-09-25 13:05:37,805 - summary] INFO: \n", + "Fastest tissue (avg): whole_blood (9.94 seconds)\n", + "[2025-09-25 13:05:37,805 - summary] INFO: Slowest tissue (avg): whole_blood 
(9.94 seconds)\n", + "[2025-09-25 13:05:37,806 - summary] INFO: Speed ratio: 1.0x\n", + "[2025-09-25 13:05:37,806 - summary] INFO: \n", + "Summary log saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n", + "[2025-09-25 13:05:37,806 - summary] INFO: Summary tables saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CCC: 0.007812, p-value: 3.38e-02\n", + "\n", + "Completed processing whole_blood:\n", + " Total metadata columns: 66\n", + " Successful analyses: 44\n", + " Skipped/Failed: 22\n", + "Results for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood_correlation_results.pkl\n", + "Log file for CYTIP in whole_blood saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n", + "Runtime for CYTIP in whole_blood: 9.93 seconds (0.17 minutes)\n", + "\n", + "================================================================================\n", + "COMBINED RESULTS SUMMARY\n", + "================================================================================\n", + "Gene Symbol: CYTIP\n", + "Gene ID: ENSG00000115165.9\n", + "Permutations: 100,000\n", + "Tissues processed: 1\n", + "Combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.pkl\n", + "Combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_all_tissues_correlation_results.csv\n", + "\n", + "Total successful analyses across all tissues: 44\n", + "\n", + "================================================================================\n", + "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)\n", + "================================================================================\n", + "Tissue 
Metadata Column CCC Value P-value Significance \n", + "------------------------------------------------------------------------------------------\n", + "whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n", + "whole_blood SMTSISCH 0.215092 1.00e-05 *** \n", + "whole_blood DTHHRDY 0.184226 1.00e-05 *** \n", + "whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n", + "whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n", + "whole_blood SMCENTER 0.084684 1.00e-05 *** \n", + "whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n", + "whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n", + "whole_blood SMALTALG 0.038381 1.00e-05 *** \n", + "whole_blood SMSPLTRD 0.030117 1.00e-05 *** \n", + "whole_blood SMNTRART 0.024407 3.00e-05 *** \n", + "whole_blood SMNTERRT 0.023433 4.00e-05 *** \n", + "whole_blood SME1MPRT 0.021952 4.00e-05 *** \n", + "whole_blood SME2MPRT 0.021744 5.00e-05 *** \n", + "whole_blood SMMAPRT 0.021052 7.00e-05 *** \n", + "whole_blood SMMPUNRT 0.021052 7.00e-05 *** \n", + "whole_blood AGE 0.018997 1.00e-05 *** \n", + "whole_blood SMCHMPRS 0.015946 4.50e-04 *** \n", + "whole_blood SMSFLGTH 0.015322 7.10e-04 *** \n", + "whole_blood SMRRNART 0.013729 1.10e-03 ** \n", + "\n", + "================================================================================\n", + "SUMMARY BY TISSUE\n", + "================================================================================\n", + "Tissue N Samples Successful Mean |CCC| Max |CCC| \n", + "----------------------------------------------------------------------\n", + "whole_blood 755 44 0.032699 0.261762 \n", + "\n", + "================================================================================\n", + "RUNTIME SUMMARY\n", + "================================================================================\n", + "Total runtime: 9.93 seconds (0.17 minutes)\n", + "Average runtime per tissue: 9.93 seconds\n", + "\n", + "Runtime by tissue:\n", + "Tissue Runtime (sec) Runtime (min) Status \n", + 
"----------------------------------------------------------------------\n", + "whole_blood 9.93 0.17 Success \n", + "\n", + "Fastest: whole_blood (9.93 seconds)\n", + "Slowest: whole_blood (9.93 seconds)\n", + "Speed ratio: 1.0x\n", + "Runtime for CYTIP: 9.93 seconds (0.17 minutes)\n", + "\n", + "====================================================================================================\n", + "OVERALL RESULTS SUMMARY\n", + "====================================================================================================\n", + "Gene symbols processed: RASSF2, CYTIP\n", + "Total genes: 2\n", + "Permutations: 100,000\n", + "Tissues per gene: 1\n", + "All genes combined results saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.pkl\n", + "All genes combined results (CSV) saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_all_genes_all_tissues_correlation_results.csv\n", + "\n", + "Log files created:\n", + " RASSF2 - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/RASSF2_whole_blood.log\n", + " CYTIP - whole_blood: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/CYTIP_whole_blood.log\n", + "\n", + "Total successful analyses across all genes and tissues: 88\n", + "\n", + "====================================================================================================\n", + "TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)\n", + "====================================================================================================\n", + "Gene Tissue Metadata Column CCC Value P-value Significance \n", + "--------------------------------------------------------------------------------------------------------------\n", + "RASSF2 whole_blood SMTSISCH 0.528125 1.00e-05 *** \n", + "RASSF2 whole_blood DTHHRDY 0.464582 1.00e-05 *** \n", + "CYTIP whole_blood SMNTRNRT 0.261762 1.00e-05 *** \n", + "RASSF2 
whole_blood SMNTERRT 0.250997 1.00e-05 *** \n", + "RASSF2 whole_blood SMNTRART 0.243071 1.00e-05 *** \n", + "CYTIP whole_blood SMTSISCH 0.215092 1.00e-05 *** \n", + "RASSF2 whole_blood SMNTRNRT 0.202936 1.00e-05 *** \n", + "CYTIP whole_blood DTHHRDY 0.184226 1.00e-05 *** \n", + "RASSF2 whole_blood SME1MPRT 0.181940 1.00e-05 *** \n", + "RASSF2 whole_blood SMALTALG 0.177009 1.00e-05 *** \n", + "RASSF2 whole_blood SME2MPRT 0.172974 1.00e-05 *** \n", + "RASSF2 whole_blood SMMPUNRT 0.168576 1.00e-05 *** \n", + "RASSF2 whole_blood SMMAPRT 0.168576 1.00e-05 *** \n", + "RASSF2 whole_blood SMCHMPRS 0.143365 1.00e-05 *** \n", + "CYTIP whole_blood SMEXNCRT 0.126241 1.00e-05 *** \n", + "RASSF2 whole_blood SMCENTER 0.108148 1.00e-05 *** \n", + "CYTIP whole_blood SMEXPEFF 0.086945 1.00e-05 *** \n", + "CYTIP whole_blood SMCENTER 0.084684 1.00e-05 *** \n", + "RASSF2 whole_blood SMEXPEFF 0.059931 1.00e-05 *** \n", + "RASSF2 whole_blood SMSPLTRD 0.057786 1.00e-05 *** \n", + "CYTIP whole_blood SMTRSCPT 0.051533 1.00e-05 *** \n", + "CYTIP whole_blood SMGNSDTC 0.050841 1.00e-05 *** \n", + "RASSF2 whole_blood SMRIN 0.048847 1.00e-05 *** \n", + "RASSF2 whole_blood SMRRNART 0.048437 1.00e-05 *** \n", + "RASSF2 whole_blood SMSFLGTH 0.047258 1.00e-05 *** \n", + "RASSF2 whole_blood SMGNSDTC 0.043013 1.00e-05 *** \n", + "RASSF2 whole_blood SMTRSCPT 0.042714 1.00e-05 *** \n", + "RASSF2 whole_blood SMEXNCRT 0.040140 1.00e-05 *** \n", + "RASSF2 whole_blood AGE 0.039824 1.00e-05 *** \n", + "CYTIP whole_blood SMALTALG 0.038381 1.00e-05 *** \n", + "\n", + "====================================================================================================\n", + "SUMMARY BY GENE\n", + "====================================================================================================\n", + "\n", + "Gene: RASSF2 (ID: ENSG00000101265.15)\n", + " Tissues processed: 1\n", + " Successful analyses: 44\n", + " Mean |CCC|: 0.079987\n", + " Max |CCC|: 0.528125\n", + " Top correlation: SMTSISCH in 
whole_blood (CCC: 0.528125, p: 1.00e-05)\n", + " Runtime: 9.96 seconds (0.17 minutes)\n", + "\n", + "Gene: CYTIP (ID: ENSG00000115165.9)\n", + " Tissues processed: 1\n", + " Successful analyses: 44\n", + " Mean |CCC|: 0.032699\n", + " Max |CCC|: 0.261762\n", + " Top correlation: SMNTRNRT in whole_blood (CCC: 0.261762, p: 1.00e-05)\n", + " Runtime: 9.93 seconds (0.17 minutes)\n", + "\n", + "====================================================================================================\n", + "SUMMARY BY TISSUE (across all genes)\n", + "====================================================================================================\n", + "Tissue N Genes Successful Mean |CCC| Max |CCC| \n", + "---------------------------------------------------------------------------\n", + "whole_blood 2 88 0.056343 0.528125 \n", + "\n", + "====================================================================================================\n", + "RUNTIME SUMMARY\n", + "====================================================================================================\n", + "Total runtime: 19.92 seconds (0.33 minutes)\n", + "Average runtime per gene: 9.96 seconds\n", + "Total gene-tissue combinations: 2\n", + "\n", + "Runtime by gene:\n", + "Gene Runtime (sec) Runtime (min) Tissues Successful \n", + "---------------------------------------------------------------------------\n", + "RASSF2 9.96 0.17 1 1 \n", + "CYTIP 9.93 0.17 1 1 \n", + "\n", + "Average runtime by tissue (across all genes):\n", + "Tissue Avg Runtime (sec) Avg Runtime (min) N Runs Min Max \n", + "-----------------------------------------------------------------------------------------------\n", + "whole_blood 9.94 0.17 2 9.93 9.96 \n", + "\n", + "Fastest tissue (avg): whole_blood (9.94 seconds)\n", + "Slowest tissue (avg): whole_blood (9.94 seconds)\n", + "Speed ratio: 1.0x\n", + "\n", + "Summary log saved to: 
/mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_execution.log\n", + "Summary tables saved to: /mnt/data/proj_data/ccc-gpu/data/tutorial/metadata_correlations/_RASSF2_CYTIP_summary_tables.log\n" + ] + } + ], + "source": [ + "%run ./nbs/common/metadata_corr_cli.py RASSF2 CYTIP --include whole_blood --expr-data-dir {TISSUE_DATA_DIR} --data-dir {ANALYSIS_DIR} --output-dir {METADATA_CORRELATIONS_RESULT_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['RASSF2_all_tissues_correlation_results.csv',\n", + " 'CYTIP_whole_blood.log',\n", + " 'RASSF2_whole_blood_correlation_results.pkl',\n", + " 'RASSF2_all_tissues_correlation_results.pkl',\n", + " 'RASSF2_whole_blood.log',\n", + " 'CYTIP_all_tissues_correlation_results.pkl',\n", + " 'CYTIP_whole_blood_correlation_results.pkl',\n", + " '_all_genes_all_tissues_correlation_results.csv',\n", + " '_RASSF2_CYTIP_summary_tables.log',\n", + " '_RASSF2_CYTIP_summary_execution.log',\n", + " 'CYTIP_all_tissues_correlation_results.csv',\n", + " '_all_genes_all_tissues_correlation_results.pkl']" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can find the results in the `METADATA_CORRELATIONS_RESULT_DIR` directory\n", + "os.listdir(METADATA_CORRELATIONS_RESULT_DIR)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/common/README.md b/nbs/common/README.md new file mode 100644 index 00000000..ea9f6262 --- /dev/null +++ b/nbs/common/README.md @@ -0,0 +1,527 @@ +# Common Analysis Tools + +This directory contains command-line tools for gene expression analysis using the CCC-GPU package. + +## Available Tools + +1. 
**[Single Gene Pair Correlation Analysis](#single-gene-pair-correlation-analysis)** (`compute_single_gene_pair_correlations_cli.py`) +2. **[Gene Expression-Metadata Correlation Analysis](#gene-expression-metadata-correlation-analysis)** (`metadata_corr_cli.py`) + +--- + +# Single Gene Pair Correlation Analysis + +A command-line tool for exploring gene expression data and computing correlations between specific gene pairs using CCC (Clustered Correlation Coefficient), Spearman, and Pearson correlation methods. + +## Features + +- **Data Exploration**: Browse available tissues and genes with their symbols +- **Gene Pair Correlation**: Compute three correlation coefficients (CCC, Pearson, Spearman) for any gene pair +- **Flexible Gene Input**: Accept both gene symbols (e.g., TP53) and Ensembl IDs (e.g., ENSG00000141510.16) +- **Tissue-Specific Analysis**: Analyze correlations within specific tissue contexts +- **Robust Gene Resolution**: Handle version numbers and case-insensitive matching +- **Comprehensive Error Handling**: Clear error messages and debugging support + +## Installation Requirements + +```bash +# Required packages +pip install pandas numpy +# CCC-GPU package (install from source as per project instructions) +``` + +## Quick Start + +### 1. Explore Available Data + +```bash +# List all available tissues +python compute_single_gene_pair_correlations_cli.py --list-tissues + +# Show genes available in whole blood tissue +python compute_single_gene_pair_correlations_cli.py --show-genes whole_blood + +# Show more genes (default is 20) +python compute_single_gene_pair_correlations_cli.py --show-genes liver --n-genes 50 +``` + +### 2. 
Compute Gene Pair Correlations + +```bash +# Basic correlation analysis between TP53 and BRCA1 in whole blood +python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood + +# Use Ensembl IDs instead of symbols +python compute_single_gene_pair_correlations_cli.py ENSG00000141510.16 ENSG00000012048.20 --tissue liver + +# Mixed input (symbol and Ensembl ID) +python compute_single_gene_pair_correlations_cli.py TP53 ENSG00000012048.20 --tissue brain_cortex +``` + +### 3. Save Results and Logs + +```bash +# Save results and logs to a specific directory +python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood \ + --output-dir ./results + +# Combine with debug logging for detailed output +python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \ + --output-dir ./detailed_analysis --debug +``` + +### 4. Custom Data Paths + +```bash +# Use custom data directory and gene mapping file +python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood \ + --data-dir /custom/path/to/tissue/data \ + --gene-mapping /custom/path/to/gene_mappings.pkl \ + --output-dir ./custom_results +``` + +## Command Line Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `genes` | str+ | Required | Two gene symbols or Ensembl IDs for correlation analysis | +| `--tissue` | str | Required | Tissue name for correlation analysis | +| `--data-dir` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue` | Directory containing tissue expression data | +| `--gene-mapping` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl` | Gene mapping file path | +| `--output-dir` | str | None | Directory to save output files and logs (optional) | +| `--list-tissues` | flag | False | List all available tissues and exit | +| `--show-genes` | str | None | Show genes for specified tissue and exit | +| `--n-genes` | int | 20 | Number of 
genes to display | +| `--debug` | flag | False | Enable debug logging | + +## Output Format + +### Tissue and Gene Discovery +``` +=== Available Tissues (49) === + 1. adipose_subcutaneous + 2. adipose_visceral_omentum + 3. adrenal_gland + ... + +=== Tissue: whole_blood === +Total genes: 56,200 +Total samples: 755 + +First 20 genes: +------------------------------------------------------------ +# Gene Symbol Ensembl ID +------------------------------------------------------------ +1 DDX11L1 ENSG00000223972.5 +2 WASH7P ENSG00000227232.5 +3 MIR6859-1 ENSG00000278267.1 +... +``` + +### Correlation Results +``` +============================================================ +GENE PAIR CORRELATION RESULTS +============================================================ +Gene 1: TP53 (ENSG00000141510.16) +Gene 2: BRCA1 (ENSG00000012048.20) +Tissue: whole_blood +Samples: 755 +------------------------------------------------------------ + CCC: 0.123456 + PEARSON: 0.234567 + SPEARMAN: 0.345678 +============================================================ +Results saved to: + JSON: TP53_BRCA1_whole_blood_20240925_143022_correlation_results.json + Pickle: TP53_BRCA1_whole_blood_20240925_143022_correlation_results.pkl +Log file: gene_pair_correlation_analysis_20240925_143022.log +``` + +### Output Files (when --output-dir is used) + +1. **JSON Results File**: `{gene1}_{gene2}_{tissue}_{timestamp}_correlation_results.json` + - Human-readable format with all correlation results + - Can be easily imported into other tools or scripts + +2. **Pickle Results File**: `{gene1}_{gene2}_{tissue}_{timestamp}_correlation_results.pkl` + - Python-specific format preserving exact data types + - Optimal for downstream analysis in Python + +3. 
**Log File**: `gene_pair_correlation_analysis_{timestamp}.log` + - Detailed processing information and debug messages + - Useful for troubleshooting and audit trails + +Example JSON output: +```json +{ + "gene1_symbol": "TP53", + "gene1_ensembl_id": "ENSG00000141510.16", + "gene2_symbol": "BRCA1", + "gene2_ensembl_id": "ENSG00000012048.20", + "tissue": "whole_blood", + "n_samples": 755, + "ccc": 0.123456, + "pearson": 0.234567, + "spearman": 0.345678 +} +``` + +## Input Data Format + +### Tissue Expression Files +- **Format**: Pickle (.pkl) files +- **Naming**: `gtex_v8_data_{tissue_name}.pkl` +- **Structure**: DataFrame with Ensembl gene IDs as index, sample IDs as columns +- **Content**: Log2-transformed gene expression values + +### Gene Mapping File +- **Format**: Pickle (.pkl) file +- **Structure**: DataFrame with columns `gene_ens_id` and `gene_symbol` +- **Content**: Mapping between Ensembl gene IDs and HUGO gene symbols + +## Statistical Methods + +### Correlation Coefficients + +1. **CCC (Clustered Correlation Coefficient)** + - GPU-accelerated implementation + - Robust to outliers and non-linear relationships + - Particularly suited for detecting complex correlation patterns + +2. **Pearson Correlation** + - Standard linear correlation coefficient + - Measures linear relationship strength + +3. **Spearman Correlation** + - Rank-based correlation coefficient + - Robust to outliers and monotonic relationships + +## Example Workflows + +### 1. Cancer Gene Analysis +```bash +# Explore brain tissues for TP53-related genes +python compute_single_gene_pair_correlations_cli.py --list-tissues | grep brain + +# Analyze TP53 interactions in different brain regions with output saving +python compute_single_gene_pair_correlations_cli.py TP53 MDM2 --tissue brain_cortex \ + --output-dir ./cancer_gene_analysis --debug +python compute_single_gene_pair_correlations_cli.py TP53 CDKN1A --tissue brain_hippocampus \ + --output-dir ./cancer_gene_analysis --debug +``` + +### 2. 
Housekeeping Gene Analysis +```bash +# Compare expression correlation of housekeeping genes +python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue whole_blood +python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue liver +python compute_single_gene_pair_correlations_cli.py GAPDH ACTB --tissue muscle_skeletal +``` + +### 3. Tissue-Specific Gene Discovery +```bash +# Find genes in specific tissues and analyze their relationships +python compute_single_gene_pair_correlations_cli.py --show-genes heart_left_ventricle --n-genes 100 | grep MYH +python compute_single_gene_pair_correlations_cli.py MYH6 MYH7 --tissue heart_left_ventricle +``` + +## Error Handling + +The tool provides comprehensive error handling: + +- **Gene not found**: Suggestions to check spelling or use `--show-genes` +- **Tissue not found**: List of available tissues +- **Data issues**: Clear messages about insufficient samples or missing data +- **Path issues**: Validation of data directory and gene mapping file + +## Performance Considerations + +- **Memory usage**: ~100-500MB depending on tissue size +- **Computation time**: 1-5 seconds per gene pair +- **CCC computation**: GPU-accelerated when available + +--- + +# Gene Expression-Metadata Correlation Analysis + +## Overview + +This tool computes correlations between specific gene expression levels and all available metadata columns across multiple GTEx tissues. It uses the **Clustered Correlation Coefficient (CCC)** method, which is particularly suited for detecting non-linear relationships and complex correlation patterns. 
+ +### Key Features + +- **Multi-gene Analysis**: Process multiple genes simultaneously +- **Cross-tissue Analysis**: Analyze correlations across all available GTEx tissues +- **Comprehensive Metadata Coverage**: Correlate against all metadata columns automatically +- **Statistical Significance**: Permutation-based p-value calculation with customizable iterations +- **Flexible Tissue Filtering**: Include/exclude tissues using pattern matching +- **Parallel Processing**: Multi-threaded computation support +- **Detailed Logging**: Individual logs per gene-tissue combination plus comprehensive summaries +- **Multiple Output Formats**: Results in both pickle (.pkl) and CSV formats +- **Runtime Tracking**: Detailed performance monitoring and optimization insights + +## Requirements + +### Dependencies + +```python +pandas +numpy +ccc # Clustered Correlation Coefficient library +``` + +### Required Data Files + +The tool expects specific data files in predetermined locations: + +1. **Expression Data**: GTEx v8 expression files in the format `gtex_v8_data_{tissue_name}-var_pc_log2.pkl` +2. **Metadata**: GTEx v8 sample metadata (`gtex_v8-sample_metadata.pkl`) +3. 
**Gene Mappings**: Gene ID to symbol mappings (`gtex_gene_id_symbol_mappings.pkl`) + +## Installation + +```bash +# Clone or download the script +# Ensure all required Python packages are installed +pip install pandas numpy ccc +``` + +## Usage + +### Basic Usage + +```bash +# Analyze single gene across all tissues +python metadata_corr_cli.py RASSF2 + +# Analyze multiple genes +python metadata_corr_cli.py RASSF2 TP53 BRCA1 + +# Specify custom output directory +python metadata_corr_cli.py RASSF2 --output-dir ./results +``` + +### Advanced Usage + +```bash +# Include only specific tissues (pattern matching) +python metadata_corr_cli.py RASSF2 --include brain liver + +# Exclude specific tissues +python metadata_corr_cli.py RASSF2 --exclude cells brain + +# Custom permutation settings and parallel processing +python metadata_corr_cli.py RASSF2 --permutations 500000 --n-jobs 16 + +# Combined filtering and custom settings +python metadata_corr_cli.py TP53 BRCA1 \ + --include muscle heart \ + --exclude cells \ + --permutations 1000000 \ + --n-jobs 32 \ + --output-dir ./tp53_brca1_analysis +``` + +### Discovery Commands + +```bash +# List all available tissues +python metadata_corr_cli.py GENE --list-tissues + +# List all available metadata columns +python metadata_corr_cli.py GENE --list-metadata-columns +``` + +## Command Line Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `gene_symbols` | str+ | Required | Gene symbol(s) to analyze (e.g., RASSF2 TP53) | +| `--expr-data-dir` | str | `/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue` | Directory containing expression data files | +| `--include` | str* | None | Include only tissues matching these patterns | +| `--exclude` | str* | None | Exclude tissues matching these patterns | +| `--permutations` | int | 100,000 | Number of permutations for p-value calculation | +| `--n-jobs` | int | 24 | Number of parallel jobs for computation | +| `--output-dir` | str | `.` | 
Directory to save output files | +| `--list-metadata-columns` | flag | False | List available metadata columns and exit | +| `--list-tissues` | flag | False | List available tissue files and exit | + +## Input File Formats + +### Expression Data Files +- **Format**: Pickle (.pkl) files +- **Structure**: DataFrame with genes as rows, samples as columns +- **Naming**: `gtex_v8_data_{tissue_name}-var_pc_log2.pkl` +- **Content**: Log2-transformed, variance-filtered gene expression data + +### Metadata File +- **Format**: Pickle (.pkl) file +- **Structure**: DataFrame with samples as rows, metadata columns as columns +- **Content**: All GTEx v8 sample metadata including demographics, sampling info, etc. + +### Gene Mapping File +- **Format**: Pickle (.pkl) file +- **Structure**: DataFrame with columns `gene_ens_id` and `gene_symbol` +- **Content**: Mapping between Ensembl gene IDs and gene symbols + +## Output Files + +### Per Gene-Tissue Results +- **Individual Results**: `{gene}_{tissue}_correlation_results.pkl` +- **Individual Logs**: `{gene}_{tissue}.log` +- **Content**: Correlation results for each metadata column + +### Per Gene Summaries +- **Combined Results**: `{gene}_all_tissues_correlation_results.pkl` +- **Combined CSV**: `{gene}_all_tissues_correlation_results.csv` +- **Content**: All tissues combined for single gene + +### Overall Results +- **Mega Results**: `_all_genes_all_tissues_correlation_results.pkl` +- **Mega CSV**: `_all_genes_all_tissues_correlation_results.csv` +- **Summary Log**: `_{genes}_summary_execution.log` +- **Summary Tables**: `_{genes}_summary_tables.log` + +### Result DataFrame Structure + +```python +# Each results DataFrame contains: +{ + 'ccc_value': float, # CCC correlation coefficient + 'p_value': float, # Permutation-based p-value + 'status': str, # 'success', 'all_nan', 'insufficient_variation', or 'error' + 'tissue': str, # Tissue name + 'gene_symbol': str, # Gene symbol + 'gene_id': str, # Ensembl gene ID + 'n_samples': int # 
Number of samples used +} +``` + +## Analysis Workflow + +### 1. **Gene Discovery** +- Converts gene symbols to Ensembl IDs using gene mapping +- Validates gene existence across tissues + +### 2. **Tissue Processing** +- Loads expression data for each tissue +- Filters to common samples between expression and metadata +- Handles missing data and insufficient variation gracefully + +### 3. **Correlation Analysis** +- Computes CCC between gene expression and each metadata column +- Calculates statistical significance via permutation testing +- Handles various data types and edge cases + +### 4. **Results Compilation** +- Aggregates results across tissues and genes +- Generates comprehensive summary statistics +- Creates ranked lists of strongest correlations + +### 5. **Performance Monitoring** +- Tracks runtime for each gene-tissue combination +- Identifies computational bottlenecks +- Provides optimization recommendations + +## Statistical Methods + +### Clustered Correlation Coefficient (CCC) +- **Purpose**: Detects both linear and non-linear relationships +- **Advantages**: Robust to outliers, captures complex patterns +- **Implementation**: Uses permutation-based significance testing + +### Significance Levels +- `***`: p < 0.001 (highly significant) +- `**`: p < 0.01 (significant) +- `*`: p < 0.05 (marginally significant) +- `ns`: p ≥ 0.05 (not significant) + +## Performance Considerations + +### Computational Requirements +- **Memory**: ~2-8 GB depending on tissue size and number of genes +- **CPU**: Benefits from multi-core systems (default: 24 cores) +- **Time**: ~1-5 minutes per gene-tissue combination + +### Optimization Tips +- **Parallel Processing**: Increase `--n-jobs` for faster computation +- **Permutations**: Reduce `--permutations` for faster (less precise) p-values +- **Tissue Filtering**: Use `--include`/`--exclude` to focus on relevant tissues +- **Batch Processing**: Process multiple genes together for efficiency + +## Example Workflows + +### 
1. Cancer Gene Analysis +```bash +# Analyze tumor suppressor genes across cancer-relevant tissues +python metadata_corr_cli.py TP53 BRCA1 BRCA2 PTEN \ + --include breast ovary lung liver \ + --permutations 1000000 \ + --n-jobs 32 \ + --output-dir ./cancer_genes_analysis +``` + +### 2. Brain-Specific Gene Study +```bash +# Focus on brain tissues for neurological genes +python metadata_corr_cli.py APOE MAPT SNCA \ + --include brain \ + --exclude cells \ + --output-dir ./brain_genes +``` + +### 3. Exploratory Analysis +```bash +# Quick exploration with reduced permutations +python metadata_corr_cli.py GENE_OF_INTEREST \ + --permutations 10000 \ + --n-jobs 8 \ + --output-dir ./exploratory +``` + +## Troubleshooting + +### Common Issues + +1. **Gene Not Found**: Check gene symbol spelling and availability in gene mapping +2. **No Expression Data**: Verify gene is expressed in selected tissues +3. **Memory Errors**: Reduce number of parallel jobs or process fewer genes at once +4. **File Not Found**: Ensure all required data files exist in expected locations + +### Error Codes +- **Gene symbol not found**: Gene not in mapping file +- **No common samples**: Expression and metadata samples don't overlap +- **All NaN values**: Metadata column contains only missing values +- **Insufficient variation**: Metadata column has ≤1 unique values + +## Output Interpretation + +### Top Results Tables +- Results ranked by absolute CCC value +- Include significance levels and tissue information +- Show strongest correlations across all analyses + +### Summary Statistics +- **Mean |CCC|**: Average absolute correlation strength +- **Max |CCC|**: Strongest correlation found +- **Success Rate**: Proportion of successful analyses +- **Runtime Metrics**: Performance characteristics + +## Citation + +If you use this tool in your research, please cite the CCC method and relevant GTEx publications. 
+ +## Version Information + +- **Script**: metadata_corr_cli.py +- **Converted from**: 00-data-exploration.ipynb +- **GTEx Version**: v8 +- **CCC Implementation**: Uses ccc.coef module + +## Support + +For issues related to: +- **CCC Method**: Refer to CCC library documentation +- **GTEx Data**: Consult GTEx consortium resources +- **Script Usage**: Check this README or examine log files for detailed error messages \ No newline at end of file diff --git a/nbs/common/compute_single_gene_pair_correlations_cli.py b/nbs/common/compute_single_gene_pair_correlations_cli.py new file mode 100755 index 00000000..2cd2c690 --- /dev/null +++ b/nbs/common/compute_single_gene_pair_correlations_cli.py @@ -0,0 +1,624 @@ +#!/usr/bin/env python3 +""" +Single Gene Pair Correlation Analysis Tool + +A command-line tool for exploring gene expression data and computing correlations +between specific gene pairs using CCC (Clustered Correlation Coefficient), +Spearman, and Pearson correlation methods. + +This script provides two main functionalities: +1. Data exploration: Show available genes and their symbols for a tissue +2. Correlation analysis: Compute correlations for a specific gene pair in a tissue + +Author: Generated for CCC-GPU project +Version: 1.0 +""" + +import argparse +import logging +import sys +from pathlib import Path +from typing import Dict, Optional, Tuple, Union + +import pandas as pd +import numpy as np + +# Import correlation methods +try: + from ccc.corr import ccc_gpu, pearson, spearman +except ImportError: + print("Error: CCC library not found. Please install the ccc package.") + sys.exit(1) + + +def setup_logging(debug: bool = False, output_dir: Optional[Path] = None) -> Optional[Path]: + """Configure logging for the script. 
+ + Args: + debug: Enable debug level logging if True + output_dir: Directory to write log files to (optional) + + Returns: + Path to log file if output_dir provided, None otherwise + """ + level = logging.DEBUG if debug else logging.INFO + + # Clear any existing handlers + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + + # Setup formatters + console_formatter = logging.Formatter( + '%(asctime)s - %(levelname)s - %(message)s', + datefmt='%H:%M:%S' + ) + file_formatter = logging.Formatter( + '%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # Setup handlers + handlers = [] + + # Console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(console_formatter) + handlers.append(console_handler) + + # File handler (if output directory provided) + log_file = None + if output_dir: + output_dir.mkdir(parents=True, exist_ok=True) + from datetime import datetime + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_file = output_dir / f"gene_pair_correlation_analysis_{timestamp}.log" + + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(file_formatter) + handlers.append(file_handler) + + # Configure root logger + logging.basicConfig( + level=level, + handlers=handlers, + force=True + ) + + if log_file: + logging.info(f"Log file created: {log_file}") + + return log_file + + +class GeneExpressionAnalyzer: + """Main class for gene expression analysis and correlation computation.""" + + def __init__(self, data_dir: str, gene_mapping_file: str): + """Initialize the analyzer with data directory and gene mapping file. 
+ + Args: + data_dir: Directory containing tissue expression data files + gene_mapping_file: Path to gene ID to symbol mapping file + """ + self.data_dir = Path(data_dir) + self.gene_mapping_file = Path(gene_mapping_file) + self._gene_mapping = None + self._validate_inputs() + + def _validate_inputs(self) -> None: + """Validate that input paths exist and are accessible.""" + if not self.data_dir.exists(): + raise FileNotFoundError(f"Data directory not found: {self.data_dir}") + + if not self.gene_mapping_file.exists(): + raise FileNotFoundError(f"Gene mapping file not found: {self.gene_mapping_file}") + + @property + def gene_mapping(self) -> pd.DataFrame: + """Load and cache gene mapping data.""" + if self._gene_mapping is None: + logging.info(f"Loading gene mapping from: {self.gene_mapping_file}") + self._gene_mapping = pd.read_pickle(self.gene_mapping_file) + logging.info(f"Loaded {len(self._gene_mapping)} gene mappings") + return self._gene_mapping + + def list_available_tissues(self) -> list: + """Get list of available tissue files. + + Returns: + List of tissue names (without file extensions) + """ + tissue_files = list(self.data_dir.glob("gtex_v8_data_*.pkl")) + tissues = [f.stem.replace("gtex_v8_data_", "") for f in tissue_files] + return sorted(tissues) + + def _find_tissue_file(self, tissue: str) -> Path: + """Find the tissue file for a given tissue name. + + Args: + tissue: Tissue name + + Returns: + Path to tissue file + + Raises: + FileNotFoundError: If tissue file is not found + """ + # Try exact match first + exact_file = self.data_dir / f"gtex_v8_data_{tissue}.pkl" + if exact_file.exists(): + return exact_file + + # Try partial matching + tissue_files = list(self.data_dir.glob(f"gtex_v8_data_*{tissue}*.pkl")) + if len(tissue_files) == 1: + return tissue_files[0] + elif len(tissue_files) > 1: + matches = [f.stem for f in tissue_files] + raise ValueError( + f"Multiple tissue files match '{tissue}': {matches}. " + "Please be more specific." 
+ ) + else: + available = self.list_available_tissues() + raise FileNotFoundError( + f"No tissue file found for '{tissue}'. " + f"Available tissues: {available[:10]}..." if len(available) > 10 + else f"Available tissues: {available}" + ) + + def show_tissue_genes(self, tissue: str, n_genes: int = 20) -> None: + """Display available genes and their symbols for a tissue. + + Args: + tissue: Tissue name + n_genes: Number of genes to display (default: 20) + """ + # Load tissue data + tissue_file = self._find_tissue_file(tissue) + logging.info(f"Loading tissue data from: {tissue_file}") + + tissue_data = pd.read_pickle(tissue_file) + logging.info(f"Tissue data shape: {tissue_data.shape}") + + # Get gene IDs and map to symbols + gene_ids = tissue_data.index.tolist() + + # Create mapping lookup for faster access + gene_mapping = self.gene_mapping.set_index('gene_ens_id') + + print(f"\n=== Tissue: {tissue} ===") + print(f"Total genes: {len(gene_ids):,}") + print(f"Total samples: {tissue_data.shape[1]:,}") + print(f"\nFirst {n_genes} genes:") + print("-" * 60) + print(f"{'#':<4} {'Gene Symbol':<15} {'Ensembl ID':<20}") + print("-" * 60) + + for i, gene_id in enumerate(gene_ids[:n_genes], 1): + # Remove version from gene ID for mapping lookup + clean_gene_id = gene_id.split('.')[0] if '.' in gene_id else gene_id + + # Look up symbol + symbol = "N/A" + if gene_id in gene_mapping.index: + symbol = gene_mapping.loc[gene_id, 'gene_symbol'] + elif clean_gene_id in gene_mapping.index: + symbol = gene_mapping.loc[clean_gene_id, 'gene_symbol'] + else: + # Search in original mapping + matches = self.gene_mapping[ + self.gene_mapping['gene_ens_id'].str.startswith(clean_gene_id) + ] + if len(matches) > 0: + symbol = matches.iloc[0]['gene_symbol'] + + print(f"{i:<4} {symbol:<15} {gene_id:<20}") + + if len(gene_ids) > n_genes: + print(f"... 
and {len(gene_ids) - n_genes:,} more genes") + print() + + def _resolve_gene(self, gene_input: str) -> Tuple[str, str]: + """Resolve gene input to Ensembl ID and symbol. + + Args: + gene_input: Gene symbol or Ensembl ID + + Returns: + Tuple of (ensembl_id, gene_symbol) + + Raises: + ValueError: If gene cannot be resolved + """ + # Check if it's already an Ensembl ID + if gene_input.startswith('ENSG'): + # Look up the symbol + matches = self.gene_mapping[self.gene_mapping['gene_ens_id'] == gene_input] + if len(matches) == 0: + # Try without version + base_id = gene_input.split('.')[0] + matches = self.gene_mapping[ + self.gene_mapping['gene_ens_id'].str.startswith(base_id) + ] + + if len(matches) > 0: + return matches.iloc[0]['gene_ens_id'], matches.iloc[0]['gene_symbol'] + else: + raise ValueError(f"Ensembl ID '{gene_input}' not found in mapping") + else: + # Assume it's a gene symbol + matches = self.gene_mapping[self.gene_mapping['gene_symbol'] == gene_input] + if len(matches) > 0: + return matches.iloc[0]['gene_ens_id'], matches.iloc[0]['gene_symbol'] + else: + # Try case-insensitive search + matches = self.gene_mapping[ + self.gene_mapping['gene_symbol'].str.upper() == gene_input.upper() + ] + if len(matches) > 0: + return matches.iloc[0]['gene_ens_id'], matches.iloc[0]['gene_symbol'] + else: + raise ValueError( + f"Gene symbol '{gene_input}' not found. " + "Use --show-genes to see available genes." + ) + + def compute_gene_pair_correlations( + self, + gene1: str, + gene2: str, + tissue: str + ) -> Dict[str, Union[float, str]]: + """Compute correlations between two genes in a specific tissue. 
+ + Args: + gene1: First gene (symbol or Ensembl ID) + gene2: Second gene (symbol or Ensembl ID) + tissue: Tissue name + + Returns: + Dictionary with correlation results + """ + # Resolve genes + gene1_id, gene1_symbol = self._resolve_gene(gene1) + gene2_id, gene2_symbol = self._resolve_gene(gene2) + + # Load tissue data + tissue_file = self._find_tissue_file(tissue) + logging.info(f"Loading tissue data from: {tissue_file}") + + tissue_data = pd.read_pickle(tissue_file) + logging.info(f"Tissue data shape: {tissue_data.shape}") + + # Extract gene expression data + gene1_expr = self._extract_gene_expression(tissue_data, gene1_id, gene1_symbol) + gene2_expr = self._extract_gene_expression(tissue_data, gene2_id, gene2_symbol) + + # Ensure we have the same samples + common_samples = gene1_expr.index.intersection(gene2_expr.index) + if len(common_samples) == 0: + raise ValueError("No common samples between the two genes") + + gene1_values = gene1_expr.loc[common_samples].values + gene2_values = gene2_expr.loc[common_samples].values + + # Remove any NaN values + mask = ~(np.isnan(gene1_values) | np.isnan(gene2_values)) + gene1_clean = gene1_values[mask] + gene2_clean = gene2_values[mask] + + if len(gene1_clean) < 3: + raise ValueError("Insufficient valid data points for correlation analysis") + + logging.info(f"Computing correlations for {len(gene1_clean)} samples") + + # Compute correlations + results = { + 'gene1_symbol': gene1_symbol, + 'gene1_ensembl_id': gene1_id, + 'gene2_symbol': gene2_symbol, + 'gene2_ensembl_id': gene2_id, + 'tissue': tissue, + 'n_samples': len(gene1_clean), + } + + # Create DataFrame for correlation computation (genes as rows, samples as columns) + # This matches the format expected by ccc.corr functions + data_df = pd.DataFrame({ + f'sample_{i}': [gene1_clean[i], gene2_clean[i]] + for i in range(len(gene1_clean)) + }, index=[gene1_symbol, gene2_symbol]) + + try: + # Compute CCC + logging.info("Computing CCC correlation...") + ccc_result = 
ccc_gpu(data_df, n_jobs=1) # Use single job for pair + results['ccc'] = float(ccc_result.iloc[0, 1]) # Off-diagonal element + except Exception as e: + logging.warning(f"CCC computation failed: {e}") + results['ccc'] = None + + try: + # Compute Pearson correlation + logging.info("Computing Pearson correlation...") + pearson_result = pearson(data_df) + results['pearson'] = float(pearson_result.iloc[0, 1]) + except Exception as e: + logging.warning(f"Pearson computation failed: {e}") + results['pearson'] = None + + try: + # Compute Spearman correlation + logging.info("Computing Spearman correlation...") + spearman_result = spearman(data_df) + results['spearman'] = float(spearman_result.iloc[0, 1]) + except Exception as e: + logging.warning(f"Spearman computation failed: {e}") + results['spearman'] = None + + return results + + def _extract_gene_expression(self, tissue_data: pd.DataFrame, gene_id: str, gene_symbol: str) -> pd.Series: + """Extract expression data for a specific gene. + + Args: + tissue_data: Tissue expression DataFrame + gene_id: Ensembl gene ID + gene_symbol: Gene symbol + + Returns: + Series with gene expression values + + Raises: + ValueError: If gene is not found in tissue data + """ + # Try exact match first + if gene_id in tissue_data.index: + return tissue_data.loc[gene_id] + + # Try without version + base_id = gene_id.split('.')[0] + matches = [idx for idx in tissue_data.index if idx.startswith(base_id)] + + if len(matches) == 1: + return tissue_data.loc[matches[0]] + elif len(matches) > 1: + logging.warning(f"Multiple matches for {gene_symbol} ({gene_id}), using first match") + return tissue_data.loc[matches[0]] + else: + raise ValueError(f"Gene {gene_symbol} ({gene_id}) not found in tissue data") + + +def save_results(results: Dict[str, Union[float, str]], output_dir: Path) -> Tuple[Path, Path]: + """Save correlation results to files. 
+ + Args: + results: Dictionary containing correlation results + output_dir: Directory to save files + + Returns: + Tuple of (json_file_path, pickle_file_path) + """ + import json + import pickle + from datetime import datetime + + # Create filenames + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + gene1_symbol = results['gene1_symbol'] + gene2_symbol = results['gene2_symbol'] + tissue = results['tissue'] + + base_filename = f"{gene1_symbol}_{gene2_symbol}_{tissue}_{timestamp}" + json_file = output_dir / f"{base_filename}_correlation_results.json" + pickle_file = output_dir / f"{base_filename}_correlation_results.pkl" + + # Ensure output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + # Save as JSON (human readable) + json_data = {} + for key, value in results.items(): + if isinstance(value, (int, float, str)): + json_data[key] = value + else: + json_data[key] = str(value) + + with open(json_file, 'w') as f: + json.dump(json_data, f, indent=2) + + # Save as pickle (preserves data types) + with open(pickle_file, 'wb') as f: + pickle.dump(results, f) + + logging.info(f"Results saved to: {json_file}") + logging.info(f"Results saved to: {pickle_file}") + + return json_file, pickle_file + + +def main(): + """Main function to handle command line arguments and execute analysis.""" + parser = argparse.ArgumentParser( + description="Single Gene Pair Correlation Analysis Tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Show available tissues + python compute_single_gene_pair_correlations_cli.py --list-tissues + + # Show genes in whole blood tissue + python compute_single_gene_pair_correlations_cli.py --show-genes whole_blood + + # Compute correlations between TP53 and BRCA1 in whole blood + python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue whole_blood + + # Save results and logs to output directory + python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \\ + 
--output-dir ./results --debug + + # Use custom data directory and gene mapping + python compute_single_gene_pair_correlations_cli.py TP53 BRCA1 --tissue liver \\ + --data-dir /custom/path/data \\ + --gene-mapping /custom/path/mappings.pkl \\ + --output-dir ./custom_results + """ + ) + + # Positional arguments for gene pair analysis + parser.add_argument( + 'genes', + nargs='*', + help='Two gene symbols or Ensembl IDs for correlation analysis (e.g., TP53 BRCA1)' + ) + + # Main options + parser.add_argument( + '--tissue', + type=str, + help='Tissue name for analysis (required for correlation analysis)' + ) + + parser.add_argument( + '--data-dir', + type=str, + default='/mnt/data/proj_data/ccc-gpu/data/tutorial/data_by_tissue', + help='Directory containing tissue expression data files' + ) + + parser.add_argument( + '--gene-mapping', + type=str, + default='/mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl', + help='Path to gene ID to symbol mapping file' + ) + + # Discovery options + parser.add_argument( + '--list-tissues', + action='store_true', + help='List all available tissues and exit' + ) + + parser.add_argument( + '--show-genes', + type=str, + metavar='TISSUE', + help='Show available genes for specified tissue and exit' + ) + + parser.add_argument( + '--n-genes', + type=int, + default=20, + help='Number of genes to show (default: 20)' + ) + + # Output options + parser.add_argument( + '--output-dir', + type=str, + help='Directory to save output files and logs (optional)' + ) + + # Utility options + parser.add_argument( + '--debug', + action='store_true', + help='Enable debug logging' + ) + + args = parser.parse_args() + + # Setup output directory + output_dir = Path(args.output_dir) if args.output_dir else None + + # Setup logging + log_file = setup_logging(debug=args.debug, output_dir=output_dir) + + try: + # Initialize analyzer + analyzer = GeneExpressionAnalyzer(args.data_dir, args.gene_mapping) + + # Handle discovery commands + if 
args.list_tissues: + tissues = analyzer.list_available_tissues() + print(f"\n=== Available Tissues ({len(tissues)}) ===") + for i, tissue in enumerate(tissues, 1): + print(f"{i:2d}. {tissue}") + print() + return + + if args.show_genes: + analyzer.show_tissue_genes(args.show_genes, args.n_genes) + return + + # Handle correlation analysis + if len(args.genes) != 2: + parser.error( + "Exactly two genes are required for correlation analysis. " + "Use --show-genes to see available genes, or --list-tissues to see available tissues." + ) + + if not args.tissue: + parser.error( + "Tissue is required for correlation analysis. " + "Use --list-tissues to see available tissues." + ) + + gene1, gene2 = args.genes + results = analyzer.compute_gene_pair_correlations(gene1, gene2, args.tissue) + + # Save results to files if output directory provided + saved_files = None + if output_dir: + try: + saved_files = save_results(results, output_dir) + logging.info(f"Results saved to output directory: {output_dir}") + except Exception as e: + logging.error(f"Failed to save results: {e}") + + # Print results + print("\n" + "="*60) + print("GENE PAIR CORRELATION RESULTS") + print("="*60) + print(f"Gene 1: {results['gene1_symbol']} ({results['gene1_ensembl_id']})") + print(f"Gene 2: {results['gene2_symbol']} ({results['gene2_ensembl_id']})") + print(f"Tissue: {results['tissue']}") + print(f"Samples: {results['n_samples']:,}") + print("-" * 60) + + for method in ['ccc', 'pearson', 'spearman']: + value = results.get(method) + if value is not None: + print(f"{method.upper():>12}: {value:.6f}") + else: + print(f"{method.upper():>12}: Failed to compute") + + print("="*60) + + # Show saved files info + if saved_files: + print(f"Results saved to:") + print(f" JSON: {saved_files[0].name}") + print(f" Pickle: {saved_files[1].name}") + + if log_file: + print(f"Log file: {log_file.name}") + + print() + + # Also return as dict for programmatic use + return results + + except Exception as e: + 
def find_expression_files(expr_data_dir, include_patterns=None, exclude_patterns=None, quiet=False):
    """Find expression files matching include/exclude patterns.

    Only ``*.pkl`` files named ``gtex_v8_data_<tissue>.pkl`` are considered;
    ``<tissue>`` becomes the tissue name. Include/exclude patterns are regular
    expressions searched case-insensitively in both the tissue name and the
    file name.

    Args:
        expr_data_dir: Directory containing the expression pickle files.
        include_patterns: Optional iterable of regexes; when given, only files
            matching at least one of them are kept.
        exclude_patterns: Optional iterable of regexes; files matching any of
            them are dropped (applied after the include filter).
        quiet: When True, do not print the list of selected files.

    Returns:
        List of ``(file_path, tissue_name)`` tuples, sorted by path so the
        processing order does not depend on the file system.

    Raises:
        FileNotFoundError: If the directory does not exist or holds no file
            with the expected name.
    """
    expr_data_dir = Path(expr_data_dir)

    if not expr_data_dir.exists():
        raise FileNotFoundError(f"Expression data directory not found: {expr_data_dir}")

    # Find all .pkl files with the expected pattern
    filename_regex = re.compile(r"gtex_v8_data_(.+)\.pkl$")
    all_files = []
    # glob() yields entries in arbitrary order; sort for reproducible runs.
    for file_path in sorted(expr_data_dir.glob("*.pkl")):
        match = filename_regex.match(file_path.name)
        if match:
            all_files.append((file_path, match.group(1)))

    if not all_files:
        raise FileNotFoundError(
            f"No matching expression files found in {expr_data_dir}"
        )

    def _matches_any(patterns, file_path, tissue_name):
        """Return True if any regex hits the tissue name or the file name."""
        # re.IGNORECASE instead of lower-casing the pattern: lower-casing
        # would silently turn escapes such as \D or \S into \d or \s.
        return any(
            re.search(user_pattern, tissue_name, re.IGNORECASE)
            or re.search(user_pattern, file_path.name, re.IGNORECASE)
            for user_pattern in patterns
        )

    # Apply include patterns
    if include_patterns:
        all_files = [
            (file_path, tissue_name)
            for file_path, tissue_name in all_files
            if _matches_any(include_patterns, file_path, tissue_name)
        ]

    # Apply exclude patterns
    if exclude_patterns:
        all_files = [
            (file_path, tissue_name)
            for file_path, tissue_name in all_files
            if not _matches_any(exclude_patterns, file_path, tissue_name)
        ]

    if not quiet:
        print(f"Found {len(all_files)} expression files to process:")
        for file_path, tissue_name in all_files:
            print(f" {tissue_name}: {file_path.name}")

    return all_files


def load_metadata_and_gene_map(data_dir, quiet=False):
    """Load the sample metadata and the gene ID/symbol mapping.

    Reads ``gtex_v8-sample_metadata.pkl`` and
    ``gtex_gene_id_symbol_mappings.pkl`` from ``data_dir``.

    Args:
        data_dir: Directory holding both pickle files.
        quiet: When True, suppress the progress messages.

    Returns:
        Tuple ``(gtex_metadata, gene_map)`` with the unpickled objects
        (both are expected to expose ``.shape``, which is printed).

    Raises:
        FileNotFoundError: If either file is missing.
    """
    base_dir = Path(data_dir)
    metadata_file = base_dir / "gtex_v8-sample_metadata.pkl"
    gene_map_file = base_dir / "gtex_gene_id_symbol_mappings.pkl"

    # Fail before loading anything if a required input is absent.
    for file_path in (metadata_file, gene_map_file):
        if not file_path.exists():
            raise FileNotFoundError(f"Required file not found: {file_path}")

    if not quiet:
        print("Loading metadata and gene mapping files...")

    # NOTE(review): unpickling executes code embedded in the file; only point
    # data_dir at trusted locations.
    gtex_metadata = pd.read_pickle(metadata_file)
    gene_map = pd.read_pickle(gene_map_file)

    if not quiet:
        print(f"Loaded metadata: {gtex_metadata.shape}")
        print(f"Loaded gene mapping: {gene_map.shape}")

    return gtex_metadata, gene_map
def setup_summary_logger(gene_symbols, output_dir):
    """Set up the file logger used for the main function summary.

    The logger is named ``"summary"`` and writes INFO-level records to
    ``_<gene1>_<gene2>_..._summary_execution.log`` inside ``output_dir``,
    truncating any existing file of that name.

    Args:
        gene_symbols: Iterable of gene symbols; joined with ``_`` to build the
            log file name.
        output_dir: ``pathlib.Path`` of an existing directory for the log file.

    Returns:
        Tuple ``(logger, log_file)`` with the configured logger and the path
        of its log file.
    """
    logger = logging.getLogger("summary")

    # getLogger() returns the same object for the same name, so an earlier
    # call may have left handlers attached. Close them before detaching:
    # clearing the list alone would leak their open log files.
    for stale_handler in list(logger.handlers):
        stale_handler.close()
        logger.removeHandler(stale_handler)

    # Set level
    logger.setLevel(logging.INFO)

    # Create file handler
    genes_connected = "_".join(gene_symbols)
    log_file = output_dir / f"_{genes_connected}_summary_execution.log"
    file_handler = logging.FileHandler(log_file, mode="w")
    file_handler.setLevel(logging.INFO)

    # Create formatter
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    )

    # Add handler to logger
    logger.addHandler(file_handler)

    return logger, log_file


def log_and_print(message, logger=None, summary_file=None, quiet=None):
    """Print a message and mirror it to a logger and/or a summary file.

    Args:
        message: Text to emit.
        logger: Optional logger; when given, the message is logged at INFO.
        summary_file: Optional writable text stream; when given, the message
            plus a newline is written and flushed immediately.
        quiet: When True, skip printing to stdout (logging and the summary
            file are unaffected). ``None`` falls back to the module-level
            ``QUIET_MODE`` flag.
    """
    # Use global quiet mode if not explicitly specified
    suppress_stdout = QUIET_MODE if quiet is None else quiet

    if not suppress_stdout:
        print(message)
    if logger:
        logger.info(message)
    if summary_file:
        summary_file.write(message + "\n")
        summary_file.flush()  # Ensure immediate write to disk
def _release_logger_handlers(logger):
    """Close and detach every handler currently attached to ``logger``."""
    # Iterate over a snapshot: removeHandler() mutates logger.handlers, and
    # removing while iterating the live list would skip handlers.
    for handler in list(logger.handlers):
        handler.close()
        logger.removeHandler(handler)


def _metadata_result_row(column, status, ccc_value=np.nan, p_value=np.nan):
    """Build one per-metadata-column result record."""
    return {
        "metadata_column": column,
        "ccc_value": ccc_value,
        "p_value": p_value,
        "status": status,
    }


def compute_correlations_for_tissue(
    gene_symbol,
    tissue_name,
    expr_file_path,
    gtex_metadata,
    gene_map,
    output_dir,
    pvalue_n_perms=1000000,
    n_jobs=1,
    no_individual_logs=False,
):
    """Compute correlation between gene expression and all metadata columns for a specific tissue.

    The expression pickle is read with genes on the index and sample IDs on
    the columns; only samples that are also present in ``gtex_metadata.index``
    are used. Each metadata column is passed to ``ccc`` together with the
    gene's expression values. Columns that are entirely NaN or have at most
    one distinct non-NaN value are skipped, and a failure in one column is
    recorded without aborting the remaining columns.

    Args:
        gene_symbol: Gene symbol to analyze; resolved to a gene ID through
            ``gene_map``.
        tissue_name: Tissue label used in log messages and stored in the
            result table.
        expr_file_path: ``pathlib.Path`` of the tissue expression pickle.
        gtex_metadata: Sample metadata table indexed by sample ID.
        gene_map: Gene ID/symbol mapping table handed to ``get_gene_id``.
        output_dir: Directory for the per-tissue log file.
        pvalue_n_perms: Number of permutations forwarded to ``ccc``.
        n_jobs: Number of parallel jobs forwarded to ``ccc``.
        no_individual_logs: When True, no per-tissue log file is written.

    Returns:
        Tuple ``(results_df, gene_id)``. ``results_df`` is indexed by
        ``metadata_column`` with columns ``ccc_value``, ``p_value``,
        ``status`` (``"success"``, ``"all_nan"``, ``"insufficient_variation"``
        or ``"error: ..."``), ``tissue``, ``gene_symbol``, ``gene_id`` and
        ``n_samples``. It is ``None`` when the gene is absent from the tissue
        data or when no sample is shared with the metadata.

    Raises:
        Exception: Whatever ``get_gene_id`` or ``pd.read_pickle`` raise is
            propagated unchanged, after the tissue log handlers are closed.
    """
    # Set up logging for this tissue
    logger, log_file = setup_tissue_logger(
        gene_symbol, tissue_name, output_dir, no_individual_logs
    )

    # try/finally guarantees the per-tissue log file is closed on every exit
    # path (early returns and exceptions included), not only on success.
    try:
        log_and_print(f"\n{'='*60}", logger)
        log_and_print(f"Processing tissue: {tissue_name}", logger)
        log_and_print(f"File: {expr_file_path.name}", logger)
        log_and_print(f"Log file: {log_file}", logger)
        log_and_print(f"{'='*60}", logger)

        # Load expression data
        log_and_print("Loading expression data...", logger)
        expr_data = pd.read_pickle(expr_file_path)
        log_and_print(f"Expression data shape: {expr_data.shape}", logger)

        # Get gene ID
        gene_id = get_gene_id(gene_symbol, gene_map)
        log_and_print(f"Gene ID for {gene_symbol}: {gene_id}", logger)

        # Check if gene exists in this tissue
        if gene_id not in expr_data.index:
            log_and_print(
                f"Warning: Gene ID '{gene_id}' not found in {tissue_name} expression data",
                logger,
            )
            return None, gene_id

        # Get sample IDs from expression data
        sample_ids = expr_data.columns
        log_and_print(f"Number of samples: {len(sample_ids)}", logger)

        # Get gene expression data
        gene_expr_row = expr_data.loc[gene_id]

        # Keep only samples that exist in both datasets
        common_samples = sample_ids.intersection(gtex_metadata.index)
        if len(common_samples) == 0:
            log_and_print(
                f"Warning: No common samples found between {tissue_name} expression data and metadata",
                logger,
            )
            return None, gene_id

        log_and_print(f"Common samples: {len(common_samples)}", logger)

        # Filter to common samples (same order for both inputs)
        gene_expr_filtered = gene_expr_row.loc[common_samples]
        sample_metadata = gtex_metadata.loc[common_samples]
        n_columns = len(sample_metadata.columns)

        log_and_print(
            f"Computing CCC between {gene_symbol} expression and all metadata columns...",
            logger,
        )
        log_and_print(f"Using {pvalue_n_perms} permutations and {n_jobs} jobs", logger)
        log_and_print(f"Processing {n_columns} metadata columns...", logger)

        results = []

        # Iterate through all metadata columns
        for i, column in enumerate(sample_metadata.columns, 1):
            log_and_print(f"Processing column {i}/{n_columns}: {column}", logger)

            try:
                metadata_vector = sample_metadata[column]

                # Skip columns with all NaN values
                if metadata_vector.isna().all():
                    log_and_print(f" Skipping {column}: all values are NaN", logger)
                    results.append(_metadata_result_row(column, "all_nan"))
                    continue

                # Skip columns with only one unique value (after removing NaN)
                unique_values = metadata_vector.dropna().nunique()
                if unique_values <= 1:
                    log_and_print(
                        f" Skipping {column}: only {unique_values} unique value(s)", logger
                    )
                    results.append(
                        _metadata_result_row(column, "insufficient_variation")
                    )
                    continue

                # Compute CCC (suppress numpy warnings during computation)
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", RuntimeWarning)
                    ccc_val, ccc_pval = ccc(
                        gene_expr_filtered,
                        metadata_vector,
                        pvalue_n_perms=pvalue_n_perms,
                        n_jobs=n_jobs,
                    )

                results.append(
                    _metadata_result_row(column, "success", ccc_val, ccc_pval)
                )
                log_and_print(f" CCC: {ccc_val:.6f}, p-value: {ccc_pval:.2e}", logger)

            except Exception as e:
                # Best effort: record the failure and continue with the
                # remaining metadata columns.
                log_and_print(f" Error processing {column}: {e}", logger)
                results.append(_metadata_result_row(column, f"error: {str(e)}"))

        # Convert to DataFrame with metadata column names as index. Naming the
        # columns keeps an empty result (no metadata columns) well-formed
        # instead of failing in set_index().
        results_df = pd.DataFrame(
            results, columns=["metadata_column", "ccc_value", "p_value", "status"]
        )
        results_df.set_index("metadata_column", inplace=True)

        # Add tissue information
        results_df["tissue"] = tissue_name
        results_df["gene_symbol"] = gene_symbol
        results_df["gene_id"] = gene_id
        results_df["n_samples"] = len(common_samples)

        # Log completion
        successful_analyses = results_df[results_df["status"] == "success"]
        log_and_print(f"\nCompleted processing {tissue_name}:", logger)
        log_and_print(f" Total metadata columns: {len(results_df)}", logger)
        log_and_print(f" Successful analyses: {len(successful_analyses)}", logger)
        log_and_print(
            f" Skipped/Failed: {len(results_df) - len(successful_analyses)}", logger
        )

        return results_df, gene_id
    finally:
        # Close the logger
        _release_logger_handlers(logger)
action="store_true", + help="List available metadata columns and exit", + ) + + parser.add_argument( + "--list-tissues", + action="store_true", + help="List available tissue files and exit", + ) + + parser.add_argument( + "--output-dir", + default=".", + help="Directory to save output files (default: current directory)", + ) + + parser.add_argument( + "--quiet", + action="store_true", + help="Reduce output verbosity for batch processing", + ) + + parser.add_argument( + "--no-csv-output", + action="store_true", + help="Skip CSV file generation (only create pickle files)", + ) + + parser.add_argument( + "--no-individual-logs", + action="store_true", + help="Skip individual tissue log files (only keep summary logs)", + ) + + parser.add_argument( + "--data-dir", + default="/mnt/data/proj_data/ccc-gpu/data/tutorial", + help="Directory containing GTEx data files (metadata and gene mappings)", + ) + + args = parser.parse_args() + + # Set global quiet mode + global QUIET_MODE + QUIET_MODE = args.quiet + + try: + # Create output directory if it doesn't exist + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Set up summary logger + summary_logger, summary_log_file = setup_summary_logger( + args.gene_symbols, output_dir + ) + + # Set up summary tables file + genes_connected = "_".join(args.gene_symbols) + summary_tables_file_path = output_dir / f"_{genes_connected}_summary_tables.log" + summary_tables_file = open(summary_tables_file_path, "w") + + log_and_print(f"Output directory: {output_dir.absolute()}", summary_logger) + log_and_print(f"Summary log file: {summary_log_file}", summary_logger) + log_and_print( + f"Summary tables file: {summary_tables_file_path}", summary_logger + ) + log_and_print( + f"Gene symbols to analyze: {', '.join(args.gene_symbols)}", summary_logger + ) + + # Find expression files + expression_files = find_expression_files( + args.expr_data_dir, + include_patterns=args.include, + exclude_patterns=args.exclude, 
+ quiet=args.quiet, + ) + + # If user wants to list tissues + if args.list_tissues: + log_and_print( + f"Available expression files in {args.expr_data_dir}:", summary_logger + ) + for file_path, tissue_name in expression_files: + log_and_print(f" {tissue_name}: {file_path.name}", summary_logger) + summary_tables_file.close() + return + + # Load metadata and gene mapping + gtex_metadata, gene_map = load_metadata_and_gene_map(args.data_dir, quiet=args.quiet) + + # If user wants to list metadata columns + if args.list_metadata_columns: + log_and_print("Available metadata columns:", summary_logger) + for col in sorted(gtex_metadata.columns): + log_and_print(f" {col}", summary_logger) + summary_tables_file.close() + return + + # Process each gene symbol + all_genes_results = {} + total_start_time = time.time() + + for gene_idx, gene_symbol in enumerate(args.gene_symbols, 1): + log_and_print(f"\n{'='*100}", summary_logger) + log_and_print( + f"PROCESSING GENE {gene_idx}/{len(args.gene_symbols)}: {gene_symbol}", + summary_logger, + ) + log_and_print(f"{'='*100}", summary_logger) + + # Process each tissue for this gene + all_results = {} + gene_id = None + tissue_runtimes = {} + gene_start_time = time.time() + + for i, (expr_file_path, tissue_name) in enumerate(expression_files, 1): + log_and_print( + f"\n[{i}/{len(expression_files)}] Starting processing for {gene_symbol} in {tissue_name}...", + summary_logger, + ) + tissue_start_time = time.time() + + try: + results_df, current_gene_id = compute_correlations_for_tissue( + gene_symbol, + tissue_name, + expr_file_path, + gtex_metadata, + gene_map, + output_dir, + args.permutations, + args.n_jobs, + args.no_individual_logs, + ) + + tissue_end_time = time.time() + tissue_runtime = tissue_end_time - tissue_start_time + tissue_runtimes[tissue_name] = tissue_runtime + + if results_df is not None: + all_results[tissue_name] = results_df + gene_id = current_gene_id + + # Save individual tissue results + output_file = ( + 
output_dir + / f"{gene_symbol}_{tissue_name}_correlation_results.pkl" + ) + log_file = output_dir / f"{gene_symbol}_{tissue_name}.log" + results_df.to_pickle(output_file) + log_and_print( + f"Results for {gene_symbol} in {tissue_name} saved to: {output_file}", + summary_logger, + ) + if not args.no_individual_logs and log_file: + log_and_print( + f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}", + summary_logger, + ) + log_and_print( + f"Runtime for {gene_symbol} in {tissue_name}: {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)", + summary_logger, + ) + else: + log_file = output_dir / f"{gene_symbol}_{tissue_name}.log" + log_and_print( + f"No results generated for {gene_symbol} in {tissue_name}", + summary_logger, + ) + if not args.no_individual_logs and log_file: + log_and_print( + f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}", + summary_logger, + ) + log_and_print( + f"Runtime for {gene_symbol} in {tissue_name}: {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)", + summary_logger, + ) + + except Exception as e: + tissue_end_time = time.time() + tissue_runtime = tissue_end_time - tissue_start_time + tissue_runtimes[tissue_name] = tissue_runtime + log_file = output_dir / f"{gene_symbol}_{tissue_name}.log" + log_and_print( + f"Error processing {gene_symbol} in {tissue_name}: {e}", + summary_logger, + ) + if not args.no_individual_logs and log_file: + log_and_print( + f"Log file for {gene_symbol} in {tissue_name} saved to: {log_file}", + summary_logger, + ) + log_and_print( + f"Runtime for {gene_symbol} in {tissue_name} (failed): {tissue_runtime:.2f} seconds ({tissue_runtime/60:.2f} minutes)", + summary_logger, + ) + continue + + # Gene-level summary + gene_end_time = time.time() + gene_runtime = gene_end_time - gene_start_time + + if not all_results: + log_and_print( + f"No successful analyses completed for {gene_symbol}.", + summary_logger, + ) + log_and_print( + f"Runtime for {gene_symbol}: 
{gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)", + summary_logger, + ) + continue + + # Store results for this gene + all_genes_results[gene_symbol] = { + "results": all_results, + "gene_id": gene_id, + "tissue_runtimes": tissue_runtimes, + "gene_runtime": gene_runtime, + } + + # Save combined results for this gene + combined_results = pd.concat(all_results.values(), ignore_index=False) + combined_output_file = ( + output_dir / f"{gene_symbol}_all_tissues_correlation_results.pkl" + ) + combined_results.to_pickle(combined_output_file) + if not args.no_csv_output: + combined_csv_file = ( + output_dir / f"{gene_symbol}_all_tissues_correlation_results.csv" + ) + combined_results.to_csv(combined_csv_file) + + # Gene-specific summary + log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file) + log_and_print( + "COMBINED RESULTS SUMMARY", summary_logger, summary_tables_file + ) + log_and_print(f"{'='*80}", summary_logger, summary_tables_file) + log_and_print( + f"Gene Symbol: {gene_symbol}", summary_logger, summary_tables_file + ) + log_and_print(f"Gene ID: {gene_id}", summary_logger, summary_tables_file) + log_and_print( + f"Permutations: {args.permutations:,}", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Tissues processed: {len(all_results)}", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Combined results saved to: {combined_output_file}", + summary_logger, + summary_tables_file, + ) + if not args.no_csv_output: + log_and_print( + f"Combined results (CSV) saved to: {combined_csv_file}", + summary_logger, + summary_tables_file, + ) + + # Show summary statistics for this gene + successful_analyses = combined_results[ + combined_results["status"] == "success" + ] + if len(successful_analyses) > 0: + log_and_print( + f"\nTotal successful analyses across all tissues: {len(successful_analyses)}", + summary_logger, + summary_tables_file, + ) + + log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file) + 
log_and_print( + "TOP CORRELATIONS ACROSS ALL TISSUES (by absolute CCC value)", + summary_logger, + summary_tables_file, + ) + log_and_print(f"{'='*80}", summary_logger, summary_tables_file) + + # Sort by absolute CCC value (descending) - simplified approach + successful_analyses_copy = successful_analyses.copy() + successful_analyses_copy["abs_ccc"] = successful_analyses_copy[ + "ccc_value" + ].abs() + top_results = successful_analyses_copy.sort_values( + "abs_ccc", ascending=False + ) + + # Display top results + log_and_print( + f"{'Tissue':<20} {'Metadata Column':<25} {'CCC Value':<12} {'P-value':<12} {'Significance':<15}", + summary_logger, + summary_tables_file, + ) + log_and_print("-" * 90, summary_logger, summary_tables_file) + + for idx, row in top_results.head(20).iterrows(): + tissue = row["tissue"] + ccc_val = row["ccc_value"] + p_val = row["p_value"] + + # Determine significance + if p_val < 0.001: + significance = "***" + elif p_val < 0.01: + significance = "**" + elif p_val < 0.05: + significance = "*" + else: + significance = "ns" + + log_and_print( + f"{tissue:<20} {idx:<25} {ccc_val:>10.6f} {p_val:>10.2e} {significance:<15}", + summary_logger, + summary_tables_file, + ) + + # Summary by tissue for this gene + log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file) + log_and_print("SUMMARY BY TISSUE", summary_logger, summary_tables_file) + log_and_print(f"{'='*80}", summary_logger, summary_tables_file) + + log_and_print( + f"{'Tissue':<20} {'N Samples':<10} {'Successful':<12} {'Mean |CCC|':<12} {'Max |CCC|':<12}", + summary_logger, + summary_tables_file, + ) + log_and_print("-" * 70, summary_logger, summary_tables_file) + + for tissue_name in sorted(all_results.keys()): + tissue_results = all_results[tissue_name] + tissue_successful = tissue_results[ + tissue_results["status"] == "success" + ] + n_samples = ( + tissue_results["n_samples"].iloc[0] + if len(tissue_results) > 0 + else 0 + ) + + if len(tissue_successful) > 0: + mean_ccc = 
tissue_successful["ccc_value"].abs().mean() + max_ccc = tissue_successful["ccc_value"].abs().max() + log_and_print( + f"{tissue_name:<20} {n_samples:<10} {len(tissue_successful):<12} {mean_ccc:<12.6f} {max_ccc:<12.6f}", + summary_logger, + summary_tables_file, + ) + else: + log_and_print( + f"{tissue_name:<20} {n_samples:<10} {'0':<12} {'N/A':<12} {'N/A':<12}", + summary_logger, + summary_tables_file, + ) + + # Runtime summary for this gene + log_and_print(f"\n{'='*80}", summary_logger, summary_tables_file) + log_and_print("RUNTIME SUMMARY", summary_logger, summary_tables_file) + log_and_print(f"{'='*80}", summary_logger, summary_tables_file) + log_and_print( + f"Total runtime: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Average runtime per tissue: {gene_runtime/len(expression_files):.2f} seconds", + summary_logger, + summary_tables_file, + ) + + log_and_print("\nRuntime by tissue:", summary_logger, summary_tables_file) + log_and_print( + f"{'Tissue':<25} {'Runtime (sec)':<15} {'Runtime (min)':<15} {'Status':<10}", + summary_logger, + summary_tables_file, + ) + log_and_print("-" * 70, summary_logger, summary_tables_file) + + for tissue_name in sorted(tissue_runtimes.keys()): + runtime = tissue_runtimes[tissue_name] + status = "Success" if tissue_name in all_results else "Failed" + log_and_print( + f"{tissue_name:<25} {runtime:<15.2f} {runtime/60:<15.2f} {status:<10}", + summary_logger, + summary_tables_file, + ) + + if tissue_runtimes: + # Find fastest and slowest tissues + fastest_tissue = min(tissue_runtimes.items(), key=lambda x: x[1]) + slowest_tissue = max(tissue_runtimes.items(), key=lambda x: x[1]) + + log_and_print( + f"\nFastest: {fastest_tissue[0]} ({fastest_tissue[1]:.2f} seconds)", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Slowest: {slowest_tissue[0]} ({slowest_tissue[1]:.2f} seconds)", + summary_logger, + summary_tables_file, + ) + log_and_print( 
+ f"Speed ratio: {slowest_tissue[1]/fastest_tissue[1]:.1f}x", + summary_logger, + summary_tables_file, + ) + + log_and_print( + f"Runtime for {gene_symbol}: {gene_runtime:.2f} seconds ({gene_runtime/60:.2f} minutes)", + summary_logger, + ) + + total_end_time = time.time() + total_runtime = total_end_time - total_start_time + + if not all_genes_results: + log_and_print( + "No successful analyses completed for any gene.", summary_logger + ) + summary_tables_file.close() + return + + # Create overall summary + log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file) + log_and_print("OVERALL RESULTS SUMMARY", summary_logger, summary_tables_file) + log_and_print(f"{'='*100}", summary_logger, summary_tables_file) + log_and_print( + f"Gene symbols processed: {', '.join(all_genes_results.keys())}", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Total genes: {len(all_genes_results)}", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Permutations: {args.permutations:,}", summary_logger, summary_tables_file + ) + log_and_print( + f"Tissues per gene: {len(expression_files)}", + summary_logger, + summary_tables_file, + ) + + # Combine all results across genes + all_combined_results = [] + for gene_symbol, gene_data in all_genes_results.items(): + gene_combined = pd.concat(gene_data["results"].values(), ignore_index=False) + all_combined_results.append(gene_combined) + + mega_combined_results = pd.concat(all_combined_results, ignore_index=False) + + # Save mega combined results + mega_output_file = output_dir / "_all_genes_all_tissues_correlation_results.pkl" + mega_combined_results.to_pickle(mega_output_file) + log_and_print( + f"All genes combined results saved to: {mega_output_file}", + summary_logger, + summary_tables_file, + ) + + # Also save as CSV for easy viewing (if not disabled) + if not args.no_csv_output: + mega_csv_file = output_dir / "_all_genes_all_tissues_correlation_results.csv" + 
mega_combined_results.to_csv(mega_csv_file) + log_and_print( + f"All genes combined results (CSV) saved to: {mega_csv_file}", + summary_logger, + summary_tables_file, + ) + + # List all log files created (if individual logs are enabled) + if not args.no_individual_logs: + log_and_print("\nLog files created:", summary_logger) + for gene_symbol in all_genes_results.keys(): + for tissue_name in [name for _, name in expression_files]: + log_file = output_dir / f"{gene_symbol}_{tissue_name}.log" + if log_file.exists(): + log_and_print( + f" {gene_symbol} - {tissue_name}: {log_file}", summary_logger + ) + + # Show summary statistics across all genes and tissues + successful_analyses = mega_combined_results[ + mega_combined_results["status"] == "success" + ] + if len(successful_analyses) > 0: + log_and_print( + f"\nTotal successful analyses across all genes and tissues: {len(successful_analyses)}", + summary_logger, + summary_tables_file, + ) + + log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file) + log_and_print( + "TOP CORRELATIONS ACROSS ALL GENES AND TISSUES (by absolute CCC value)", + summary_logger, + summary_tables_file, + ) + log_and_print(f"{'='*100}", summary_logger, summary_tables_file) + + # Sort by absolute CCC value (descending) - simplified approach + successful_analyses_copy = successful_analyses.copy() + successful_analyses_copy["abs_ccc"] = successful_analyses_copy[ + "ccc_value" + ].abs() + top_results = successful_analyses_copy.sort_values( + "abs_ccc", ascending=False + ) + + # Display top results + log_and_print( + f"{'Gene':<12} {'Tissue':<20} {'Metadata Column':<25} {'CCC Value':<12} {'P-value':<12} {'Significance':<15}", + summary_logger, + summary_tables_file, + ) + log_and_print("-" * 110, summary_logger, summary_tables_file) + + for idx, row in top_results.head(30).iterrows(): + gene = row["gene_symbol"] + tissue = row["tissue"] + ccc_val = row["ccc_value"] + p_val = row["p_value"] + + # Determine significance + if p_val < 0.001: 
+ significance = "***" + elif p_val < 0.01: + significance = "**" + elif p_val < 0.05: + significance = "*" + else: + significance = "ns" + + log_and_print( + f"{gene:<12} {tissue:<20} {idx:<25} {ccc_val:>10.6f} {p_val:>10.2e} {significance:<15}", + summary_logger, + summary_tables_file, + ) + + # Summary by gene + log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file) + log_and_print("SUMMARY BY GENE", summary_logger, summary_tables_file) + log_and_print(f"{'='*100}", summary_logger, summary_tables_file) + + for gene_symbol, gene_data in all_genes_results.items(): + gene_combined = pd.concat( + gene_data["results"].values(), ignore_index=False + ) + gene_successful = gene_combined[gene_combined["status"] == "success"] + + log_and_print( + f"\nGene: {gene_symbol} (ID: {gene_data['gene_id']})", + summary_logger, + summary_tables_file, + ) + log_and_print( + f" Tissues processed: {len(gene_data['results'])}", + summary_logger, + summary_tables_file, + ) + log_and_print( + f" Successful analyses: {len(gene_successful)}", + summary_logger, + summary_tables_file, + ) + + if len(gene_successful) > 0: + mean_ccc = gene_successful["ccc_value"].abs().mean() + max_ccc = gene_successful["ccc_value"].abs().max() + log_and_print( + f" Mean |CCC|: {mean_ccc:.6f}", + summary_logger, + summary_tables_file, + ) + log_and_print( + f" Max |CCC|: {max_ccc:.6f}", + summary_logger, + summary_tables_file, + ) + + # Top correlation for this gene + gene_successful_copy = gene_successful.copy() + gene_successful_copy["abs_ccc"] = gene_successful_copy[ + "ccc_value" + ].abs() + top_corr = gene_successful_copy.sort_values( + "abs_ccc", ascending=False + ).iloc[0] + log_and_print( + f" Top correlation: {top_corr.name} in {top_corr['tissue']} (CCC: {top_corr['ccc_value']:.6f}, p: {top_corr['p_value']:.2e})", + summary_logger, + summary_tables_file, + ) + + log_and_print( + f" Runtime: {gene_data['gene_runtime']:.2f} seconds ({gene_data['gene_runtime']/60:.2f} minutes)", + 
summary_logger, + summary_tables_file, + ) + + # Summary by tissue across all genes + log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file) + log_and_print( + "SUMMARY BY TISSUE (across all genes)", + summary_logger, + summary_tables_file, + ) + log_and_print(f"{'='*100}", summary_logger, summary_tables_file) + + tissue_summary = {} + for gene_symbol, gene_data in all_genes_results.items(): + for tissue_name, tissue_results in gene_data["results"].items(): + if tissue_name not in tissue_summary: + tissue_summary[tissue_name] = [] + tissue_summary[tissue_name].append(tissue_results) + + log_and_print( + f"{'Tissue':<25} {'N Genes':<10} {'Successful':<12} {'Mean |CCC|':<12} {'Max |CCC|':<12}", + summary_logger, + summary_tables_file, + ) + log_and_print("-" * 75, summary_logger, summary_tables_file) + + for tissue_name in sorted(tissue_summary.keys()): + tissue_all_genes = pd.concat( + tissue_summary[tissue_name], ignore_index=False + ) + tissue_successful = tissue_all_genes[ + tissue_all_genes["status"] == "success" + ] + + if len(tissue_successful) > 0: + mean_ccc = tissue_successful["ccc_value"].abs().mean() + max_ccc = tissue_successful["ccc_value"].abs().max() + log_and_print( + f"{tissue_name:<25} {len(tissue_summary[tissue_name]):<10} {len(tissue_successful):<12} {mean_ccc:<12.6f} {max_ccc:<12.6f}", + summary_logger, + summary_tables_file, + ) + else: + log_and_print( + f"{tissue_name:<25} {len(tissue_summary[tissue_name]):<10} {'0':<12} {'N/A':<12} {'N/A':<12}", + summary_logger, + summary_tables_file, + ) + + # Runtime summary + log_and_print(f"\n{'='*100}", summary_logger, summary_tables_file) + log_and_print("RUNTIME SUMMARY", summary_logger, summary_tables_file) + log_and_print(f"{'='*100}", summary_logger, summary_tables_file) + log_and_print( + f"Total runtime: {total_runtime:.2f} seconds ({total_runtime/60:.2f} minutes)", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Average runtime per gene: 
{total_runtime/len(args.gene_symbols):.2f} seconds", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Total gene-tissue combinations: {len(args.gene_symbols) * len(expression_files)}", + summary_logger, + summary_tables_file, + ) + + # Runtime by gene + log_and_print("\nRuntime by gene:", summary_logger, summary_tables_file) + log_and_print( + f"{'Gene':<15} {'Runtime (sec)':<15} {'Runtime (min)':<15} {'Tissues':<10} {'Successful':<12}", + summary_logger, + summary_tables_file, + ) + log_and_print("-" * 75, summary_logger, summary_tables_file) + + for gene_symbol, gene_data in all_genes_results.items(): + successful_tissues = len(gene_data["results"]) + log_and_print( + f"{gene_symbol:<15} {gene_data['gene_runtime']:<15.2f} {gene_data['gene_runtime']/60:<15.2f} {len(expression_files):<10} {successful_tissues:<12}", + summary_logger, + summary_tables_file, + ) + + # Aggregate tissue runtime statistics across all genes + all_tissue_runtimes = {} + for gene_symbol, gene_data in all_genes_results.items(): + for tissue_name, runtime in gene_data["tissue_runtimes"].items(): + if tissue_name not in all_tissue_runtimes: + all_tissue_runtimes[tissue_name] = [] + all_tissue_runtimes[tissue_name].append(runtime) + + if all_tissue_runtimes: + log_and_print( + "\nAverage runtime by tissue (across all genes):", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"{'Tissue':<25} {'Avg Runtime (sec)':<18} {'Avg Runtime (min)':<18} {'N Runs':<8} {'Min':<10} {'Max':<10}", + summary_logger, + summary_tables_file, + ) + log_and_print("-" * 95, summary_logger, summary_tables_file) + + tissue_avg_runtimes = [] + for tissue_name in sorted(all_tissue_runtimes.keys()): + runtimes = all_tissue_runtimes[tissue_name] + avg_runtime = np.mean(runtimes) + min_runtime = np.min(runtimes) + max_runtime = np.max(runtimes) + tissue_avg_runtimes.append((tissue_name, avg_runtime)) + + log_and_print( + f"{tissue_name:<25} {avg_runtime:<18.2f} {avg_runtime/60:<18.2f} 
{len(runtimes):<8} {min_runtime:<10.2f} {max_runtime:<10.2f}", + summary_logger, + summary_tables_file, + ) + + # Find fastest and slowest tissues (by average) + tissue_avg_runtimes.sort(key=lambda x: x[1]) + fastest_tissue = tissue_avg_runtimes[0] + slowest_tissue = tissue_avg_runtimes[-1] + + log_and_print( + f"\nFastest tissue (avg): {fastest_tissue[0]} ({fastest_tissue[1]:.2f} seconds)", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Slowest tissue (avg): {slowest_tissue[0]} ({slowest_tissue[1]:.2f} seconds)", + summary_logger, + summary_tables_file, + ) + log_and_print( + f"Speed ratio: {slowest_tissue[1]/fastest_tissue[1]:.1f}x", + summary_logger, + summary_tables_file, + ) + + # Final message about summary log + log_and_print(f"\nSummary log saved to: {summary_log_file}", summary_logger) + log_and_print( + f"Summary tables saved to: {summary_tables_file_path}", summary_logger + ) + + # Close the summary tables file + summary_tables_file.close() + + # Close the summary logger + for handler in summary_logger.handlers: + handler.close() + summary_logger.removeHandler(handler) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + # Try to close the summary tables file if it was opened + try: + summary_tables_file.close() + except: + pass + sys.exit(1) + + +if __name__ == "__main__": + main()