diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..fb3f7e1 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,83 @@ +name: CI + +on: + push: + branches: [master, main] + pull_request: + branches: [master, main] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff mypy + + - name: Lint with ruff + run: ruff check src/ + + - name: Type check with mypy + run: mypy src/optitype --ignore-missing-imports + + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y glpk-utils + + - name: Install package + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: Run tests + run: pytest tests/ -v + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build twine + + - name: Build package + run: python -m build + + - name: Check package + run: twine check dist/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..68375f2 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,70 @@ +name: Publish to PyPI + +on: + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: 
actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build package + run: python -m build + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish-testpypi: + needs: build + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/optitype + permissions: + id-token: write + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + + publish-pypi: + needs: publish-testpypi + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/optitype + permissions: + id-token: write + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.gitignore b/.gitignore index bc6e23a..88b8022 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,57 @@ -# Created by .gitignore support plugin (hsz.mobi) +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg -*.pyc -.gitignore \ No newline at end of file +# Virtual environments +venv/ +ENV/ +env/ +.venv/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.nox/ + +# IDEs +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Project specific +config.ini +test/**/baseline/ +test/**/new/ +test/expected_*.tsv +*.bam +*.sam + +# Claude Code +CLAUDE.md +.claude/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 68b17f2..0000000 --- a/.travis.yml +++ /dev/null @@ -1,37 
+0,0 @@ -dist: bionic -sudo: false - -language: python -python: -- '2.7' -- '3.6' -- '3.7' -- '3.8' -env: COMPILER_NAME=gcc CXX=g++ CC=gcc -addons: - apt: - packages: - - coinor-cbc - -before_script: -- git clone https://github.com/seqan/seqan.git seqan-src - && cd seqan-src - && cmake -DCMAKE_BUILD_TYPE=Release - && make razers3 - && cd .. -- pip install six -- pip install future -- pip install numpy -- pip install pyomo -- pip install --upgrade pip -- pip install pysam -- pip install matplotlib -- wget https://support.hdfgroup.org/ftp/HDF5/current18/bin/hdf5-1.8.21-Std-centos7-x86_64-shared_64.tar.gz -- tar xf hdf5-1.8.21-Std-centos7-x86_64-shared_64.tar.gz -- export LD_LIBRARY_PATH=${PWD}/hdf5-1.8.21-Std-centos7-x86_64-shared_64/lib:$LD_LIBRARY_PATH -- export HDF5_DIR=${PWD}/hdf5-1.8.21-Std-centos7-x86_64-shared_64/ -- pip install tables -- pip install cython -- pip install pandas - -script: python OptiTypePipeline.py -i ./test/rna/CRC_81_N_2_fished.fastq --rna -v -o ./test/rna/ -c ./test/test_config.ini diff --git a/OptiTypePipeline.py b/OptiTypePipeline.py deleted file mode 100755 index 8cfbc74..0000000 --- a/OptiTypePipeline.py +++ /dev/null @@ -1,437 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -""" -################################################################### - -OptiType: precision HLA typing from next-generation sequencing data - -################################################################### - -Authors: András Szolek, Benjamin Schubert, Christopher Mohr -Date: August 2017 -Version: 1.3.1 -License: OptiType is released under a three-clause BSD license - - -Introduction: -------------- -OptiType, is a novel HLA genotyping algorithm based on integer linear -programming, capable of producing accurate 4-digit HLA genotyping predictions -from NGS data by simultaneously selecting all minor and major HLA-I alleles. 
- - -Requirements: -------------- -OptiType uses the following software and libraries: -1) Python 2.7 -2) Biopython 1.63 -3) Pyomo 4.1 -4) Matplotlib 1.3.1 -5) Pandas 0.12 (with HDF5 support) -6) HDF5 1.8.11 -7) RazerS 3.1 -8) Cplex 12.5 - -Please make sure you have installed said software/libraries -and their dependencies. - - -Installation: -------------- -First install all required software and libraries and register the static path -in the configuration file for RazerS 3.1. CPLEX should be globally executable -via command line. Alternative ILP solver supported by Cooper are also usable. -Please do not change the folder structure or make sure you changed the necessary -entries in the config file. - - -Usage: -------------- -1) First filter the read files with the following settings: - ->razers3 --percent-identity 90 --max-hits 1 --distance-range 0 - --output-format sam --output sample_fished.sam - ./data/hla_reference.fasta sample.fastq - -where reference.fasta is either nuc_reference.fasta or gen_reference.fasta -depending on the type of NGS data. The references can be found in the ./data -sub-folder or in the supplementary material. To use the results as input -for OptiType the sam-files have to be converted into fastq format. On Unix- -based operating system you can convert from sam to fastq with the following -command: - ->cat sample_fished.sam | grep -v ^@ - | awk '{print "@"$1"\n"$10"\n+\n"$11}' > sample_fished.fastq - -For paired-end data pre-process each file individually. - -2) After pre-filtering, OptiType can be called as follows: - ->python OptiTypePipeline.py -i sample_fished_1.fastq [sample_fished_2.fastq] - (--rna | --dna) [--beta BETA] [--enumerate ENUMERATE] - --o ./out_dir/ - -This will produce a CSV with the optimal typing and possible sub-optimal -typings if specified, as well as a coverage plot of the genotype for -diagnostic purposes and a HTML file containing a summary of the results. 
- ->python OptiTypePipeline.py --help -usage: OptiType [-h] --input INPUT [INPUT ...] (--rna | --dna) [--beta BETA] - [--enumerate ENUMERATE] --outdir OUTDIR [--verbose] - -OptiType: 4-digit HLA typer - -optional arguments: - -h, --help show this help message and exit - --input INPUT [INPUT ...], -i INPUT [INPUT ...] - Fastq files with fished HLA reads. Max two files (for - paired-end) - --rna, -r Specifiying the mapped data as RNA. - --dna, -d Specifiying the mapped data as DNA. - --beta BETA, -b BETA The beta value for for homozygosity detection. - --enumerate ENUMERATE, -e ENUMERATE - The number of enumerations. - --outdir OUTDIR, -o OUTDIR - Specifies the out directory to which all files should - be written - --verbose, -v Set verbose mode on. -""" -from __future__ import print_function -from future import standard_library -standard_library.install_aliases() -from builtins import zip -from builtins import filter -from builtins import map -from builtins import range - -## eliminate dependency on an X11 server -import matplotlib -matplotlib.use('Agg') - -import sys -import subprocess -import os -import argparse -if sys.version_info > (3,0): - import configparser -else: - import ConfigParser as configparser -import time -import datetime -import pandas as pd -import hlatyper as ht - -import numpy as np -from model import OptiType -from collections import defaultdict - -# Try to import pysam, important for RazerS 3 output format -try: - import pysam - PYSAM_AVAILABLE = True -except ImportError: - PYSAM_AVAILABLE = False - - -freq_alleles = '''A*01:01 A*01:02 A*01:03 A*01:06 A*01:09 A*01:23 A*01:38 A*01:44 A*02:01 A*02:02 A*02:03 A*02:04 A*02:05 A*02:06 A*02:07 A*02:08 A*02:09 A*02:10 A*02:11 A*02:12 A*02:13 A*02:133 A*02:14 A*02:141 A*02:146 A*02:15N A*02:16 A*02:17 A*02:18 A*02:19 A*02:20 A*02:21 A*02:22 A*02:226N A*02:24 A*02:25 A*02:26 A*02:27 A*02:28 A*02:29 A*02:30 A*02:33 A*02:34 A*02:35 A*02:36 A*02:37 A*02:38 A*02:40 A*02:42 A*02:44 A*02:45 A*02:46 A*02:48 
A*02:49 A*02:51 A*02:53N A*02:54 A*02:55 A*02:57 A*02:58 A*02:60 A*02:64 A*02:67 A*02:74 A*02:85 A*02:90 A*02:93 A*03:01 A*03:02 A*03:05 A*03:07 A*03:08 A*03:10 A*03:12 A*03:22 A*03:25 A*03:65 A*03:69N A*03:97 A*11:01 A*11:02 A*11:03 A*11:04 A*11:05 A*11:06 A*11:08 A*11:10 A*11:12 A*11:13 A*11:18 A*11:19 A*11:20 A*11:29 A*11:40 A*23:01 A*23:02 A*23:03 A*23:04 A*23:05 A*23:09 A*24:02 A*24:03 A*24:04 A*24:05 A*24:06 A*24:07 A*24:08 A*24:09N A*24:10 A*24:13 A*24:14 A*24:15 A*24:17 A*24:18 A*24:20 A*24:21 A*24:22 A*24:23 A*24:24 A*24:25 A*24:26 A*24:27 A*24:28 A*24:29 A*24:31 A*24:35 A*24:46 A*24:51 A*24:56 A*24:63 A*24:93 A*25:01 A*25:02 A*25:04 A*26:01 A*26:02 A*26:03 A*26:04 A*26:05 A*26:06 A*26:07 A*26:08 A*26:09 A*26:10 A*26:11N A*26:12 A*26:14 A*26:15 A*26:16 A*26:17 A*26:18 A*26:20 A*26:49 A*29:01 A*29:02 A*29:03 A*29:04 A*29:10 A*29:12 A*30:01 A*30:02 A*30:03 A*30:04 A*30:06 A*30:08 A*30:09 A*30:10 A*30:11 A*30:12 A*30:16 A*31:01 A*31:02 A*31:03 A*31:04 A*31:05 A*31:06 A*31:08 A*31:09 A*31:12 A*32:01 A*32:02 A*32:03 A*32:04 A*32:05 A*32:06 A*32:08 A*32:13 A*32:20 A*32:22 A*33:01 A*33:03 A*33:04 A*33:05 A*33:10 A*33:26 A*34:01 A*34:02 A*34:03 A*34:05 A*36:01 A*36:03 A*43:01 A*66:01 A*66:02 A*66:03 A*68:01 A*68:02 A*68:03 A*68:04 A*68:05 A*68:06 A*68:07 A*68:08 A*68:12 A*68:13 A*68:15 A*68:16 A*68:17 A*68:18N A*68:23 A*68:24 A*68:38 A*69:01 A*74:01 A*74:02 A*74:03 A*74:04 A*74:06 A*74:09 A*74:11 A*80:01 A*80:02 B*07:02 B*07:03 B*07:04 B*07:05 B*07:06 B*07:07 B*07:08 B*07:09 B*07:10 B*07:12 B*07:13 B*07:14 B*07:15 B*07:17 B*07:20 B*07:22 B*07:26 B*07:33 B*07:36 B*07:47 B*07:53 B*07:85 B*08:01 B*08:02 B*08:03 B*08:04 B*08:05 B*08:09 B*08:12 B*08:18 B*08:23 B*13:01 B*13:02 B*13:03 B*13:04 B*13:07N B*13:09 B*13:11 B*13:13 B*14:01 B*14:02 B*14:03 B*14:04 B*14:05 B*14:06 B*15:01 B*15:02 B*15:03 B*15:04 B*15:05 B*15:06 B*15:07 B*15:08 B*15:09 B*15:10 B*15:108 B*15:11 B*15:12 B*15:123 B*15:125 B*15:13 B*15:135 B*15:15 B*15:153 B*15:16 B*15:17 B*15:18 B*15:20 B*15:21 
B*15:23 B*15:24 B*15:25 B*15:27 B*15:28 B*15:29 B*15:30 B*15:31 B*15:32 B*15:33 B*15:34 B*15:35 B*15:36 B*15:37 B*15:38 B*15:39 B*15:40 B*15:42 B*15:45 B*15:46 B*15:47 B*15:48 B*15:50 B*15:52 B*15:53 B*15:54 B*15:55 B*15:56 B*15:58 B*15:61 B*15:63 B*15:67 B*15:68 B*15:70 B*15:71 B*15:73 B*15:82 B*15:86 B*18:01 B*18:02 B*18:03 B*18:04 B*18:05 B*18:06 B*18:07 B*18:08 B*18:09 B*18:11 B*18:13 B*18:14 B*18:18 B*18:19 B*18:20 B*18:28 B*18:33 B*27:01 B*27:02 B*27:03 B*27:04 B*27:05 B*27:06 B*27:07 B*27:08 B*27:09 B*27:10 B*27:11 B*27:12 B*27:13 B*27:14 B*27:19 B*27:20 B*27:21 B*27:30 B*27:39 B*35:01 B*35:02 B*35:03 B*35:04 B*35:05 B*35:06 B*35:08 B*35:09 B*35:10 B*35:11 B*35:12 B*35:13 B*35:14 B*35:15 B*35:16 B*35:17 B*35:18 B*35:19 B*35:20 B*35:21 B*35:22 B*35:23 B*35:24 B*35:25 B*35:27 B*35:28 B*35:29 B*35:30 B*35:31 B*35:32 B*35:33 B*35:34 B*35:36 B*35:43 B*35:46 B*35:51 B*35:77 B*35:89 B*37:01 B*37:02 B*37:04 B*37:05 B*38:01 B*38:02 B*38:04 B*38:05 B*38:06 B*38:15 B*39:01 B*39:02 B*39:03 B*39:04 B*39:05 B*39:06 B*39:07 B*39:08 B*39:09 B*39:10 B*39:11 B*39:12 B*39:13 B*39:14 B*39:15 B*39:23 B*39:24 B*39:31 B*39:34 B*40:01 B*40:02 B*40:03 B*40:04 B*40:05 B*40:06 B*40:07 B*40:08 B*40:09 B*40:10 B*40:11 B*40:12 B*40:14 B*40:15 B*40:16 B*40:18 B*40:19 B*40:20 B*40:21 B*40:23 B*40:27 B*40:31 B*40:35 B*40:36 B*40:37 B*40:38 B*40:39 B*40:40 B*40:42 B*40:44 B*40:49 B*40:50 B*40:52 B*40:64 B*40:80 B*41:01 B*41:02 B*41:03 B*42:01 B*42:02 B*44:02 B*44:03 B*44:04 B*44:05 B*44:06 B*44:07 B*44:08 B*44:09 B*44:10 B*44:12 B*44:13 B*44:15 B*44:18 B*44:20 B*44:21 B*44:22 B*44:26 B*44:27 B*44:29 B*44:31 B*44:59 B*45:01 B*45:02 B*45:04 B*45:06 B*46:01 B*46:02 B*46:13 B*47:01 B*47:02 B*47:03 B*48:01 B*48:02 B*48:03 B*48:04 B*48:05 B*48:06 B*48:07 B*48:08 B*49:01 B*49:02 B*49:03 B*50:01 B*50:02 B*50:04 B*50:05 B*51:01 B*51:02 B*51:03 B*51:04 B*51:05 B*51:06 B*51:07 B*51:08 B*51:09 B*51:10 B*51:12 B*51:13 B*51:14 B*51:15 B*51:18 B*51:21 B*51:22 B*51:27N B*51:29 B*51:31 B*51:32 B*51:33 
B*51:34 B*51:36 B*51:37 B*51:63 B*51:65 B*52:01 B*52:02 B*52:06 B*53:01 B*53:02 B*53:03 B*53:04 B*53:05 B*53:07 B*53:08 B*54:01 B*54:02 B*55:01 B*55:02 B*55:03 B*55:04 B*55:07 B*55:08 B*55:10 B*55:12 B*55:16 B*55:46 B*56:01 B*56:02 B*56:03 B*56:04 B*56:05 B*56:06 B*56:07 B*56:09 B*56:11 B*57:01 B*57:02 B*57:03 B*57:04 B*57:05 B*57:06 B*57:10 B*58:01 B*58:02 B*58:06 B*59:01 B*67:01 B*67:02 B*73:01 B*78:01 B*78:02 B*78:03 B*78:05 B*81:01 B*81:02 B*82:01 B*82:02 B*83:01 C*01:02 C*01:03 C*01:04 C*01:05 C*01:06 C*01:08 C*01:14 C*01:17 C*01:30 C*01:32 C*02:02 C*02:03 C*02:04 C*02:06 C*02:08 C*02:09 C*02:10 C*02:19 C*02:20 C*02:27 C*03:02 C*03:03 C*03:04 C*03:05 C*03:06 C*03:07 C*03:08 C*03:09 C*03:10 C*03:13 C*03:14 C*03:15 C*03:16 C*03:17 C*03:19 C*03:21 C*03:32 C*03:42 C*03:43 C*03:56 C*03:67 C*03:81 C*04:01 C*04:03 C*04:04 C*04:05 C*04:06 C*04:07 C*04:10 C*04:11 C*04:14 C*04:15 C*04:24 C*04:29 C*04:33 C*04:37 C*05:01 C*05:03 C*05:04 C*05:07N C*05:09 C*06:02 C*06:03 C*06:04 C*06:06 C*06:08 C*06:09 C*06:17 C*06:24 C*06:53 C*07:01 C*07:02 C*07:03 C*07:04 C*07:05 C*07:06 C*07:07 C*07:08 C*07:09 C*07:10 C*07:109 C*07:123 C*07:13 C*07:14 C*07:17 C*07:18 C*07:19 C*07:21 C*07:22 C*07:27 C*07:29 C*07:32N C*07:37 C*07:43 C*07:46 C*07:49 C*07:56 C*07:66 C*07:67 C*07:68 C*07:80 C*07:95 C*08:01 C*08:02 C*08:03 C*08:04 C*08:05 C*08:06 C*08:13 C*08:15 C*08:20 C*08:21 C*08:27 C*12:02 C*12:03 C*12:04 C*12:05 C*12:07 C*12:12 C*12:15 C*12:16 C*14:02 C*14:03 C*14:04 C*15:02 C*15:03 C*15:04 C*15:05 C*15:06 C*15:07 C*15:08 C*15:09 C*15:11 C*15:12 C*15:13 C*15:17 C*16:01 C*16:02 C*16:04 C*16:08 C*16:09 C*17:01 C*17:02 C*17:03 C*17:04 C*18:01 C*18:02 A*30:07 B*15:64 B*18:12'''.split(' ') - - -def is_frequent(allele_id): - # prepare for HLA12345_HLA67890 and use the first part - allele_id = allele_id.split('_')[0] - return table.loc[allele_id]['4digit'] in freq_alleles and table.loc[allele_id]['flags'] == 0 or (table.loc[allele_id]['locus'] in 'HGJ') - - -def get_4digit(allele_id): - 
allele_id = allele_id.split('_')[0] # for reconstructed IDs like HLA12345_HLA67890 return HLA12345's 4-digit type - return table.loc[allele_id]['4digit'] - - -def get_types(allele_id): - if not isinstance(allele_id, str): - return allele_id - else: - aa = allele_id.split('_') - if len(aa) == 1: - return table.loc[aa[0]]['4digit'] - else: - return table.loc[aa[0]]['4digit'] #+ '/' + table.loc[aa[1]]['4digit'] - -def get_num_threads(configured_threads): - try: - import multiprocessing - except (ImportError, NotImplementedError): - return 2 - if(multiprocessing.cpu_count() < configured_threads): - return multiprocessing.cpu_count() - return configured_threads - - -if __name__ == '__main__': - - this_dir = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - config_default = os.path.join(this_dir, 'config.ini') - - parser = argparse.ArgumentParser(description=' OptiType: 4-digit HLA typer', prog='OptiType') - parser.add_argument('--input','-i', - nargs='+', - required=True, - metavar='FQ', - help=(".fastq file(s) (fished or raw) or .bam files stored for re-use, generated by " - "an earlier OptiType run. One file: single-end mode, two files: paired-end mode.") - ) - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--rna','-r', - action="store_true", - help="Use with RNA sequencing data." - ) - group.add_argument('--dna','-d', - action="store_true", - help="Use with DNA sequencing data." - ) - parser.add_argument('--beta','-b', - type=float, - metavar='B', - default=0.009, - help="The beta value for for homozygosity detection (see paper). Default: 0.009. Handle with care." - ) - parser.add_argument('--enumerate','-e', - type=int, - default=1, - metavar='N', - help=("Number of enumerations. OptiType will output the optimal solution and " - "the top N-1 suboptimal solutions in the results CSV. 
Default: 1") - ) - parser.add_argument('--outdir','-o', - required=True, - help="Specifies the out directory to which all files should be written." - ) - parser.add_argument('--prefix', '-p', - default=None, dest="prefix", type=str, - help="Specifies prefix of output files" - ) - parser.add_argument('--verbose','-v', - required=False, - action="store_true", - help="Set verbose mode on." - ) - parser.add_argument('--config', '-c', - type=argparse.FileType('r'), - default=config_default, - help="Path to config file. Default: config.ini in the same directory as this script" - ) - - args = parser.parse_args() - - if not os.path.isfile(args.config.name): - print ("Config file not found. Place config.ini either alongside this script or use the -c option. " - "See config.ini.example and note that its fields have changed recently.") - sys.exit(-1) - - config = configparser.ConfigParser(os.environ) - config.read(args.config.name) - - unpaired_weight = config.getfloat('behavior', 'unpaired_weight') - use_discordant = config.getboolean('behavior', 'use_discordant') - - if not os.path.exists(args.outdir): - os.makedirs(args.outdir) - - # test if inputs are legit: - if args.beta < 0.0 or args.beta >= 0.1: - print("Beta value is not correctly chosen. Please choose another beta value between [0,0.1]") - sys.exit(-1) - - if args.enumerate <= 0: - print("The specified number of enumerations must be bigger than %i"%args.enumeration) - sys.exit(-1) - - if len(args.input) not in (1, 2): - print("Number of input files can only be 1 (single-end) or 2 (paired-end)") - sys.exit(-1) - - input_extension = args.input[0].split('.')[-1] - assert all(ii.endswith('.' 
+ input_extension) for ii in args.input), 'Mixed input file extensions' - - bam_input = (input_extension in ('sam', 'bam', 'SAM', 'BAM')) # otherwise treated as fastq - - # Constants - VERBOSE = ht.VERBOSE = bool(args.verbose) # set verbosity setting in hlatyper too - COMMAND = "-i 97 -m 99999 --distance-range 0 -pa -tc %d -o %s %s %s" - ALLELE_HDF = os.path.join(this_dir, 'data/alleles.h5') - MAPPING_REF = {'gen': os.path.join(this_dir, 'data/hla_reference_dna.fasta'), - 'nuc': os.path.join(this_dir, 'data/hla_reference_rna.fasta')} - MAPPING_CMD = config.get("mapping", "razers3") + " " + COMMAND - date = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S') - if args.prefix == None: - prefix = date - out_dir = os.path.join(args.outdir, date) - else: - prefix = args.prefix - out_dir = args.outdir - - if not os.path.exists(out_dir): - os.makedirs(out_dir) - - if PYSAM_AVAILABLE: - extension = 'bam' - else: - extension = 'sam' - - bam_paths = args.input if bam_input else [os.path.join(out_dir, ("%s_%i.%s" % (prefix, i+1, extension))) for i in range(len(args.input))] - - # SETUP variables and OUTPUT samples - ref_type = "nuc" if args.rna else "gen" - is_paired = len(args.input) > 1 - - out_csv = os.path.join(out_dir, ("%s_result.tsv" % prefix)) - out_plot = os.path.join(out_dir, ("%s_coverage_plot.pdf" % prefix)) - - # mapping fished file to reference - if not bam_input: - threads = get_num_threads(config.getint("mapping", "threads")) - if VERBOSE: - print("\nmapping with %s threads..." % threads) - for (i, sample), outbam in zip(enumerate(args.input), bam_paths): - if VERBOSE: - print("\n", ht.now(), "Mapping %s to %s reference..." 
% (os.path.basename(sample), ref_type.upper())) - - subprocess.call(MAPPING_CMD % (threads, outbam, - MAPPING_REF[ref_type], sample), shell=True) - - # sam-to-hdf5 - table, features = ht.load_hdf(ALLELE_HDF, False, 'table', 'features') - if VERBOSE: - print("\n", ht.now(), "Generating binary hit matrix.") - - if is_paired: - # combine matrices for paired-end mapping - pos, read_details = ht.pysam_to_hdf(bam_paths[0]) - binary1 = np.sign(pos) # dtype=np.uint16 - - pos2, read_details2 = ht.pysam_to_hdf(bam_paths[1]) - binary2 = np.sign(pos2) # dtype=np.uint16 - - if not bam_input and config.getboolean('behavior', 'deletebam'): - os.remove(bam_paths[0]) - os.remove(bam_paths[1]) - - id1 = set(binary1.index) - id2 = set(binary2.index) - - ''' - test if we actually can do paired-end mapping - 1) look at the last character 2-1 character if they are always the same if so -> proon them away and do - paired-end - 2) if not test if the intersection of ID-binary1 and ID-binary2 has at least 10% of the former read - number -> do paired-end - 3) if nothing worked ( perhaps pair-end ID was in the middle or something) raise flag and do single-end - mapping on first input - ''' - if len(set([r[-1] for r in id1])) == 1 and len(set([r[-1] for r in id2])) == 1: - # if this case is true you have to edit also all pos,etc,desc indices such that the plotting works correctly - # again .. 
maybe it is also neccessary to test for the last two characters - cut_last_char = lambda x: x[:-1] - binary1.index = list(map(cut_last_char, binary1.index)) - binary2.index = list(map(cut_last_char, binary2.index)) - pos.index = list(map(cut_last_char, pos.index)) - pos2.index = list(map(cut_last_char, pos2.index)) - read_details.index = list(map(cut_last_char, read_details.index)) - read_details2.index = list(map(cut_last_char, read_details2.index)) - - binary_p, binary_mis, binary_un = ht.create_paired_matrix(binary1, binary2) - - if binary_p.shape[0] < len(id1) * 0.1: - print(("\nWARNING: Less than 10%% of reads could be paired. Consider an appropriate unpaired_weight setting " - "in your config file (currently %.3f), because you may need to resort to using unpaired reads.") % unpaired_weight) - - if unpaired_weight > 0: - if use_discordant: - binary = pd.concat([binary_p, binary_un, binary_mis]) - else: - binary = pd.concat([binary_p, binary_un]) - else: - binary = binary_p - - else: - pos, read_details = ht.pysam_to_hdf(bam_paths[0]) - - if not bam_input and config.getboolean('behavior', 'deletebam'): - os.remove(bam_paths[0]) - - binary = np.sign(pos) # dtype=np.uint16 - - # dimensionality reduction and typing - - alleles_to_keep = list(filter(is_frequent, binary.columns)) - binary = binary[alleles_to_keep] - - if VERBOSE: - print("\n", ht.now(), 'temporary pruning of identical rows and columns') - unique_col, representing = ht.prune_identical_alleles(binary, report_groups=True) - representing_df = pd.DataFrame([[a1, a2] for a1, a_l in representing.items() for a2 in a_l], - columns=['representative', 'represented']) - - temp_pruned = ht.prune_identical_reads(unique_col) - - if VERBOSE: - print("\n", ht.now(), 'Size of mtx with unique rows and columns:', temp_pruned.shape) - print(ht.now(), 'determining minimal set of non-overshadowed alleles') - - minimal_alleles = ht.prune_overshadowed_alleles(temp_pruned) - - if VERBOSE: - print("\n", ht.now(), 'Keeping 
only the minimal number of required alleles', minimal_alleles.shape) - - binary = binary[minimal_alleles] - - if VERBOSE: - print("\n", ht.now(), 'Creating compact model...') - - if is_paired and unpaired_weight > 0: - if use_discordant: - compact_mtx, compact_occ = ht.get_compact_model(binary_p[minimal_alleles], - pd.concat([binary_un, binary_mis])[minimal_alleles], weight=unpaired_weight) - else: - compact_mtx, compact_occ = ht.get_compact_model(binary_p[minimal_alleles], - binary_un[minimal_alleles], weight=unpaired_weight) - else: - compact_mtx, compact_occ = ht.get_compact_model(binary) - - allele_ids = binary.columns - - groups_4digit = defaultdict(list) - for allele in allele_ids: - type_4digit = get_4digit(allele) - groups_4digit[type_4digit].append(allele) - - sparse_dict = ht.mtx_to_sparse_dict(compact_mtx) - threads = get_num_threads(config.getint("ilp", "threads")) - if VERBOSE: - print("\nstarting ilp solver with %s threads..." % threads) - print("\n", ht.now(), 'Initializing OptiType model...') - - op = OptiType(sparse_dict, compact_occ, groups_4digit, table, args.beta, 2, - config.get("ilp", "solver"), threads, verbosity=VERBOSE) - result = op.solve(args.enumerate) - - if VERBOSE: - print("\n", ht.now(), 'Result dataframe has been constructed...') - - result_4digit = result.applymap(get_types) - for iii in ["A1", "A2", "B1", "B2", "C1", "C2"]: - if not iii in result_4digit: - result_4digit[iii] = None - r = result_4digit[["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"]] - - # write CSV to out. 
And generate plots - r.to_csv(out_csv, sep="\t", - columns=["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"], - header=["A1", "A2", "B1", "B2", "C1", "C2", "Reads", "Objective"]) - - hlatype = result.iloc[0].reindex(["A1", "A2", "B1", "B2", "C1", "C2"]).drop_duplicates().dropna() - features_used = [('intron', 1), ('exon', 2), ('intron', 2), ('exon', 3), ('intron', 3)] \ - if not args.rna else [('exon',2),('exon',3)] - plot_variables = [pos, read_details, pos2, read_details2, (binary_p, binary_un, binary_mis)] if is_paired else [pos, read_details] - coverage_mat = ht.calculate_coverage(plot_variables, features, hlatype, features_used) - ht.plot_coverage(out_plot, coverage_mat, table, features, features_used) diff --git a/README.md b/README.md index 7bcd94b..29db5d3 100644 --- a/README.md +++ b/README.md @@ -1,213 +1,220 @@ -[![Build Status](https://travis-ci.org/FRED-2/OptiType.svg?branch=master)](https://travis-ci.org/FRED-2/OptiType) -OptiType -======== +[![CI](https://github.com/FRED-2/OptiType/actions/workflows/ci.yml/badge.svg)](https://github.com/FRED-2/OptiType/actions/workflows/ci.yml) +[![PyPI version](https://badge.fury.io/py/optitype.svg)](https://badge.fury.io/py/optitype) + +# OptiType Precision HLA typing from next-generation sequencing data -Authors: András Szolek, Benjamin Schubert, Christopher Mohr -Date: April 2014 -Version: 1.3.3 -License: OptiType is released under a three-clause BSD license +**Authors:** András Szolek, Benjamin Schubert, Christopher Mohr, Jonas Scheid +**Version:** 1.5.0 +**License:** BSD-3-Clause +## Introduction -Introduction -------------- OptiType is a novel HLA genotyping algorithm based on integer linear programming, capable of producing accurate 4-digit HLA genotyping predictions from NGS data by simultaneously selecting all major and minor HLA Class I alleles. +## Installation -Requirements -------------- -OptiType uses the following software and libraries: - -1. [Python 2.7](https://www.python.org/) -2. 
[RazerS 3.4](http://www.seqan.de/projects/razers/) -3. [SAMtools 1.2](http://www.htslib.org/) -4. [HDF5 1.8.15](https://www.hdfgroup.org/HDF5/) -5. [CPLEX 12.5](http://www-01.ibm.com/software/commerce/optimization/cplex-optimizer/) - or other Pyomo-supported ILP solver ([GLPK](https://www.gnu.org/software/glpk/), - [CBC](https://projects.coin-or.org/Cbc), ...) +### Via pip (Recommended) -And the following Python modules: +```bash +pip install optitype +``` -1. NumPy 1.9.3 -2. Pyomo 4.2 -3. PyTables 3.2.2 -4. Pandas 0.16.2 -5. Pysam 0.8.3 -6. Matplotlib 1.4.3 -7. Future 0.15.2 +### From source -Note: CPLEX has a proprietary license but is free for academic use. See IBM's -[academic initiative.](http://www-304.ibm.com/ibm/university/academic/pub/page/academic_initiative) +```bash +git clone https://github.com/FRED-2/OptiType.git +cd OptiType +pip install -e . +``` -Installation via Docker ------------------------ +### External Dependencies -1. Install Docker on your computer and make sure it works. +OptiType requires external tools that cannot be installed via pip: -2. Call `docker pull fred2/optitype` which will download the Docker image. +1. **RazerS3** - Read mapper + ```bash + # Via conda/bioconda + conda install -c bioconda razers3 -3. You can use the image as followes: + # Or build from source: https://github.com/seqan/seqan + ``` -`docker run -v /path/to/data/folder:/data/ -t fred2/optitype -i input1 [input2] (-r|-d) -o /data/` +2. **ILP Solver** - At least one of: + - **GLPK** (open source) + ```bash + apt install glpk-utils # Debian/Ubuntu + conda install -c conda-forge glpk # Conda + ``` + - **CBC** (open source) + ```bash + apt install coinor-cbc # Debian/Ubuntu + conda install -c conda-forge coincbc # Conda + ``` + - **CPLEX** (commercial, free for academia) -OptiType uses the CBC-Solver and RazerS3 internally with one thread if no other configuration file is provided. RazerS3's binary can be found at `/usr/local/bin` within the Docker image. 
+### Verify Installation -Installation from Source ------------------------- -1. Install all required software and libraries from the first list. +```bash +optitype check-deps +``` -2. Include SAMtools and your ILP solver in your `PATH` environment variable. -They both have to be globally accessible every time you run OptiType, so make -them permanent (put in in your `.bashrc` or similar shell startup script). +## Quick Start -3. Add HDF5's `lib` directory to your `LD_LIBRARY_PATH`. Make sure it's -permanent too. +### Command Line -4. Create and activate a Python virtual environment with the package -[virtualenv](https://virtualenv.pypa.io/en/latest/). This will automatically -install the package manager `pip` which you will need for the next steps. -Always run OptiType from this virtual environment. +```bash +# DNA sequencing (paired-end) +optitype run -i reads_1.fastq -i reads_2.fastq --dna -o results/ -5. Install NumPy, Pyomo, Pysam and Matplotlib with -the following commands: +# RNA sequencing (single-end) +optitype run -i sample.fastq --rna -o results/ - ``` - pip install numpy - pip install pyomo - pip install pysam - pip install matplotlib - ``` +# Re-analyze from BAM +optitype run -i mapped.bam --dna -o results/ +``` -6. Create a new environment variable containing the path to your HDF5 -installation. It doesn't have to be permanent, but it has to be accessible -when you install PyTables. On the bash shell it would be -`export HDF5_DIR=/path/to/hdf5-1.8.15` +### Python API -7. Install PyTables, Pandas and Future with +```python +from optitype import run_hla_typing, HLATypingConfig - ``` - pip install tables - pip install pandas - pip install future - ``` +result = run_hla_typing( + fastq_files=["sample_1.fastq", "sample_2.fastq"], + seq_type="dna", + config=HLATypingConfig(solver="cbc", threads=4) +) -8. Create a configuration file following `config.ini.example`. You will find -all necessary instructions in there. 
OptiType will look for the configuration -file at `config.ini` in the same directory by default, but you can put it -anywhere and pass it with the `-c` option when running OptiType. +print(result.best_result) +# {'A1': 'A*02:01', 'A2': 'A*03:01', 'B1': 'B*07:02', ...} +``` +## Usage -Usage -------------- +``` +optitype run --help + +Usage: optitype run [OPTIONS] + + Run HLA typing analysis. + +Options: + -i, --input PATH Input FASTQ or BAM files (use multiple times for paired-end) + -r, --rna Input data is RNA sequencing + -d, --dna Input data is DNA sequencing (default) + -o, --outdir PATH Output directory for results (required) + -p, --prefix TEXT Output filename prefix (default: timestamp) + -b, --beta FLOAT Homozygosity detection parameter (0.0-0.1) + -e, --enumerate INT Number of solutions to enumerate + --solver [glpk|cbc|cplex] ILP solver to use + --razers3 PATH Path to RazerS3 binary + --threads INT Number of threads for mapping + -v, --verbose Enable verbose output + -c, --config PATH Path to config.ini file + --help Show this message and exit +``` -Optional step zero: you might want to filter your sequencing data for -HLA reads. Should you have to re-run OptiType multiple times on the same sample -(different settings, etc.) it could save you time if instead of giving OptiType -the full, multi-gigabyte sequencing data multiple times, you would rather give -it the relevant reads only, on the order of megabytes. +### Additional Commands -You can use any read mapper to do this step, although we suggest you use RazerS3. -Its only drawback is that due to way RazerS3 was designed, it loads all reads -into memory, which could be a problem on older, low-memory computing nodes. +```bash +# Check dependencies +optitype check-deps -Make sure to filter your files correctly depending on whether you have DNA -(exome, WGS) or RNA-Seq data. The reference fasta files are -`data/hla_reference_dna.fasta` and `data/hla_reference_rna.fasta` respectively. 
-Below is an example for DNA sequencing data: +# Generate config file +optitype init-config +# Show installation info +optitype info ``` ->razers3 -i 95 -m 1 -dr 0 -o fished_1.bam /path/to/OptiType/data/hla_reference_dna.fasta sample_1.fastq ->samtools bam2fq fished_1.bam > sample_1_fished.fastq +## Configuration ->rm fished_1.bam +Generate a config file with: + +```bash +optitype init-config -o config.ini ``` -If you have paired-end data, repeat this with the second ends' fastq file as well. -Note: it's important that you filter the two ends individually. Don't use the -read mapper's paired-end capabilities. +Key settings: -After the optional filtering, OptiType can be called as follows: -``` ->python /path/to/OptiTypePipeline.py -i sample_fished_1.fastq [sample_fished_2.fastq] - (--rna | --dna) [--beta BETA] [--enumerate N] - [-c CONFIG] [--verbose] --outdir /path/to/out_dir/ +```ini +[mapping] +threads=4 # Threads for read mapping + +[ilp] +solver=glpk # ILP solver: glpk, cbc, or cplex +threads=1 # Threads for ILP solver + +[behavior] +deletebam=true # Delete intermediate BAM files +unpaired_weight=0 # Weight for unpaired reads (0-1) ``` -This will produce a time-stamped directory inside the specified output directory -containing a CSV file with the predicted optimal (and if enumerated, sub-optimal) -HLA genotype, and a pdf file containing a coverage plot of the predicted alleles -for diagnostic purposes. 
+## Docker +```bash +docker pull fred2/optitype +docker run -v /path/to/data:/data -t fred2/optitype \ + -i /data/reads_1.fastq -i /data/reads_2.fastq --dna -o /data/results/ ``` ->python OptiTypePipeline.py --help - -usage: OptiType [-h] --input FQ [FQ] (--rna | --dna) [--beta B] - [--enumerate N] --outdir OUTDIR [--verbose] [--config CONFIG] - -OptiType: 4-digit HLA typer - -optional arguments: - -h, --help show this help message and exit - --input FQ [FQ], -i FQ [FQ] - .fastq file(s) (fished or raw) or .bam files stored - for re-use, generated by an earlier OptiType run. One - file: single-end mode, two files: paired-end mode. - --rna, -r Use with RNA sequencing data. - --dna, -d Use with DNA sequencing data. - --beta B, -b B The beta value for for homozygosity detection (see - paper). Default: 0.009. Handle with care. - --enumerate N, -e N Number of enumerations. OptiType will output the - optimal solution and the top N-1 suboptimal solutions - in the results CSV. Default: 1 - --outdir OUTDIR, -o OUTDIR - Specifies the out directory to which all files should - be written. - --prefix PREFIX, -p PREFIX - Specifies prefix of output files - --verbose, -v Set verbose mode on. - --config CONFIG, -c CONFIG - Path to config file. Default: config.ini in the same - directory as this script + +## Test Examples + +```bash +# DNA (paired-end) +optitype run \ + -i ./test/exome/NA11995_SRR766010_1_fished.fastq \ + -i ./test/exome/NA11995_SRR766010_2_fished.fastq \ + --dna -v -o ./test/exome/ + +# RNA (paired-end) +optitype run \ + -i ./test/rna/CRC_81_N_1_fished.fastq \ + -i ./test/rna/CRC_81_N_2_fished.fastq \ + --rna -v -o ./test/rna/ ``` -Furthermore, depending on your settings in `config.ini` you can choose to keep -the bam files OptiType produces when all-mapping reads against the reference: -these will be stored in the output directory of your current run. 
+## Output -Then, if you want to re-run OptiType on the same sample, you can give it those -intermediate `.bam` files as input instead of `.fastq` files, and spare on the -mapping part of the pipeline. Note: these `.bam` files have nothing to do with -those produced during the filtering/fishing step. +OptiType produces: +- `*_result.tsv` - HLA typing results +- `*_coverage_plot.pdf` - Coverage visualization +Example output: -Test examples -------------- -DNA data (paired end): ``` -python OptiTypePipeline.py -i ./test/exome/NA11995_SRR766010_1_fished.fastq ./test/exome/NA11995_SRR766010_2_fished.fastq --dna -v -o ./test/exome/ + A1 A2 B1 B2 C1 C2 Reads Objective +0 A*02:01 A*03:01 B*07:02 B*44:02 C*07:02 C*05:01 1234 1156.5 ``` -RNA data (paired end): -``` -python OptiTypePipeline.py -i ./test/rna/CRC_81_N_1_fished.fastq ./test/rna/CRC_81_N_2_fished.fastq --rna -v -o ./test/rna/ -``` +## Migration from v1.x -Contact -------------- -András Szolek -szolek@informatik.uni-tuebingen.de -University of Tübingen, Applied Bioinformatics, -Center for Bioinformatics, Quantitative Biology Center, -and Dept. of Computer Science, -Sand 14, 72076 Tübingen, Germany +Version 1.5 introduces a modernized CLI. Main changes: +- Install with `pip install optitype` +- Use `optitype run` instead of `python OptiTypePipeline.py` +- Multiple input files: use `-i file1 -i file2` instead of `-i file1 file2` +- Data bundled with package (no need to set paths) + +The core algorithm and output format remain unchanged. + +## Requirements + +- Python 3.10+ +- External: RazerS3, ILP solver (GLPK/CBC/CPLEX) + +## Reference -Reference -------------- Szolek, A, Schubert, B, Mohr, C, Sturm, M, Feldhahn, M, and Kohlbacher, O (2014). -OptiType: precision HLA typing from next-generation sequencing data -Bioinformatics, 30(23):3310-6. +**OptiType: precision HLA typing from next-generation sequencing data** +*Bioinformatics*, 30(23):3310-6. 
+[doi:10.1093/bioinformatics/btu548](https://doi.org/10.1093/bioinformatics/btu548) + +## Contact + +András Szolek +szolek@informatik.uni-tuebingen.de +University of Tübingen, Applied Bioinformatics diff --git a/data/alleles.h5 b/data/alleles.h5 deleted file mode 100644 index 0710a20..0000000 Binary files a/data/alleles.h5 and /dev/null differ diff --git a/data/alleles/features.parquet b/data/alleles/features.parquet new file mode 100644 index 0000000..92f9503 Binary files /dev/null and b/data/alleles/features.parquet differ diff --git a/data/alleles/table.parquet b/data/alleles/table.parquet new file mode 100644 index 0000000..cfb2f62 Binary files /dev/null and b/data/alleles/table.parquet differ diff --git a/hlatyper.py b/hlatyper.py deleted file mode 100644 index 0ff466a..0000000 --- a/hlatyper.py +++ /dev/null @@ -1,778 +0,0 @@ -from __future__ import division -from __future__ import print_function -from builtins import zip -from builtins import str -from builtins import map -from builtins import range -from past.utils import old_div -import pandas as pd -import numpy as np -import re -import pylab -import warnings -from collections import OrderedDict -from datetime import datetime -import sys - -try: - import pysam - PYSAM_AVAILABLE = True -except ImportError: - PYSAM_AVAILABLE = False - -VERBOSE = False - -def now(start=datetime.now()): - # function argument NOT to be ever set! It gets initialized at function definition time - # so we can calculate difference without further hassle. - return str(datetime.now()-start)[:-4] - - -def memoize(f): # from http://code.activestate.com/recipes/578231-probably-the-fastest-memoization-decorator-in-the-/ - class MemoDict(dict): - def __missing__(self, key): - ret = self[key] = f(key) - return ret - return MemoDict().__getitem__ - -# if we used MDINSHP=X chars this could be a full cigar parser, however, we only need M and D to -# determine a read's span on the reference sequence to create a coverage plot. 
-CIGAR_SLICER = re.compile(r'[0-9]+[MD]') - -@memoize -def length_on_reference(cigar_string): - return sum([int(p[:-1]) for p in re.findall(CIGAR_SLICER, cigar_string)]) - - -def feature_order(feature): - # to use it in sorted(..., key=feature_order) this returns: - # 0 for ('UTR', 1) - # 2 for exon, 1 - # 3 for intron, 1 - # 4 for exon, 2 - # 5 for intron, 2 - # ... - # 999 for UTR, 2 - feat_type, feat_number = feature - assert feat_type in ('intron', 'exon', 'UTR'), 'Unknown feature in list (function accepts intron/exon/UTR)' - assert isinstance(feat_number, int), 'Feature number has to be integer' - return 0 if feature==('UTR', 1) else 999 if feature==('UTR', 2) else (feat_number*2 + (feat_type=='intron')) - - -def store_dataframes(out_hdf, **kwargs): - # DataFrames to serialize have to be passed by keyword arguments. An argument matrix1=DataFrame(...) - # will be written into table 'matrix1' in the HDF file. - - complevel = kwargs.pop('complevel', 9) # default complevel & complib values if - complib = kwargs.pop('complib', 'zlib') # not explicitly asked for as arguments - - if VERBOSE: - print(now(), 'Storing %d DataFrames in file %s with compression settings %d %s...' % (len(kwargs), out_hdf, complevel, complib)) - - store = pd.HDFStore(out_hdf, complevel=complevel, complib=complib) # TODO: WRITE ONLY? it probably appends now - for table_name, dataframe in kwargs.items(): - store[table_name] = dataframe - store.close() - - if VERBOSE: - print(now(), 'DataFrames stored in file.') - - -def load_hdf(in_hdf, as_dict=False, *args): # isn't really neccesary, but makes a read-only flag on it to be sure - - store = pd.HDFStore(in_hdf, 'r') - if len(args): - if as_dict: - to_return = {table: store[table] for table in args} - else: - to_return = tuple((store[table] for table in args)) - store.close() - return to_return - else: - return store # store doesn't get closed! 
Either the user closes it manually or gets closed on exit - - -def sam_to_hdf(samfile): - if VERBOSE: - print(now(), 'Loading alleles and read IDs from %s...' % samfile) - - # run through the SAM file once to see how many reads and alleles we are dealing with - # for a one-step DataFrame initialization instead of slow growing - - read_ids, allele_ids = [], [] - first_hit_row = True - total_hits = 0 - - with open(samfile, 'rb') as f: - last_read_id = None - for line in f: - line = line.decode('utf-8') - if line.startswith('@'): - if line.startswith('@SQ'): - allele_ids.append(line.split('\t')[1][3:]) # SN:HLA:HLA00001 - continue - - total_hits += 1 - read_id = line.split('\t')[0] - if last_read_id != read_id: - read_ids.append(read_id) - last_read_id = read_id - - if first_hit_row: # analyze SAM file structure and find MD tag column (describes mismatches). - first_hit_row = False - columns = line.split() - try: - nm_index = list(map(lambda x: x.startswith('NM:'), columns)).index(True) - except ValueError: - # TODO: we don't really handle the case if NM-tag is not present, code will fail later - print('\tNo NM-tag found in SAM file!') - nm_index = None - - if VERBOSE: - print(now(), '%d alleles and %d reads found.' % (len(allele_ids), len(read_ids))) - print(now(), 'Initializing mapping matrix...') - - # major performance increase if we initialize a numpy zero-matrix and pass that in the constructor - # than if we just let pandas initialize its default NaN matrix - matrix_pos = pd.DataFrame(np.zeros((len(read_ids), len(allele_ids)), dtype=np.uint16), columns=allele_ids, index=read_ids) - - # read_details contains NM and read length tuples, calculated from the first encountered CIGAR string. - read_details = OrderedDict() - - if VERBOSE: - print(now(), '%dx%d mapping matrix initialized. Populating %d hits from SAM file...' 
% (len(read_ids), len(allele_ids), total_hits)) - - milestones = [x * total_hits / 10 for x in range(1, 11)] # for progress bar - - with open(samfile, 'rb') as f: - counter = 0 - percent = 0 - for line in f: - line = line.decode('utf-8') - if line.startswith('@'): - continue - - fields = line.strip().split('\t') - read_id, allele_id, position, cigar, nm = (fields[i] for i in (0, 2, 3, 5, nm_index)) - - if read_id not in read_details: - read_details[read_id] = (int(nm[5:]), length_on_reference(cigar)) - - matrix_pos[allele_id][read_id] = int(position) # SAM indexes from 1, so null elements are not hits - - counter += 1 - if counter in milestones: - percent += 10 - if VERBOSE: - print('\t%d%% completed' % percent) - if VERBOSE: - print(now(), '%d elements filled. Matrix sparsity: 1 in %.2f' % (counter, matrix_pos.shape[0]*matrix_pos.shape[1]/float(counter))) - - # convert HLA:HLA00001 identifiers to HLA00001 - matrix_pos.rename(columns=lambda x: x.replace('HLA:', ''), inplace=True) - - details_df = pd.DataFrame.from_dict(read_details, orient='index') - details_df.columns = ['mismatches', 'read_length'] - - return matrix_pos, details_df - - -def pysam_to_hdf(samfile): - if not PYSAM_AVAILABLE: - print("Warning: PySam not available on the system. Falling back to primitive SAM parsing.") - return sam_to_hdf(samfile) - - sam_or_bam = 'rb' if samfile.endswith('.bam') else 'r' - sam = pysam.AlignmentFile(samfile, sam_or_bam) - is_yara = (sam.header['PG'][0]['ID'] in ('Yara', 'yara')) - # If yara produced the sam/bam file, we need to know in what form to expect secondary alignments. - # If the -os flag was used in the call, they are proper secondary alignments. Otherwise, they are - # one line per read with a long XA custom tag containing alternative hits. 
- xa_tag = is_yara and (' -os ' not in sam.header['PG'][0]['CL']) - - nref = sam.nreferences - hits = OrderedDict() - - allele_id_to_index = {aa: ii for ii, aa in enumerate(sam.references)} - - # read_details contains NM and read length tuples. We ignore length on reference from now on. - # Not worth the effort and calculation. Indels are rare and they have to be dealt with differently. The coverage - # plot wil still be fine and +-1 bp regarding read end isn't concerning. - read_details = OrderedDict() - - if VERBOSE: - print(now(), 'Loading %s started. Number of HLA reads loaded (updated every thousand):' % samfile) - - read_counter = 0 - hit_counter = 0 - - for aln in sam: - # TODO: we could spare on accessing hits[aln.qname] if we were guaranteed alignments of one read come in batches - # and never mix. Most aligners behave like this but who knows... - if aln.qname not in hits: # for primary alignments (first hit of read) - # not using defaultdict because we have other one-time things to do when new reads come in anyway - hits[aln.qname] = np.zeros(nref, dtype=np.uint16) # 16 bits are enough for 65K positions, enough for HLA. - # TODO: as soon as we don't best-map but let in suboptimal alignments, this below is not good enough, - # as we need to store NM for every read-allele pair. - # and then there are cases (usually 1D/1I artefacts at the end) where aligned reference length isn't the same - # for all alignments of a read. How to handle that? Maybe needless if we re-map reads anyway. - read_details[aln.qname] = (aln.get_tag('NM'), aln.query_length) # aln.reference_length it used to be. Soft-trimming is out of question now. - read_counter += 1 - if VERBOSE and not (read_counter % 1000): - sys.stdout.write('%dK...' 
% (old_div(len(hits),1000))) - sys.stdout.flush() - if xa_tag and aln.has_tag('XA'): - current_row = hits[aln.qname] # we may access this hundreds of times, better do it directly - subtags = aln.get_tag('XA').split(';')[:-1] - hit_counter += len(subtags) - for subtag in subtags: - allele, pos = subtag.split(',')[:2] # subtag is like HLA02552,691,792,-,1 (id, start, end, orient, nm) - current_row[allele_id_to_index[allele]] = int(pos) # 1-based positions - - # this runs for primary and secondary alignments as well: - hits[aln.qname][aln.reference_id] = aln.reference_start + 1 # pysam reports 0-based positions - hit_counter += 1 - # num_mismatches = aln.get_tag('NM') # if we ever need suboptimal alignments... doubtful. - - if VERBOSE: - print('\n', now(), len(hits), 'reads loaded. Creating dataframe...') - pos_df = pd.DataFrame.from_dict(hits, orient='index') - pos_df.columns = sam.references[:] - details_df = pd.DataFrame.from_dict(read_details, orient='index') - details_df.columns = ['mismatches', 'read_length'] - if VERBOSE: - print(now(), 'Dataframes created. Shape: %d x %d, hits: %d (%d), sparsity: 1 in %.2f' % ( - pos_df.shape[0], pos_df.shape[1], np.sign(pos_df).sum().sum(), hit_counter, pos_df.shape[0]*pos_df.shape[1]/float(hit_counter) - )) # TODO: maybe return the binary here right away if we're using it to calculate density anyway. - return pos_df, details_df - - -def get_compact_model(hit_df, weak_hit_df=None, weight=None): -# turn a binary hit matrix dataframe into a smaller matrix DF that removes duplicate rows and -# creates the "occurence" vector with the number of rows the representative read represents. -# Note: one can pass "weak" hits (e.g., unpaired reads) and use them with a lower weight. 
- - hit_df = hit_df.loc[hit_df.any(axis=1)] # remove all-zero rows - occurence = {r[0]: len(r) for r in hit_df.groupby(hit_df.columns.tolist()).groups.values()} - - if weak_hit_df is not None: - weak_hit_df = weak_hit_df.loc[weak_hit_df.any(axis=1)] - assert 0 < weight <= 1, 'weak hit weight must be in (0, 1]' - weak_occ = {r[0]: len(r)*weight for r in weak_hit_df.groupby(weak_hit_df.columns.tolist()).groups.values()} - occurence.update(weak_occ) - unique_mtx = pd.concat([hit_df.drop_duplicates(), weak_hit_df.drop_duplicates()]) - else: - unique_mtx = hit_df.drop_duplicates() - - return unique_mtx, occurence - - -def mtx_to_sparse_dict(hit_df): - # takes a hit matrix and creates a dictionary of (read, allele):1 tuples corresponding to hits - # (needed by OptiType) - all_hits = {} - for read_id, alleles in hit_df.iterrows(): - hit_alleles = alleles[alleles!=0].index - for hit_allele in hit_alleles: - all_hits[(read_id, hit_allele)] = 1 - return all_hits - #return {(read, allele): 1 for read in hit_df.index for allele in hit_df.columns if hit_df[allele][read]>0} # awfully inefficient - - -def create_allele_dataframes(imgt_dat, fasta_gen, fasta_nuc): - from Bio import SeqIO - if VERBOSE: - print(now(), 'Loading IMGT allele dat file...') - - alleles = OrderedDict() - - with open(imgt_dat, 'rU') as handle: - for i, record in enumerate(SeqIO.parse(handle, "imgt")): - # TODO: IMGT has changed the ID system. Now it's HLA00001.1 or HLA12345.2 showing versions. I'll get rid of this now though - record.id = record.id.split('.')[0] - alleles[record.id] = record - - if VERBOSE: - print(now(), 'Initializing allele DataFrame...') - - ''' - id HLA000001 - type A*01:01:01:01 - locus A - class I --- I, II, TAP, MIC, other - flags 0 --- problems with allele stored here, see below - len_gen 3503 - len_nuc 1098 - full_gen 1 - full_nuc 1 - - flagging: sum of the below codes - +1 if HLA type ends in a letter (N, Q, etc.) 
- +2 if CDS annotation != exon annotation - +4 if features don't add up to gen sequence - +8 if exons don't add up to nuc sequence - ''' - - allele_info = 'id type 4digit locus flags len_dat len_gen len_nuc full_gen full_nuc' - - table = pd.DataFrame(index=list(alleles.keys()), columns=allele_info.split()) - sequences = [] - - if VERBOSE: - print(now(), 'Filling DataFrame with allele data...') - - all_features = [] # contains tuples: (HLA id, feature type, feature number, feature start, feature end) - - for allele in alleles.values(): - - allele_type = allele.description.replace('HLA-', '').split(',')[0] - - table.loc[allele.id]['id'] = allele.id - table.loc[allele.id]['type'] = allele_type - table.loc[allele.id]['4digit'] = ':'.join(allele_type.split(':')[:2]) - table.loc[allele.id]['locus'] = allele_type.split('*')[0] - table.loc[allele.id]['flags'] = 0 if allele_type[-1].isdigit() else 1 - table.loc[allele.id]['len_dat'] = len(str(allele.seq)) - table.loc[allele.id]['len_gen'] = 0 # TODO: IT STILL DOESNT SOLVE PERFORMANCEWARNING! 
- table.loc[allele.id]['len_nuc'] = 0 # we initialize these nulls so that we don't get a - table.loc[allele.id]['full_gen'] = 0 # PerformanceWarning + pickling when storing HDF - table.loc[allele.id]['full_nuc'] = 0 # because of NaNs (they don't map to ctypes) - sequences.append((allele.id, 'dat', str(allele.seq))) - - - # number of features in 2013-04-30 hla.dat: - # 9296 source (total # of alleles) - # 9291 CDS, 5 gene (HLA-P pseudogenes) - # 24493 exons, 3697 introns, 1027 UTRs - # CDS matches exons in 99+% of cases, some have a single base-pair extension at the end for some - # weird single-base exons or such (all on unimportant pseudogene loci) - # so we extract exons, introns and UTRs - features = [f for f in allele.features if f.type in ('exon', 'intron', 'UTR')] - - for feature in features: - if feature.type in ('exon', 'intron'): - feature_num = int(feature.qualifiers['number'][0]) - else: # UTR - # UTR either starts at the beginning or ends at the end - assert feature.location.start == 0 or feature.location.end == len(allele.seq) - feature_num = 1 if feature.location.start == 0 else 2 # 1 if 5' UTR, 2 if 3' UTR - - all_features.append( - (allele.id, feature.type, feature_num, - int(feature.location.start), int(feature.location.end), len(feature), feature_order((feature.type, feature_num))) - ) - - # small sanity check, can be commented out - cds = [f for f in allele.features if f.type == 'CDS'] - if cds: - if sum(map(len, [f for f in features if f.type == 'exon'])) != len(cds[0]): - if VERBOSE: - print("\tCDS length doesn't match sum of exons for", allele.id, allele_type) - table.loc[allele.id]['flags'] += 2 - else: - if VERBOSE: - print("\tNo CDS found for", allele.id, allele_type) - table.loc[allele.id]['flags'] += 2 - - if VERBOSE: - print(now(), 'Loading gen and nuc files...') - - with open(fasta_gen, 'r') as fasta_gen: - for record in SeqIO.parse(fasta_gen, 'fasta'): - allele_id = record.id.replace('HLA:', '') - table.loc[allele_id]['len_gen'] = 
len(record.seq) - sequences.append((allele_id, 'gen', str(record.seq))) - - with open(fasta_nuc, 'r') as fasta_nuc: - for record in SeqIO.parse(fasta_nuc, 'fasta'): - allele_id = record.id.replace('HLA:', '') - table.loc[allele_id]['len_nuc'] = len(record.seq) - sequences.append((allele_id, 'nuc', str(record.seq))) - - # convert list of tuples into DataFrame for features and sequences - all_features = pd.DataFrame(all_features, columns=['id', 'feature', 'number', 'start', 'end', 'length', 'order']) - sequences = pd.DataFrame(sequences, columns=['id', 'source', 'sequence']) - - - joined = pd.merge(table, all_features, how='inner', on='id') - - exons_for_locus = {} - - for i_locus, i_group in joined.groupby('locus'): - exons_for_locus[i_locus] = i_group[i_group['feature']=='exon']['number'].max() - - # for i_id, i_group in joined.groupby('id'): - # max_exons = exons_for_locus[table.loc[i_id]['locus']] - # if len(i_group) >= 2*max_exons-1: - # print i_id, 'is fully annotated on all exons and introns and UTRs' - - if VERBOSE: - print(now(), 'Checking dat features vs gen/nuc sequences...') - - for allele, features in joined.groupby('id'): - row = features.irow(0) # first row of the features subtable. Contains all allele information because of the join - sum_features_length = features['length'].sum() - sum_exons_length = features.loc[features['feature']=='exon']['length'].sum() - if row['len_gen']>0 and row['len_gen'] != sum_features_length: - if VERBOSE: - print("\tFeature lengths don't add up to gen sequence length", allele, row['len_gen'], sum_features_length, row['type']) - table.loc[allele]['flags'] += 4 - if row['len_nuc']>0 and row['len_nuc'] != sum_exons_length: - if VERBOSE: - print("\tExon lengths don't add up to nuc sequence length", allele, row['len_nuc'], sum_exons_length, row['type']) - table.loc[allele]['flags'] += 8 - - if VERBOSE: - print(now(), 'Sanity check finished. 
Computing feature sequences...') - - ft_seq_lookup = OrderedDict() - ft_seq_lookup['---DUMMY---'] = 0 # it will be useful later on if 0 isn't used. lookup_id*boolean operation, etc. - ft_counter = 1 - all_ft_counter = 0 - all_features['seq_id'] = 0 - for i_id, i_features in all_features.groupby('id'): - seq = sequences.loc[(sequences['id']==i_id) & (sequences['source']=='dat')].irow(0)['sequence'] - for ft_idx, feature in i_features.iterrows(): - ft_seq = seq[feature['start']:feature['end']] - all_ft_counter += 1 - if ft_seq not in ft_seq_lookup: - ft_seq_lookup[ft_seq] = ft_counter - all_features.loc[ft_idx, 'seq_id'] = ft_counter - ft_counter += 1 - else: - all_features.loc[ft_idx, 'seq_id'] = ft_seq_lookup[ft_seq] - - feature_sequences = pd.DataFrame([seq for seq in list(ft_seq_lookup.keys())], columns=['sequence']) # , index=ft_seq_lookup.values() but it's 0,1,2,... anyway, as the default - - return table, all_features, sequences, feature_sequences - - -def prune_identical_alleles(binary_mtx, report_groups=False): - # return binary_mtx.transpose().drop_duplicates().transpose() - # # faster: - hash_columns = binary_mtx.transpose().dot(np.random.rand(binary_mtx.shape[0])) # dtype np.uint16 okay here because result will be float64 - if report_groups: - grouper = hash_columns.groupby(hash_columns) - groups = {g[1].index[0]: g[1].index.tolist() for g in grouper} - alleles_to_keep = hash_columns.drop_duplicates().index # relying on keeping the first representative - # TODO: maybe return sets of alleles that were collapsed into the representative (identical patterned sets) - return binary_mtx[alleles_to_keep] if not report_groups else (binary_mtx[alleles_to_keep], groups) - - -def prune_identical_reads(binary_mtx): - # almost the same as ht.get_compact_model() except it doesn't return an occurence vector. - # It should only be used to compactify a matrix before passing it to the function finding - # overshadowed alleles. 
Final compactifying should be later done on the original binary matrix. - # - # return binary_mtx.drop_duplicates() - # # faster: - reads_to_keep = binary_mtx.dot(np.random.rand(binary_mtx.shape[1])).drop_duplicates().index # dtype np.uint16 okay because result will be float64 - return binary_mtx.loc[reads_to_keep] - - -def prune_overshadowed_alleles(binary_mtx): - # Calculates B_T*B of the (pruned) binary matrix to determine if certain alleles "overshadow" others, i.e. - # have the same hits as other alleles plus more. In this case, these "other alleles" can be thrown out early, as - # they would be never chosen over the one overshadowing them. - # - # For a 1000reads x 3600alleles matrix it takes 15 seconds. - # So you should always prune identical columns and rows before to give it a matrix as small as possible. - # So ideal usage: - # prune_overshadowed_alleles(prune_identical_alleles(prune_identical_reads(binary_mtx))) - - # np.dot() is prone to overflow and doesn't expand to int64 like np.sum(). Our binary matrices are np.uint16. - # If we have less than 65K rows in the binary mtx, we're good with uint16. We're also good if there's no - # column with 65K+ hits. We check for both with lazy 'or' to avoid calculating the sum if not needed anyway. - # If we would overflow, we change to uint32. - if (binary_mtx.shape[0] < np.iinfo(np.uint16).max) or (binary_mtx.sum(axis=0).max() < np.iinfo(np.uint16).max): - # In case we would reduce the binary mtx to np.uint8 we should to ensure it's at least uint16. 
- bb = binary_mtx if all(binary_mtx.dtypes == np.uint16) else binary_mtx.astype(np.uint16) - else: - bb = binary_mtx.astype(np.uint32) - - covariance = bb.transpose().dot(bb) - diagonal = pd.Series([covariance[ii][ii] for ii in covariance.columns], index=covariance.columns) - new_covariance = covariance[covariance.columns] - for ii in new_covariance.columns: - new_covariance[ii][ii] = 0 - overshadowed = [] - for ii in new_covariance.columns: - potential_superiors = new_covariance[ii][new_covariance[ii]==diagonal[ii]].index - if any(diagonal[potential_superiors] > diagonal[ii]): - overshadowed.append(ii) - non_overshadowed = covariance.columns.difference(overshadowed) - return non_overshadowed - -def create_paired_matrix(binary_1, binary_2, id_cleaning=None): - # id_cleaning is a function object that turns a read ID that contains pair member information (like XYZ_1234:5678/1) into an - # ID that identifies the pair itself (like XYZ_1234:5678) so we can match the two DF's IDs. In the above case a suitable - # id_cleaning function would be lambda x: x[:-2] (gets rid of /1 and /2 from the end). Sometimes it's trickier as pair member - # information can be somewhere in the middle, and sometimes it's not even required at all as both pair members have the same ID - # just in two different files (1000 genomes). 
- - if id_cleaning is not None: - binary_1.index = list(map(id_cleaning, binary_1.index)) - binary_2.index = list(map(id_cleaning, binary_2.index)) - - common_read_ids = binary_1.index.intersection(binary_2.index) - only_1 = binary_1.index.difference(binary_2.index) - only_2 = binary_2.index.difference(binary_1.index) - - b_1 = binary_1.loc[common_read_ids] - b_2 = binary_2.loc[common_read_ids] - b_12 = b_1 * b_2 # elementwise AND - b_ispaired = b_12.any(axis=1) # reads with at least one allele w/ paired hits - b_paired = b_12.loc[b_ispaired] - b_mispaired = b_1.loc[~b_ispaired] + b_2.loc[~b_ispaired] # elementwise AND where two ends only hit different alleles - b_unpaired = pd.concat([binary_1.loc[only_1], binary_2.loc[only_2]]) # concatenation for reads w/ just one end mapping anywhere - - if VERBOSE: - print(now(), ('Alignment pairing completed. %d paired, %d unpaired, %d discordant ' % - (b_paired.shape[0], b_unpaired.shape[0], b_mispaired.shape[0]))) - - return b_paired, b_mispaired, b_unpaired - - -def get_features(allele_id, features, feature_list): - # allele_id can be like HLA12345_HLA67890: then we take features from HLA12345 first if possible, then from HLA67890 - # for sequence reconstruction (in that case HLA67890 should be a nearest neighbor of HLA12345) - if '_' in allele_id: - partial_allele, complete_allele = allele_id.split('_') - else: - complete_allele = allele_id - - feats_complete = {(of['feature'], of['number']): of for _, of in features.loc[features['id']==complete_allele].iterrows()} - feats_partial = {(of['feature'], of['number']): of for _, of in features.loc[features['id']==partial_allele].iterrows()} if '_' in allele_id else feats_complete - - feats_to_include = [] - - for feat in sorted(feature_list, key=feature_order): - if feat in feats_partial: - feats_to_include.append(feats_partial[feat]) - elif feat in feats_complete: - feats_to_include.append(feats_complete[feat]) - else: - warnings.warn('Feature %s not found for allele %s' % 
(feat, allele_id)) - - return pd.DataFrame(feats_to_include) - - -def calculate_coverage(alignment, features, alleles_to_plot, features_used): - assert len(alignment) in (2, 4, 5), ("Alignment tuple either has to have 2, 4 or 5 elements. First four: pos, read_details " - "once or twice depending on single or paired end, and an optional binary DF at the end for PROPER paired-end plotting") - has_pairing_info = (len(alignment) == 5) - - if len(alignment) == 2: - matrix_pos, read_details = alignment[:2] - pos = matrix_pos[alleles_to_plot] - pairing = np.sign(pos) * 2 # see explanation on the else branch. These mean unpaired hits. - hit_counts = np.sign(pos).sum(axis=1) # how many of the selected alleles does a particular read hit? (unambiguous hit or not) - # if only a single allele is fed to this function that has no hits at all, we still want to create a null-matrix - # for it. Its initialization depends on max_ambiguity (it's the size of one of its dimensions) so we set this to 1 at least. - max_ambiguity = max(1, hit_counts.max()) - to_process = [(pos, read_details, hit_counts, pairing)] - elif len(alignment) == 4: # TODO: deprecated. We don't use this, it should probably be taken out. 
- matrix_pos1, read_details1, matrix_pos2, read_details2 = alignment - pos1 = matrix_pos1[alleles_to_plot] - pos2 = matrix_pos2[alleles_to_plot] - pairing1 = np.sign(pos1) * 2 # every read is considered unpaired - pairing2 = np.sign(pos2) * 2 - hit_counts1 = np.sign(pos1).sum(axis=1) - hit_counts2 = np.sign(pos2).sum(axis=1) - max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max()) - to_process = [(pos1, read_details1, hit_counts1, pairing1), (pos2, read_details2, hit_counts2, pairing2)] - else: - matrix_pos1, read_details1, matrix_pos2, read_details2, pairing_binaries = alignment - bin_p, bin_u, bin_m = pairing_binaries - pos1 = matrix_pos1[alleles_to_plot] - pos2 = matrix_pos2[alleles_to_plot] - # pairing's values - 0: no hit, 1: paired hit, 2: unpaired hit, 3: discordantly paired hit - pairing = pd.concat([bin_p[alleles_to_plot], bin_u[alleles_to_plot]*2, bin_m[alleles_to_plot]*3]) - pairing1 = pairing.loc[pos1.index] # pairing information for first ends' hit matrix - pairing2 = pairing.loc[pos2.index] # pairing information for second ends' hit matrix - hit_counts1 = np.sign(pairing1).sum(axis=1) - hit_counts2 = np.sign(pairing2).sum(axis=1) - max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max()) - to_process = [(pos1, read_details1, hit_counts1, pairing1), (pos2, read_details2, hit_counts2, pairing2)] - - coverage_matrices = [] - - for allele in alleles_to_plot: - allele_features = get_features(allele, features, features_used) - allele_length = allele_features['length'].sum() - - # Dimensions: - # 1st - 0: perfect hit, 1: hit w/ mismatch(es) - # 2nd - 0: paired, 1: unpaired, 2: discordantly paired # maybe introduce unpaired despite being paired on other alleles - # 3rd - 0: unique hit, 1: ambiguous hit between two alleles, ... n: ambiguous bw/ n+1 alleles - # 4th - 0 ... 
[sequence length] - coverage = np.zeros((2, 3, max_ambiguity, allele_length), dtype=int) - - # to_process is a list containing tuples with mapping/position/hit property dataframes to superimpose. - # For single-end plotting to_process has just one element. For paired end, to_process has two elements that - # were pre-processed to only plot reads where both pairs map, etc. but the point is that superimposing the two - # will provide the correct result. - - for pos, read_details, hit_counts, pairing_info in to_process: - reads = pos[pos[allele]!=0].index # reads hitting allele: coverage plot will be built using these - for i_pos, i_read_length, i_mismatches, i_hitcount, i_pairing in zip( - pos.loc[reads][allele], - read_details.loc[reads]['read_length'], - read_details.loc[reads]['mismatches'], - hit_counts[reads], - pairing_info.loc[reads][allele]): - if not i_pairing: - continue # or i_pairing = 4. Happens if one end maps to the allele but a different allele has a full paired hit. - coverage[int(bool(i_mismatches))][i_pairing-1][i_hitcount-1][i_pos-1:i_pos-1+i_read_length] += 1 - - coverage_matrices.append((allele, coverage)) - return coverage_matrices - - -def plot_coverage(outfile, coverage_matrices, allele_data, features, features_used, columns=2): - - def start_end_zeros(cov_array): - # the areaplot polygon gets screwed up if the series don't end/start with zero - # this adds a zero to the sides to circumvent that - return np.append(np.append([0], cov_array), [0]) - - def allele_sorter(allele_cov_mtx_tuple): - allele, _ = allele_cov_mtx_tuple - return allele_data.loc[allele.split('_')[0]]['type'] - - def get_allele_locus(allele): - return allele_data.loc[allele.split('_')[0]]['locus'] - - number_of_loci = len(set((get_allele_locus(allele) for allele, _ in coverage_matrices))) - - dpi = 50 - box_size = (7, 1) - - # subplot_rows = len(coverage_matrices)/columns + bool(len(coverage_matrices) % columns) # instead of float division + rounding up - # subplot_rows = 3 
# TODO - subplot_rows = 3 * number_of_loci + 1 # rowspan=3 for each plot, legend at the bottom with third height and colspan=2 - - area_colors = [ # stacked plot colors - (0.26, 0.76, 0.26), # perfect, paired, unique - (0.40, 0.84, 0.40), # perfect, paired, shared - - (0.99, 0.75, 0.20), # perfect, unpaired, unique - (0.99, 0.75, 0.20), # perfect, mispaired, unique - (0.99, 0.85, 0.35), # perfect, unpaired, shared - (0.99, 0.85, 0.35), # perfect, mispaired, shared - - (0.99, 0.23, 0.23), # mismatch, paired, unique - (0.99, 0.49, 0.49), # mismatch, paired, shared - - (0.14, 0.55, 0.72), # mismatch, unpaired, unique - (0.14, 0.55, 0.72), # mismatch, mispaired, unique - (0.33, 0.70, 0.88), # mismatch, unpaired, shared - (0.33, 0.70, 0.88)] # mismatch, mispaired, shared - - figure = pylab.figure(figsize=(box_size[0]*columns, box_size[1]*subplot_rows), dpi=dpi) # TODO: dpi doesn't seem to do shit. Is it stuck in 100? - - coverage_matrices = sorted(coverage_matrices, key=allele_sorter) # so that the A alleles come first, then B, and so on. - prev_locus = '' - i_locus = -1 - - for allele, coverage in coverage_matrices: - - if '_' in allele: - partial, complete = allele.split('_') - plot_title = '%s (introns from %s)' % (allele_data.loc[partial]['type'], allele_data.loc[complete]['type']) # , allele for debugging (original ID) - else: - plot_title = allele_data.loc[allele]['type'] # + allele for debugging original ID - - if prev_locus != get_allele_locus(allele): # new locus, start new row - i_locus += 1 - i_allele_in_locus = 0 - else: - i_allele_in_locus = 1 - - prev_locus = get_allele_locus(allele) - - plot = pylab.subplot2grid((subplot_rows, columns), (3*i_locus, i_allele_in_locus), rowspan=3, adjustable='box') - - _, _, max_ambig, seq_length = coverage.shape # first two dimensions known (mismatched[2], pairing[3]) - - shared_weighting = np.reciprocal(np.arange(max_ambig)+1.0) # --> 1, 1/2, 1/3... 
- shared_weighting[0] = 0 # --> 0, 1/2, 1/3, so the unique part doesn't get mixed in - - perfect_paired_unique = start_end_zeros(coverage[0][0][0]) - mismatch_paired_unique = start_end_zeros(coverage[1][0][0]) - perfect_unpaired_unique = start_end_zeros(coverage[0][1][0]) - mismatch_unpaired_unique = start_end_zeros(coverage[1][1][0]) - perfect_mispaired_unique = start_end_zeros(coverage[0][2][0]) - mismatch_mispaired_unique = start_end_zeros(coverage[1][2][0]) - - perfect_paired_shared = start_end_zeros(shared_weighting.dot(coverage[0][0])) - mismatch_paired_shared = start_end_zeros(shared_weighting.dot(coverage[1][0])) - perfect_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][1])) - mismatch_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][1])) - perfect_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][2])) - mismatch_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][2])) - - # Exon annotation - i_start = 1 # position of last feature's end. 
It's one because we padded with zeros above - for _, ft in get_features(allele, features, features_used).iterrows(): - if ft['feature'] == 'exon': - plot.axvspan(i_start, i_start + ft['length'], facecolor='black', alpha=0.1, linewidth=0, zorder=1) - i_start += ft['length'] - - - areas = plot.stackplot(np.arange(seq_length+2), # seq_length+2 because of the zero padding at either end - perfect_paired_unique + 0.001, # so that 0 isn't -inf on the logplot, but still below cutoff - perfect_paired_shared, - - perfect_unpaired_unique, - perfect_mispaired_unique, - perfect_unpaired_shared, - perfect_mispaired_shared, - - mismatch_paired_unique, - mismatch_paired_shared, - - mismatch_unpaired_unique, - mismatch_mispaired_unique, - mismatch_unpaired_shared, - mismatch_mispaired_shared, - - linewidth=0, colors=area_colors, zorder=5) - - for aa in areas: - # if you output to pdf it strangely doesn't respect linewidth=0 perfectly, you'll have - # to set line colors identical to area color to avoid having a black line between them - aa.set_edgecolor(aa.get_facecolor()) - - plot.tick_params(axis='both', labelsize=10, direction='out', which='both', top=False) - - plot.text(.015, 0.97, plot_title, horizontalalignment='left', verticalalignment='top', transform=plot.transAxes, fontsize=10, zorder=6) - _, _, _, y2 = plot.axis() - plot.axis((0, seq_length, 1, y2)) # enforce y axis minimum at 10^0. 
This corresponds to zero coverage because of the +1 above - plot.set_yscale('log') - plot.set_ylim(bottom=0.5) - - legend = pylab.subplot2grid((subplot_rows, columns), (subplot_rows-1, 0), colspan=2, adjustable='box') - ppp = pylab.matplotlib.patches - legend.add_patch(ppp.Rectangle((0, 2), 2, 2, color=area_colors[0])) - legend.add_patch(ppp.Rectangle((0, 0), 2, 2, color=area_colors[1])) - legend.add_patch(ppp.Rectangle((25, 2), 2, 2, color=area_colors[2])) - legend.add_patch(ppp.Rectangle((25, 0), 2, 2, color=area_colors[4])) - legend.add_patch(ppp.Rectangle((50, 2), 2, 2, color=area_colors[6])) - legend.add_patch(ppp.Rectangle((50, 0), 2, 2, color=area_colors[7])) - legend.add_patch(ppp.Rectangle((75, 2), 2, 2, color=area_colors[8])) - legend.add_patch(ppp.Rectangle((75, 0), 2, 2, color=area_colors[10])) - legend.text( 2.5, 3, 'paired, no mismatches, unique', va='center', size='smaller') - legend.text( 2.5, 1, 'paired, no mismatches, ambiguous', va='center', size='smaller') - legend.text(27.5, 3, 'unpaired, no mismatches, unique', va='center', size='smaller') - legend.text(27.5, 1, 'unpaired, no mismatches, ambiguous', va='center', size='smaller') - legend.text(52.5, 3, 'paired, mismatched, unique', va='center', size='smaller') - legend.text(52.5, 1, 'paired, mismatched, ambiguous', va='center', size='smaller') - legend.text(77.5, 3, 'unpaired, mismatched, unique', va='center', size='smaller') - legend.text(77.5, 1, 'unpaired, mismatched, ambiguous', va='center', size='smaller') - legend.set_xlim(0, 100) - legend.set_ylim(0, 4) - legend.axison = False - - figure.tight_layout() - figure.savefig(outfile) diff --git a/model.py b/model.py deleted file mode 100644 index f6ce188..0000000 --- a/model.py +++ /dev/null @@ -1,496 +0,0 @@ -""" -Created on Jul 19, 2013 - - -This class represents the OptiType model for HLA typing based on NGS data - -It is dependent on Coopr and uses an external ILP solver such as GLPK or CPLEX - - - -@author: Benjamin Schubert -""" - -from 
__future__ import division -from __future__ import print_function -from builtins import str -from builtins import range -from builtins import object -from pyomo.environ import ConcreteModel, Set, Param, Var, Binary, Objective, Constraint, ConstraintList, maximize -from pyomo.opt import SolverFactory, TerminationCondition -from collections import defaultdict -import pandas as pd -import itertools - - -class OptiType(object): - """ - classdocs - - """ - - def __init__(self, cov, occ, groups_4digit, allele_table, beta, t_max_allele=2, solver="glpk", threads=1, - verbosity=0): - """ - Constructor - """ - - self.__allele_table = allele_table - self.__beta = float(beta) - self.__t_max_allele = t_max_allele - self.__solver = SolverFactory(solver) - self.__threads = threads - self.__opts = {"threads": threads} if threads > 1 else {} - self.__verbosity = verbosity - self.__changed = True # model needs to know if it changed from last run or not - self.__ks = 1 - self.__groups_4digit = groups_4digit - - loci_alleles = defaultdict(list) - for type_4digit, group_alleles in groups_4digit.items(): - # print type_4digit, group_alleles - loci_alleles[type_4digit.split('*')[0]].extend(group_alleles) - - loci = loci_alleles - - self.__allele_to_4digit = {allele: type_4digit for type_4digit, group in groups_4digit.items() for allele in - group} - - ''' - generates the basic ILP model - ''' - - model = ConcreteModel() - - # init Sets - model.LociNames = Set(initialize=list(loci.keys())) - model.Loci = Set(model.LociNames, initialize=lambda m, l: loci[l]) - - L = list(itertools.chain(*list(loci.values()))) - reconst = {allele_id: 0.01 for allele_id in L if '_' in allele_id} - R = set([r for (r, _) in list(cov.keys())]) - model.L = Set(initialize=L) - model.R = Set(initialize=R) - - # init Params - model.cov = Param(model.R, model.L, initialize=lambda model, r, a: cov.get((r, a), 0)) - model.reconst = Param(model.L, initialize=lambda model, a: reconst.get(a, 0)) - - model.occ = 
Param(model.R, initialize=occ) - model.t_allele = Param(initialize=self.__t_max_allele, mutable=True) - - model.beta = Param(initialize=self.__beta, - validate=lambda val, model: 0.0 <= float(self.__beta) <= 0.999, - mutable=True) - model.nof_loci = Param(initialize=len(loci)) - - # init variables - model.x = Var(model.L, domain=Binary) - model.y = Var(model.R, domain=Binary) - - model.re = Var(model.R, bounds=(0.0, None)) - model.hetero = Var(bounds=(0.0, model.nof_loci)) - - # init objective - model.read_cov = Objective( - rule=lambda model: sum(model.occ[r] * (model.y[r] - model.beta * (model.re[r])) for r in model.R) - sum( - model.reconst[a] * model.x[a] for a in model.L), sense=maximize) - - # init Constraints - model.max_allel_selection = Constraint(model.LociNames, rule=lambda model, l: sum( - model.x[a] for a in model.Loci[l]) <= model.t_allele) - model.min_allel_selection = Constraint(model.LociNames, - rule=lambda model, l: sum(model.x[a] for a in model.Loci[l]) >= 1) - model.is_read_cov = Constraint(model.R, - rule=lambda model, r: sum(model.cov[r, a] * model.x[a] for a in model.L) >= - model.y[r]) - model.heterozygot_count = Constraint( - rule=lambda model: model.hetero >= sum(model.x[a] for a in model.L) - model.nof_loci) - - # regularization constraints - model.reg1 = Constraint(model.R, rule=lambda model, r: model.re[r] <= model.nof_loci * model.y[r]) - model.reg2 = Constraint(model.R, rule=lambda model, r: model.re[r] <= model.hetero) - model.reg3 = Constraint(model.R, - rule=lambda model, r: model.re[r] >= model.hetero - model.nof_loci * (1 - model.y[r])) - - # generate constraint list for solution enumeration - model.c = ConstraintList() - # Generate instance. Used to be .create() but deprecated since, - # as ConcreteModels are instances on their own now. 
- self.__instance = model - - def set_beta(self, beta): - """ - Sets the parameter beta - """ - self.__changed = True - getattr(self.__instance, str(self.__instance.beta)).set_value(float(beta)) - - def set_t_max_allele(self, t_max_allele): - """ - Sets the upper bound of alleles selected per loci - """ - self.__changed = True - getattr(self.__instance, str(self.__instance.t_allele)).set_value(t_max_allele) - - def solve(self, ks): - """ - solves the problem k times and discards the found solutions in the next run. - """ - d = defaultdict(list) # in there we store the typing +objective and generate afterwards a DatarFrame with it - - if self.__changed or self.__ks != ks: - self.__ks = ks - for k in range(ks): - expr = 0 - - self.__instance.preprocess() - try: - res = self.__solver.solve(self.__instance, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. Falling back to single-threading.") - res = self.__solver.solve(self.__instance, options={}, tee=self.__verbosity) - self.__instance.solutions.load_from(res) # solution loading changed recently. - - # if self.__verbosity > 0: - # res.write(num=1) - - if res.solver.termination_condition != TerminationCondition.optimal: - print("Optimal solution hasn't been obtained. 
This is a terminal problem.") # TODO message, exit - break - - selected = [] - indices = [] - encountered_4digit = [] - for j in self.__instance.x: - if self.__allele_to_4digit[j][0] in 'HJG': - if 0.99 <= self.__instance.x[j].value <= 1.01: - selected.append(j) - indices.append(j) - continue - if 0.99 <= self.__instance.x[j].value <= 1.01: - selected.append(j) - exp_i = 0 - exp_i += self.__instance.x[j] - if self.__allele_to_4digit[j] in encountered_4digit: - continue - encountered_4digit.append(self.__allele_to_4digit[j]) - for i_allele in self.__groups_4digit[self.__allele_to_4digit[j]]: - if self.__instance.x[i_allele].value <= 0: - exp_i += self.__instance.x[i_allele] - indices.append(i_allele) - expr += (1.0 - exp_i) - zero_indices = set([j for j in self.__instance.x]).difference(set(indices)) - for j in zero_indices: - expr += self.__instance.x[j] - - self.__instance.c.add(expr >= 1) - - # if self.__verbosity > 0: - # print selected - # self.__instance.c.pprint() - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(selected[i]) - d[aas[i] + "2"].append(selected[i]) - else: - d[aas[i] + str(c[aas[i]])].append(selected[i]) - c[aas[i]] += 1 - - nof_reads = sum((self.__instance.occ[j] * self.__instance.y[j].value for j in self.__instance.y)) - # if self.__verbosity > 0: - # print "Obj", res.Solution.Objective.__default_objective__.Value - d['obj'].append(self.__instance.read_cov()) - d['nof_reads'].append(nof_reads) - - self.__instance.c.clear() - self.__changed = False - self.__enumeration = pd.DataFrame(d) - - # self.__rank() - return self.__enumeration - else: - return self.__enumeration - - def solve_for_k_alleles(self, k, ks=1): - """ - EXPERIMENTAL! 
- - generates a solution without the regularization term and only k selected alleles - """ - if k < int(self.__instance.nof_loci.value) or k > int(self.__instance.nof_loci.value * self.__t_max_allele): - raise Warning("k " + str(k) + " is out of range [" + str(self.__instance.nof_loci.value) + "," + str( - self.__instance.nof_loci * self.__t_max_allele) + "]") - - - # copy the instance - inst = self.__instance.clone() - # set beta = 0 # because we do homozygosity calling manually - getattr(inst, str(inst.beta)).set_value(float(0.0)) - - inst.del_component("heterozygot_count") - inst.del_component("reg1") - inst.del_component("reg2") - inst.del_component("reg3") - # generate constraint which allows only k alleles to be selected - expr1 = 0 - for j in inst.x: - expr1 += inst.x[j] - - inst.c.add(expr1 == k) - d = defaultdict(list) - - for _ in range(ks): - inst.preprocess() - try: - res = self.__solver.solve(inst, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. Falling back to single-threading.") - res = self.__solver.solve(inst, options={}, tee=self.__verbosity) - inst.solutions.load_from(res) - - if self.__verbosity > 0: - res.write(num=1) - - if res.solver.termination_condition != TerminationCondition.optimal: - print("Optimal solution hasn't been obtained. 
This is a terminal problem.") # TODO message, exit - break - - selected = [] - expr = 0 - - indices = [] - encountered_4digit = [] - for j in inst.x: - if 0.99 <= inst.x[j].value <= 1.01: - exp_i = 0 - selected.append(j) - exp_i += inst.x[j] - if self.__allele_to_4digit[j] in encountered_4digit: - continue - - encountered_4digit.append(self.__allele_to_4digit[j]) - for i_allele in self.__groups_4digit[self.__allele_to_4digit[j]]: - if inst.x[i_allele].value <= 0: - exp_i += inst.x[i_allele] - indices.append(i_allele) - expr += (1 - exp_i) - zero_indices = set([j for j in inst.x]).difference(set(indices)) - for j in zero_indices: - expr += inst.x[j] - - inst.c.add(expr >= 1) - - if self.__verbosity > 0: - print(selected) - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(selected[i]) - d[aas[i] + "2"].append(selected[i]) - else: - d[aas[i] + str(c[aas[i]])].append(selected[i]) - c[aas[i]] += 1 - - # print "Obj", res.Solution.Objective.__default_objective__.Value - nof_reads = sum((inst.occ[j] * inst.y[j].value for j in inst.y)) - d['obj'].append(inst.read_cov()) - d['nof_reads'].append(nof_reads) - - return pd.DataFrame(d) - - def solve_fixed_typing(self, fixed_alleles): - """ - EXPERIMENTAL! 
- - forces the allele to pic a 4-digit of the provided alleles - """ - k = len(set(fixed_alleles)) - if k < int(self.__instance.nof_loci.value) or k > int(self.__instance.nof_loci.value * self.__t_max_allele): - raise Warning("k " + str(k) + " is out of range [" + str(self.__instance.nof_loci.value) + "," + str( - self.__instance.nof_loci * self.__t_max_allele) + "]") - - - # copy the instance - inst = self.__instance.clone() - # set beta = 0 because we do homozygocity calling manually - getattr(inst, str(inst.beta)).set_value(float(0.0)) - - inst.del_component("heterozygot_count") - inst.del_component("reg1") - inst.del_component("reg2") - inst.del_component("reg3") - # generate constraint which allows only k alleles to be selected - expr1 = 0 - for j in inst.x: - expr1 += inst.x[j] - inst.c.add(expr1 == k) - - # generate for each of the provided alleles the fixation constraint: - for a in set(fixed_alleles): - expr_f = 0 - print(self.__groups_4digit) - for ids in self.__groups_4digit[a]: - print(ids) - expr_f += inst.x[ids] - inst.c.add(expr_f == 1) - - d = defaultdict(list) - - inst.preprocess() - try: - res = self.__solver.solve(inst, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. 
Falling back to single-threading.") - res = self.__solver.solve(inst, options={}, tee=self.__verbosity) - inst.solutions.load_from(res) - - opt_ids = [j for j in inst.x if 0.99 <= inst.x[j].value <= 1.01] - - aas = [self.__allele_to_4digit[x].split('*')[0] for x in opt_ids] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(opt_ids[i]) - d[aas[i] + "2"].append(opt_ids[i]) - else: - d[aas[i] + str(c[aas[i]])].append(opt_ids[i]) - c[aas[i]] += 1 - nof_reads = sum((inst.occ[j] * inst.y[j].value for j in inst.y)) - d['obj'].append(self.inst.read_cov()) - d['nof_reads'].append(nof_reads) - - return pd.DataFrame(d) - - def enumerate_allele_wise(self): - """ - EXPERIMENTAL! - - fixes all but one allele and solves it again to investigate the influence of this - particular allele on the objective value. - """ - d = defaultdict(list) - - self.__instance.preprocess() - try: - res = self.__solver.solve(self.__instance, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. 
Falling back to single-threading.") - res = self.__solver.solve(self.__instance, options={}, tee=self.__verbosity) - self.__instance.solutions.load_from(res) - - opt_ids = [j for j in self.__instance.x if 0.99 <= self.__instance.x[j].value <= 1.01] - - aas = [self.__allele_to_4digit[x].split('*')[0] for x in opt_ids] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(opt_ids[i]) - d[aas[i] + "2"].append(opt_ids[i]) - else: - d[aas[i] + str(c[aas[i]])].append(opt_ids[i]) - c[aas[i]] += 1 - nof_reads = sum((self.__instance.occ[j] * self.__instance.y[j].value for j in self.__instance.y)) - d['obj'].append(self.__instance.read_cov()) - d['nof_reads'].append(nof_reads) - d['discarded'].append(0) - - for j in opt_ids: - if self.__verbosity > 0: - self.__instance.c.pprint() - self.__instance.c.clear() - # fix all but j'th variable - fix = 0 - for i in opt_ids: - if i != j: - fix += (1 - self.__instance.x[i]) - self.__instance.c.add(fix == 0.0) - - # discard j'th allele and all its 4digit equivalent alleles form the next solution - discard = 0 - for k in self.__groups_4digit[self.__allele_to_4digit[j]]: - discard += self.__instance.x[k] - self.__instance.c.add(discard == 0.0) - - # solve with new constraints - self.__instance.preprocess() - try: - res = self.__solver.solve(self.__instance, tee=self.__verbosity) # ,tee=True) verbose solvinf - self.__instance.solutions.load_from(res) - except: - print(Warning("There is no replacement for allele " + self.__allele_to_4digit[j])) - continue - - selected = [al for al in self.__instance.x if 0.99 <= self.__instance.x[al].value <= 1.01] - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for q in range(len(aas)): - if aas.count(aas[q]) < 2: - d[aas[q] + "1"].append(selected[q]) - d[aas[q] + "2"].append(selected[q]) - else: - d[aas[q] + str(c[aas[q]])].append(selected[q]) - c[aas[q]] += 1 - nof_reads = 
sum((self.__instance.occ[h] * self.__instance.y[h].value for h in self.__instance.y)) - d['obj'].append(self.__instance.read_cov()) - d['nof_reads'].append(nof_reads) - d['discarded'].append(j) - return pd.DataFrame(d) - - def solve_enforced_zygosity(self, gosity_dict): - """ - EXPERIMENTAL! - - solves the ilp without regularization but enforced homo/heterozygosity for each locus - @param gosity_dict: a dictionary with all loci as keys and value = number of alleles per locus (default is 2) - """ - - inst = self.__instance.clone() - # set beta = 0 because we do homozygocity calling manually - getattr(inst, str(inst.beta)).set_value(float(0.0)) - - inst.del_component("heterozygot_count") - inst.del_component("reg1") - inst.del_component("reg2") - inst.del_component("reg3") - - # now delete max_allele_constraint and reconstruct it again - inst.del_component("max_allel_selection") - for locus in inst.LociNames: - cons = 0 - for a in inst.Loci[locus]: - cons += inst.x[a] - inst.c.add(cons <= gosity_dict.get(locus, 2)) - - d = defaultdict(list) - - inst.preprocess() - try: - res = self.__solver.solve(inst, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. 
Falling back to single-threading.") - res = self.__solver.solve(inst, options={}, tee=self.__verbosity) - inst.solutions.load_from(res) - - selected = [al for al in inst.x if 0.99 <= inst.x[al].value <= 1.01] - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for q in range(len(aas)): - if aas.count(aas[q]) < 2: - d[aas[q] + "1"].append(selected[q]) - d[aas[q] + "2"].append(selected[q]) - else: - d[aas[q] + str(c[aas[q]])].append(selected[q]) - c[aas[q]] += 1 - nof_reads = sum((inst.occ[h] * inst.y[h].value for h in inst.y)) - d['obj'].append(inst.read_cov()) - d['nof_reads'].append(nof_reads) - return pd.DataFrame(d) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5dff81f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,115 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "optitype" +version = "1.5.0" +description = "Precision HLA typing from next-generation sequencing data" +readme = "README.md" +license = {text = "BSD-3-Clause"} +requires-python = ">=3.10" +authors = [ + {name = "Andras Szolek"}, + {name = "Benjamin Schubert"}, + {name = "Christopher Mohr"}, + {name = "Jonas Scheid"}, +] +maintainers = [ + {name = "OptiType Contributors"} +] +keywords = [ + "HLA", + "typing", + "bioinformatics", + "NGS", + "next-generation-sequencing", + "immunogenetics", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "click>=8.0", + "numpy>=1.23", + "pandas>=2.0", + "pyarrow>=12.0", + "pyomo>=6.6", + "pysam>=0.21", + 
"matplotlib>=3.5", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "pytest-cov>=4.0", + "ruff>=0.1.0", + "mypy>=1.0", + "build>=1.0", + "twine>=4.0", +] + +[project.scripts] +optitype = "optitype.cli:main" + +[project.urls] +Homepage = "https://github.com/FRED-2/OptiType" +Documentation = "https://github.com/FRED-2/OptiType#readme" +Repository = "https://github.com/FRED-2/OptiType" +Issues = "https://github.com/FRED-2/OptiType/issues" + +[tool.hatch.build.targets.wheel] +packages = ["src/optitype"] + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/data", + "/tests", + "/README.md", + "/LICENSE", +] + +[tool.hatch.build.targets.wheel.shared-data] +"data/alleles" = "share/optitype/data/alleles" +"data/hla_reference_dna.fasta" = "share/optitype/data/hla_reference_dna.fasta" +"data/hla_reference_rna.fasta" = "share/optitype/data/hla_reference_rna.fasta" + +[tool.ruff] +target-version = "py310" +line-length = 100 +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "UP", # pyupgrade +] +ignore = [ + "E501", # line too long - handled by formatter +] + +[tool.ruff.isort] +known-first-party = ["optitype"] + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = "-v --tb=short" diff --git a/src/optitype/__init__.py b/src/optitype/__init__.py new file mode 100644 index 0000000..3be77c7 --- /dev/null +++ b/src/optitype/__init__.py @@ -0,0 +1,19 @@ +""" +OptiType: Precision HLA typing from next-generation sequencing data. + +OptiType is a novel HLA genotyping algorithm based on integer linear +programming, capable of producing accurate 4-digit HLA genotyping predictions +from NGS data by simultaneously selecting all minor and major HLA-I alleles. 
+""" + +__version__ = "1.5.0" +__author__ = "Andras Szolek, Benjamin Schubert, Christopher Mohr, Jonas Scheid" + +from optitype.api import HLATypingConfig, HLATypingResult, run_hla_typing + +__all__ = [ + "__version__", + "run_hla_typing", + "HLATypingResult", + "HLATypingConfig", +] diff --git a/src/optitype/__main__.py b/src/optitype/__main__.py new file mode 100644 index 0000000..e05ae51 --- /dev/null +++ b/src/optitype/__main__.py @@ -0,0 +1,6 @@ +"""Allow running optitype as a module: python -m optitype""" + +from optitype.cli import main + +if __name__ == "__main__": + main() diff --git a/src/optitype/_logging.py b/src/optitype/_logging.py new file mode 100644 index 0000000..6112e5c --- /dev/null +++ b/src/optitype/_logging.py @@ -0,0 +1,28 @@ +"""Logging configuration for OptiType.""" + +import logging +import time + +# Package-level logger +logger = logging.getLogger("optitype") + +# Record start time at import for elapsed-time display +_start_time = time.monotonic() + + +def elapsed() -> str: + """Return elapsed time since package load as H:MM:SS.ss.""" + delta = time.monotonic() - _start_time + minutes, seconds = divmod(delta, 60) + hours, minutes = divmod(int(minutes), 60) + return f"{hours}:{minutes:02d}:{seconds:05.2f}" + + +def configure_logging(verbose: bool) -> None: + """Configure the optitype logger for CLI or API use.""" + level = logging.DEBUG if verbose else logging.WARNING + logger.setLevel(level) + if not logger.handlers: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(handler) diff --git a/src/optitype/api.py b/src/optitype/api.py new file mode 100644 index 0000000..de005a7 --- /dev/null +++ b/src/optitype/api.py @@ -0,0 +1,191 @@ +""" +Programmatic API for OptiType. + +This module provides a clean Python API for running HLA typing analyses. 
+""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import pandas as pd + +from optitype.pipeline import PipelineConfig, load_config, run_pipeline + + +@dataclass +class HLATypingConfig: + """ + Configuration for HLA typing analysis. + + Attributes: + solver: ILP solver to use ('glpk', 'cbc', 'cplex'). + threads: Number of threads for ILP solving. + mapping_threads: Number of threads for read mapping. + razers3_path: Path to RazerS3 binary. + beta: Homozygosity detection parameter (0.0 to 0.1). + enumerate_count: Number of solutions to enumerate. + delete_bam: Whether to delete intermediate BAM files. + unpaired_weight: Weight for unpaired reads in paired-end mode. + use_discordant: Whether to use discordant read pairs. + """ + + solver: str = "glpk" + threads: int = 1 + mapping_threads: int = 4 + razers3_path: str = "razers3" + beta: float = 0.009 + enumerate_count: int = 1 + delete_bam: bool = True + unpaired_weight: float = 0.0 + use_discordant: bool = False + + def to_pipeline_config(self) -> PipelineConfig: + """Convert to internal PipelineConfig.""" + return PipelineConfig( + razers3_path=self.razers3_path, + mapping_threads=self.mapping_threads, + solver=self.solver, + ilp_threads=self.threads, + delete_bam=self.delete_bam, + unpaired_weight=self.unpaired_weight, + use_discordant=self.use_discordant, + ) + + +@dataclass +class HLATypingResult: + """ + Results from HLA typing analysis. + + Attributes: + best_result: Dictionary mapping locus positions to allele types. + all_results: DataFrame with all enumerated results. + reads: Number of reads explained by the best solution. + objective: Objective function value of the best solution. + output_csv: Path to the output CSV file. + output_plot: Path to the coverage plot PDF. 
+ """ + + best_result: dict[str, str | None] + all_results: pd.DataFrame + reads: float + objective: float + output_csv: str | None = None + output_plot: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert result to a dictionary.""" + return { + "typing": self.best_result, + "reads": self.reads, + "objective": self.objective, + "output_csv": self.output_csv, + "output_plot": self.output_plot, + } + + def __repr__(self) -> str: + types = [f"{k}: {v}" for k, v in self.best_result.items() if v is not None] + return f"HLATypingResult({', '.join(types)}, reads={self.reads:.0f})" + + +def run_hla_typing( + fastq_files: str | list[str], + seq_type: str, + output_dir: str | Path | None = None, + config: HLATypingConfig | None = None, + config_file: str | Path | None = None, + verbose: bool = False, +) -> HLATypingResult: + """ + Run HLA typing analysis on FASTQ or BAM files. + + This is the main entry point for programmatic use of OptiType. + + Args: + fastq_files: Path to input file(s). Single file for single-end, + list of two files for paired-end. + seq_type: Sequencing type ('dna' or 'rna'). + output_dir: Output directory. If None, uses current directory. + config: HLATypingConfig with analysis settings. If None, uses defaults. + config_file: Path to INI config file for additional settings. + verbose: Enable verbose output. + + Returns: + HLATypingResult with typing results. + + Raises: + ValueError: If inputs are invalid. + FileNotFoundError: If input files or reference data not found. + + Example: + >>> from optitype import run_hla_typing, HLATypingConfig + >>> result = run_hla_typing( + ... fastq_files=["sample_1.fastq", "sample_2.fastq"], + ... seq_type="dna", + ... config=HLATypingConfig(solver="cbc", threads=4) + ... 
) + >>> print(result.best_result) + {'A1': 'A*02:01', 'A2': 'A*03:01', 'B1': 'B*07:02', ...} + """ + # Normalize input files to list + if isinstance(fastq_files, str): + input_files = [fastq_files] + else: + input_files = list(fastq_files) + + # Validate inputs + for f in input_files: + if not Path(f).exists(): + raise FileNotFoundError(f"Input file not found: {f}") + + # Set up output directory + if output_dir is None: + output_dir = Path.cwd() / "optitype_output" + else: + output_dir = Path(output_dir) + + # Set up configuration + if config is None: + config = HLATypingConfig() + + # Load config file if provided + pipeline_config = config.to_pipeline_config() + if config_file: + file_config = load_config(config_file) + # Merge file config with API config (API takes precedence) + if config.razers3_path == "razers3": + pipeline_config.razers3_path = file_config.razers3_path + if config.solver == "glpk": + pipeline_config.solver = file_config.solver + + # Run pipeline + result = run_pipeline( + input_files=input_files, + seq_type=seq_type, + output_dir=str(output_dir), + beta=config.beta, + enumerate_count=config.enumerate_count, + config=pipeline_config, + verbose=verbose, + ) + + # Extract best result + best_row = result.result_4digit.iloc[0] + best_result = { + "A1": best_row.get("A1"), + "A2": best_row.get("A2"), + "B1": best_row.get("B1"), + "B2": best_row.get("B2"), + "C1": best_row.get("C1"), + "C2": best_row.get("C2"), + } + + return HLATypingResult( + best_result=best_result, + all_results=result.result_4digit, + reads=best_row.get("nof_reads", 0), + objective=best_row.get("obj", 0), + output_csv=result.output_csv, + output_plot=result.output_plot, + ) diff --git a/src/optitype/cli.py b/src/optitype/cli.py new file mode 100644 index 0000000..f603ab1 --- /dev/null +++ b/src/optitype/cli.py @@ -0,0 +1,428 @@ +""" +Click-based command-line interface for OptiType. 

Usage:
    optitype run -i sample_1.fq -i sample_2.fq --dna -o output/
    optitype check-deps
    optitype init-config
"""

import shutil
import sys
from pathlib import Path

import click

from optitype import __version__
from optitype.io.data import get_data_path, get_reference_fasta


@click.group()
@click.version_option(version=__version__, prog_name="optitype")
def main():
    """OptiType: Precision HLA typing from next-generation sequencing data.

    OptiType is a novel HLA genotyping algorithm based on integer linear
    programming, capable of producing accurate 4-digit HLA genotyping
    predictions from NGS data.

    \b
    External dependencies:
    - RazerS3: For read mapping (conda install -c bioconda razers3)
    - ILP solver: GLPK, CBC, or CPLEX

    \b
    Example usage:
    optitype run -i reads_1.fq -i reads_2.fq --dna -o results/
    optitype run -i sample.bam --rna -o results/
    """
    pass


@main.command()
@click.option(
    "-i", "--input",
    "input_files",
    multiple=True,
    required=True,
    type=click.Path(exists=True),
    help="Input FASTQ or BAM files. One file for single-end, two for paired-end.",
)
@click.option(
    "-r", "--rna",
    "seq_type",
    flag_value="rna",
    help="Input data is RNA sequencing.",
)
@click.option(
    "-d", "--dna",
    "seq_type",
    flag_value="dna",
    default=True,
    help="Input data is DNA sequencing (default).",
)
@click.option(
    "-o", "--outdir",
    required=True,
    type=click.Path(),
    help="Output directory for results.",
)
@click.option(
    "-p", "--prefix",
    default=None,
    help="Output filename prefix. Default: timestamp.",
)
@click.option(
    "-b", "--beta",
    type=float,
    default=0.009,
    help="Homozygosity detection parameter (0.0-0.1). Default: 0.009.",
)
@click.option(
    "-e", "--enumerate",
    "enumerate_count",
    type=int,
    default=1,
    help="Number of solutions to enumerate. Default: 1.",
)
@click.option(
    "--solver",
    type=click.Choice(["glpk", "cbc", "cplex"]),
    default="glpk",
    envvar="OPTITYPE_SOLVER",
    help="ILP solver to use. Default: glpk.",
)
@click.option(
    "--razers3",
    type=click.Path(),
    default=None,
    envvar="OPTITYPE_RAZERS3",
    help="Path to RazerS3 binary. Default: search PATH.",
)
@click.option(
    "--threads",
    type=int,
    default=4,
    help="Number of threads for mapping. Default: 4.",
)
@click.option(
    "--ilp-threads",
    type=int,
    default=1,
    help="Number of threads for ILP solver. Default: 1.",
)
@click.option(
    "--keep-bam/--delete-bam",
    default=False,
    help="Keep or delete intermediate BAM files. Default: delete.",
)
@click.option(
    "-c", "--config",
    type=click.Path(exists=True),
    default=None,
    help="Path to config.ini file for additional settings.",
)
@click.option(
    "-v", "--verbose",
    is_flag=True,
    help="Enable verbose output.",
)
def run(
    input_files,
    seq_type,
    outdir,
    prefix,
    beta,
    enumerate_count,
    solver,
    razers3,
    threads,
    ilp_threads,
    keep_bam,
    config,
    verbose,
):
    """Run HLA typing analysis.

    \b
    Examples:
    # Paired-end DNA analysis
    optitype run -i reads_1.fq -i reads_2.fq --dna -o results/

    # Single-end RNA analysis
    optitype run -i sample.fastq --rna -o results/

    # Re-analyze from BAM file
    optitype run -i mapped.bam --dna -o results/

    # With custom settings
    optitype run -i reads.fq --dna -o results/ --solver cbc --threads 8
    """
    # Deferred import: the pipeline machinery is only needed by this subcommand.
    from optitype.pipeline import PipelineConfig, run_pipeline

    # Validate inputs
    if len(input_files) not in (1, 2):
        raise click.BadParameter(
            "Number of input files must be 1 (single-end) or 2 (paired-end)",
            param_hint="'-i/--input'",
        )

    # Beta range is half-open: 0.0 is allowed, 0.1 is not.
    if not 0.0 <= beta < 0.1:
        raise click.BadParameter(
            "Beta must be between 0.0 and 0.1",
            param_hint="'-b/--beta'",
        )

    if enumerate_count < 1:
        raise click.BadParameter(
            "Enumerate count must be at least 1",
            param_hint="'-e/--enumerate'",
        )

    # Determine razers3 path (only required for FASTQ input, not BAM/SAM)
    # NOTE(review): naive extension check via split('.'); Path(...).suffix
    # would also handle paths whose directories contain dots — confirm and
    # consider switching.
    bam_input = input_files[0].split(".")[-1].lower() in ("sam", "bam")
    if razers3 is None:
        razers3 = shutil.which("razers3") or "razers3"
    if not bam_input and not shutil.which(razers3):
        raise click.ClickException(
            "RazerS3 not found in PATH. Install with: conda install -c bioconda razers3\n"
            "Or specify path with --razers3 option or OPTITYPE_RAZERS3 environment variable."
        )

    # Create config
    pipeline_config = PipelineConfig(
        razers3_path=razers3,
        mapping_threads=threads,
        solver=solver,
        ilp_threads=ilp_threads,
        delete_bam=not keep_bam,
        unpaired_weight=0.0,
        use_discordant=False,
    )

    # Load additional settings from config file if provided
    if config:
        from optitype.pipeline import load_config
        file_config = load_config(config)
        if file_config.unpaired_weight > 0:
            pipeline_config.unpaired_weight = file_config.unpaired_weight
        if file_config.use_discordant:
            pipeline_config.use_discordant = file_config.use_discordant

    # Run the pipeline
    if verbose:
        click.echo(f"OptiType {__version__}")
        click.echo(f"Input files: {list(input_files)}")
        click.echo(f"Sequence type: {seq_type}")
        click.echo(f"Output directory: {outdir}")
        click.echo(f"Solver: {solver}")
        click.echo()

    try:
        result = run_pipeline(
            input_files=list(input_files),
            seq_type=seq_type,
            output_dir=outdir,
            prefix=prefix,
            beta=beta,
            enumerate_count=enumerate_count,
            config=pipeline_config,
            verbose=verbose,
        )

        # Print results summary
        click.echo()
        click.echo("=" * 60)
        click.echo("HLA Typing Results")
        click.echo("=" * 60)

        best = result.result_4digit.iloc[0]
        for locus in ["A", "B", "C"]:
            a1 = best.get(f"{locus}1", "-")
            a2 = best.get(f"{locus}2", "-")
            if a1 == a2:
                click.echo(f" HLA-{locus}: {a1} (homozygous)")
            else:
                click.echo(f" HLA-{locus}: {a1}, {a2}")

        click.echo()
        click.echo(f"Reads: {best['nof_reads']:.0f}")
        click.echo(f"Objective: {best['obj']:.2f}")
        click.echo()
        click.echo(f"Results written to: {result.output_csv}")
        click.echo(f"Coverage plot: {result.output_plot}")

    except Exception as e:
        # NOTE(review): str(e) discards the traceback; `raise ... from e`
        # would preserve the chain for debugging — confirm before changing.
        raise click.ClickException(str(e))


@main.command("check-deps")
def check_deps():
    """Check that all external dependencies are available.

    Verifies that RazerS3 and an ILP solver are installed and accessible.
    """
    click.echo("Checking OptiType dependencies...")
    click.echo()

    all_ok = True

    # Check RazerS3 — a hard requirement (missing => failure exit below).
    razers3 = shutil.which("razers3")
    if razers3:
        click.echo(click.style(" [OK]", fg="green") + f" RazerS3: {razers3}")
    else:
        click.echo(click.style(" [MISSING]", fg="red") + " RazerS3")
        click.echo(" Install with: conda install -c bioconda razers3")
        all_ok = False

    # Check GLPK — solvers are individually optional; at least one of
    # GLPK/CBC must be present (checked at the end).
    glpsol = shutil.which("glpsol")
    if glpsol:
        click.echo(click.style(" [OK]", fg="green") + f" GLPK: {glpsol}")
    else:
        click.echo(click.style(" [MISSING]", fg="yellow") + " GLPK")
        click.echo(" Install with: apt install glpk-utils (or conda install -c conda-forge glpk)")

    # Check CBC
    cbc = shutil.which("cbc")
    if cbc:
        click.echo(click.style(" [OK]", fg="green") + f" CBC: {cbc}")
    else:
        click.echo(click.style(" [MISSING]", fg="yellow") + " CBC")
        click.echo(" Install with: apt install coinor-cbc (or conda install -c conda-forge coincbc)")

    # Check CPLEX (optional)
    cplex = shutil.which("cplex")
    if cplex:
        click.echo(click.style(" [OK]", fg="green") + f" CPLEX: {cplex}")
    else:
        click.echo(click.style(" [N/A]", fg="cyan") + " CPLEX (optional, commercial solver)")

    # Check reference data — also a hard requirement.
    click.echo()
    try:
        data_path = get_data_path()
        click.echo(click.style(" [OK]", fg="green") + f" Reference data: {data_path}")
    except FileNotFoundError:
        click.echo(click.style(" [MISSING]", fg="red") + " Reference data")
        click.echo(" Set OPTITYPE_DATA environment variable to data directory")
        all_ok = False

    click.echo()
    if all_ok and (glpsol or cbc):
        click.echo(click.style("All required dependencies are available!", fg="green"))
    else:
        click.echo(click.style("Some dependencies are missing.", fg="red"))
        if not (glpsol or cbc):
            click.echo("At least one ILP solver (GLPK or CBC) is required.")
        raise SystemExit(1)


@main.command("init-config")
@click.option(
    "-o", "--output",
    type=click.Path(),
    default="config.ini",
    help="Output path for config file. Default: config.ini",
)
@click.option(
    "--force",
    is_flag=True,
    help="Overwrite existing config file.",
)
def init_config(output, force):
    """Generate a default configuration file.

    Creates a config.ini file with default settings that can be customized.
    """
    # Template written verbatim; this is the same INI format consumed by the
    # -c/--config option of `optitype run` (via load_config).
    config_content = """[mapping]

# Path to RazerS3 binary. If not specified, searched in PATH.
# razers3=/path/to/razers3

# Number of threads for read mapping
threads=4

[ilp]

# ILP solver to use: glpk, cbc, or cplex
solver=glpk

# Number of threads for ILP solver
threads=1

[behavior]

# Delete intermediate BAM files after processing
deletebam=true

# Weight for unpaired reads in paired-end mode (0-1)
# 0 means unpaired reads are ignored
# 1 means unpaired reads are weighted equally to paired reads
unpaired_weight=0

# Use discordant read pairs (where ends map to different alleles)
use_discordant=false
"""

    output_path = Path(output)
    if output_path.exists() and not force:
        raise click.ClickException(
            f"Config file already exists: {output}\n"
            "Use --force to overwrite."
        )

    output_path.write_text(config_content)
    click.echo(f"Config file created: {output}")
    click.echo("Edit the file to customize settings.")


@main.command("info")
def info():
    """Show information about the OptiType installation."""
    click.echo(f"OptiType version: {__version__}")
    click.echo()

    # Python info
    click.echo(f"Python: {sys.version}")
    click.echo()

    # Check data path
    try:
        data_path = get_data_path()
        click.echo(f"Data directory: {data_path}")

        # Check for reference files
        for ref_type in ["dna", "rna"]:
            try:
                ref_path = get_reference_fasta(ref_type)
                click.echo(f" {ref_type.upper()} reference: {ref_path}")
            except FileNotFoundError:
                click.echo(f" {ref_type.upper()} reference: NOT FOUND")

        # Check for allele data
        alleles_path = data_path / "alleles"
        if alleles_path.exists():
            parquet_files = list(alleles_path.glob("*.parquet"))
            click.echo(f" Allele data: {len(parquet_files)} parquet files")
        else:
            click.echo(" Allele data: NOT FOUND")
    except FileNotFoundError as e:
        click.echo(f"Data directory: NOT FOUND ({e})")

    click.echo()

    # Package dependencies — plain __import__ works since all names are
    # top-level (non-dotted) module names.
    click.echo("Dependencies:")
    for pkg in ["click", "numpy", "pandas", "pyarrow", "pyomo", "pysam", "matplotlib"]:
        try:
            mod = __import__(pkg)
            version = getattr(mod, "__version__", "unknown")
            click.echo(f" {pkg}: {version}")
        except ImportError:
            click.echo(f" {pkg}: NOT INSTALLED")


if __name__ == "__main__":
    main()
diff --git a/src/optitype/hlatyper.py b/src/optitype/hlatyper.py
new file mode 100644
index 0000000..9e62540
--- /dev/null
+++ b/src/optitype/hlatyper.py
@@ -0,0 +1,538 @@
"""
Core HLA typing logic for OptiType.

This module handles coverage matrix construction, statistical calculations,
allele pruning, and visualization.
+""" + +import logging +import warnings +from collections.abc import Callable + +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from optitype._logging import elapsed + +logger = logging.getLogger(__name__) + + +def feature_order(feature: tuple[str, int]) -> int: + """ + Return sort order for feature tuples. + + Returns: + 0 for ('UTR', 1) + 2 for exon, 1 + 3 for intron, 1 + 4 for exon, 2 + ... + 999 for UTR, 2 + """ + feat_type, feat_number = feature + if feat_type not in ("intron", "exon", "UTR"): + raise ValueError(f"Unknown feature type: {feat_type!r}") + if not isinstance(feat_number, int): + raise TypeError(f"Feature number must be int, got {type(feat_number).__name__}") + if feature == ("UTR", 1): + return 0 + elif feature == ("UTR", 2): + return 999 + else: + return feat_number * 2 + (feat_type == "intron") + + +def get_compact_model( + hit_df: pd.DataFrame, + weak_hit_df: pd.DataFrame | None = None, + weight: float | None = None, +) -> tuple[pd.DataFrame, dict]: + """ + Turn a binary hit matrix into a compact model. + + Removes duplicate rows and creates an occurrence vector with the number + of rows each representative read represents. + + Args: + hit_df: Binary hit matrix DataFrame. + weak_hit_df: Optional DataFrame of "weak" hits (e.g., unpaired reads). + weight: Weight for weak hits (0 < weight <= 1). + + Returns: + Tuple of (unique_matrix, occurrence_dict). 
+ """ + hit_df = hit_df.loc[hit_df.any(axis=1)] # Remove all-zero rows + occurrence = { + r[0]: len(r) for r in hit_df.groupby(hit_df.columns.tolist()).groups.values() + } + + if weak_hit_df is not None: + weak_hit_df = weak_hit_df.loc[weak_hit_df.any(axis=1)] + if not (0 < weight <= 1): + raise ValueError(f"weak hit weight must be in (0, 1], got {weight}") + weak_occ = { + r[0]: len(r) * weight + for r in weak_hit_df.groupby(weak_hit_df.columns.tolist()).groups.values() + } + occurrence.update(weak_occ) + unique_mtx = pd.concat([hit_df.drop_duplicates(), weak_hit_df.drop_duplicates()]) + else: + unique_mtx = hit_df.drop_duplicates() + + return unique_mtx, occurrence + + +def mtx_to_sparse_dict(hit_df: pd.DataFrame) -> dict: + """ + Convert a hit matrix to a sparse dictionary. + + Creates dictionary of (read, allele): 1 tuples for hits. + + Args: + hit_df: Hit matrix DataFrame. + + Returns: + Dictionary mapping (read_id, allele_id) to 1 for each hit. + """ + all_hits = {} + for read_id, alleles in hit_df.iterrows(): + hit_alleles = alleles[alleles != 0].index + for hit_allele in hit_alleles: + all_hits[(read_id, hit_allele)] = 1 + return all_hits + + +def prune_identical_alleles( + binary_mtx: pd.DataFrame, report_groups: bool = False +) -> pd.DataFrame | tuple[pd.DataFrame, dict]: + """ + Remove identical allele columns from the binary matrix. + + Args: + binary_mtx: Binary hit matrix. + report_groups: If True, also return a dict mapping representatives to their groups. + + Returns: + Pruned matrix, or tuple of (pruned_matrix, groups_dict) if report_groups=True. 
+ """ + hash_columns = binary_mtx.transpose().dot(np.random.rand(binary_mtx.shape[0])) + + if report_groups: + grouper = hash_columns.groupby(hash_columns) + groups = {g[1].index[0]: g[1].index.tolist() for g in grouper} + + alleles_to_keep = hash_columns.drop_duplicates().index + + if report_groups: + return binary_mtx[alleles_to_keep], groups + return binary_mtx[alleles_to_keep] + + +def prune_identical_reads(binary_mtx: pd.DataFrame) -> pd.DataFrame: + """ + Remove identical read rows from the binary matrix. + + This should only be used to compactify a matrix before passing it to + the function finding overshadowed alleles. Final compactifying should + be done on the original binary matrix. + + Args: + binary_mtx: Binary hit matrix. + + Returns: + Pruned matrix with unique rows. + """ + reads_to_keep = ( + binary_mtx.dot(np.random.rand(binary_mtx.shape[1])).drop_duplicates().index + ) + return binary_mtx.loc[reads_to_keep] + + +def prune_overshadowed_alleles(binary_mtx: pd.DataFrame) -> pd.Index: + """ + Find and remove alleles that are overshadowed by others. + + An allele is overshadowed if another allele has all of its hits plus more. + + For optimal performance, first prune identical columns and rows: + prune_overshadowed_alleles(prune_identical_alleles(prune_identical_reads(binary_mtx))) + + Args: + binary_mtx: Binary hit matrix. + + Returns: + Index of non-overshadowed alleles. 
+ """ + # Handle potential overflow in dot product + if (binary_mtx.shape[0] < np.iinfo(np.uint16).max) or ( + binary_mtx.sum(axis=0).max() < np.iinfo(np.uint16).max + ): + bb = ( + binary_mtx + if all(binary_mtx.dtypes == np.uint16) + else binary_mtx.astype(np.uint16) + ) + else: + bb = binary_mtx.astype(np.uint32) + + covariance = bb.transpose().dot(bb) + diagonal = pd.Series( + [covariance[ii][ii] for ii in covariance.columns], index=covariance.columns + ) + + new_covariance = covariance[covariance.columns].copy() + for ii in new_covariance.columns: + new_covariance.loc[ii, ii] = 0 + + overshadowed = [] + for ii in new_covariance.columns: + potential_superiors = new_covariance[ii][new_covariance[ii] == diagonal[ii]].index + if any(diagonal[potential_superiors] > diagonal[ii]): + overshadowed.append(ii) + + non_overshadowed = covariance.columns.difference(overshadowed) + return non_overshadowed + + +def create_paired_matrix( + binary_1: pd.DataFrame, + binary_2: pd.DataFrame, + id_cleaning: Callable | None = None, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Create paired, mispaired, and unpaired matrices from two binary hit matrices. + + Args: + binary_1: Binary hit matrix for first read end. + binary_2: Binary hit matrix for second read end. + id_cleaning: Optional function to clean read IDs for matching. + + Returns: + Tuple of (paired, mispaired, unpaired) DataFrames. 
    """
    # NOTE(review): id_cleaning rewrites the indices of the caller's
    # DataFrames in place — confirm no caller reuses the originals afterwards.
    if id_cleaning is not None:
        binary_1.index = list(map(id_cleaning, binary_1.index))
        binary_2.index = list(map(id_cleaning, binary_2.index))

    common_read_ids = binary_1.index.intersection(binary_2.index)
    only_1 = binary_1.index.difference(binary_2.index)
    only_2 = binary_2.index.difference(binary_1.index)

    b_1 = binary_1.loc[common_read_ids]
    b_2 = binary_2.loc[common_read_ids]
    b_12 = b_1 * b_2  # Elementwise AND

    # A pair is "paired" when both ends hit at least one common allele;
    # otherwise the two ends' hits are summed as a "mispaired" row.
    b_ispaired = b_12.any(axis=1)
    b_paired = b_12.loc[b_ispaired]
    b_mispaired = b_1.loc[~b_ispaired] + b_2.loc[~b_ispaired]
    b_unpaired = pd.concat([binary_1.loc[only_1], binary_2.loc[only_2]])

    logger.debug(
        "%s Alignment pairing completed. %d paired, %d unpaired, %d discordant",
        elapsed(), b_paired.shape[0], b_unpaired.shape[0], b_mispaired.shape[0],
    )

    return b_paired, b_mispaired, b_unpaired


def get_features(
    allele_id: str, features: pd.DataFrame, feature_list: list[tuple[str, int]]
) -> pd.DataFrame:
    """
    Get features for an allele from the features DataFrame.

    For reconstructed alleles (partial_complete IDs), features present in the
    partial allele take precedence over the complete allele's features.

    Args:
        allele_id: Allele ID (can be HLA12345 or HLA12345_HLA67890 for reconstructed).
        features: Features DataFrame.
        feature_list: List of (feature_type, feature_number) tuples to include.

    Returns:
        DataFrame of features to include.
    """
    if "_" in allele_id:
        partial_allele, complete_allele = allele_id.split("_")
    else:
        complete_allele = allele_id
        partial_allele = None

    feats_complete = {
        (of["feature"], of["number"]): of
        for _, of in features.loc[features["id"] == complete_allele].iterrows()
    }

    if partial_allele:
        feats_partial = {
            (of["feature"], of["number"]): of
            for _, of in features.loc[features["id"] == partial_allele].iterrows()
        }
    else:
        feats_partial = feats_complete

    feats_to_include = []
    # Iterate in genomic order as defined by feature_order.
    for feat in sorted(feature_list, key=feature_order):
        if feat in feats_partial:
            feats_to_include.append(feats_partial[feat])
        elif feat in feats_complete:
            feats_to_include.append(feats_complete[feat])
        else:
            warnings.warn(f"Feature {feat} not found for allele {allele_id}")

    return pd.DataFrame(feats_to_include)


def calculate_coverage(
    alignment: tuple,
    features: pd.DataFrame,
    alleles_to_plot: pd.Series,
    features_used: list[tuple[str, int]],
) -> list[tuple[str, np.ndarray]]:
    """
    Calculate coverage matrices for alleles.

    Args:
        alignment: Tuple of alignment data (pos, read_details, ...).
            2 elements: single-end; 4: paired without pairing info;
            5: paired with (paired, unpaired, mispaired) binary matrices.
        features: Features DataFrame.
        alleles_to_plot: Series of allele IDs to plot.
        features_used: List of features to use.

    Returns:
        List of (allele_id, coverage_matrix) tuples.
    """
    if len(alignment) not in (2, 4, 5):
        raise ValueError(f"Alignment tuple must have 2, 4 or 5 elements, got {len(alignment)}")
    # NOTE(review): has_pairing_info is computed but never read below —
    # confirm before removing.
    has_pairing_info = len(alignment) == 5

    if len(alignment) == 2:
        # Single-end: every hit is treated as "unpaired" (pairing code 2).
        matrix_pos, read_details = alignment[:2]
        pos = matrix_pos[alleles_to_plot]
        pairing = np.sign(pos) * 2
        hit_counts = np.sign(pos).sum(axis=1)
        max_ambiguity = max(1, hit_counts.max())
        to_process = [(pos, read_details, hit_counts, pairing)]
    elif len(alignment) == 4:
        # Paired ends but no pairing classification available: both ends get
        # pairing code 2 ("unpaired").
        matrix_pos1, read_details1, matrix_pos2, read_details2 = alignment
        pos1 = matrix_pos1[alleles_to_plot]
        pos2 = matrix_pos2[alleles_to_plot]
        pairing1 = np.sign(pos1) * 2
        pairing2 = np.sign(pos2) * 2
        hit_counts1 = np.sign(pos1).sum(axis=1)
        hit_counts2 = np.sign(pos2).sum(axis=1)
        max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max())
        to_process = [
            (pos1, read_details1, hit_counts1, pairing1),
            (pos2, read_details2, hit_counts2, pairing2),
        ]
    else:
        # Full pairing info: codes 1 = paired, 2 = unpaired, 3 = mispaired.
        matrix_pos1, read_details1, matrix_pos2, read_details2, pairing_binaries = alignment
        bin_p, bin_u, bin_m = pairing_binaries
        pos1 = matrix_pos1[alleles_to_plot]
        pos2 = matrix_pos2[alleles_to_plot]
        pairing = pd.concat([
            bin_p[alleles_to_plot],
            bin_u[alleles_to_plot] * 2,
            bin_m[alleles_to_plot] * 3,
        ])
        pairing1 = pairing.loc[pos1.index]
        pairing2 = pairing.loc[pos2.index]
        hit_counts1 = np.sign(pairing1).sum(axis=1)
        hit_counts2 = np.sign(pairing2).sum(axis=1)
        max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max())
        to_process = [
            (pos1, read_details1, hit_counts1, pairing1),
            (pos2, read_details2, hit_counts2, pairing2),
        ]

    coverage_matrices = []

    for allele in alleles_to_plot:
        allele_features = get_features(allele, features, features_used)
        allele_length = allele_features["length"].sum()

        # coverage axes (from the indexing below):
        # [mismatch 0/1][pairing category - 1][ambiguity - 1][position].
        coverage = np.zeros((2, 3, max_ambiguity, allele_length), dtype=int)

        for pos, read_details, hit_counts, pairing_info in to_process:
            reads = pos[pos[allele] != 0].index
            for i_pos, i_read_length, i_mismatches, i_hitcount, i_pairing in zip(
                pos.loc[reads][allele],
                read_details.loc[reads]["read_length"],
                read_details.loc[reads]["mismatches"],
                hit_counts[reads],
                pairing_info.loc[reads][allele],
            ):
                if not i_pairing:
                    continue
                # Increment coverage over the read's span (1-based positions).
                coverage[int(bool(i_mismatches))][i_pairing - 1][i_hitcount - 1][
                    i_pos - 1 : i_pos - 1 + i_read_length
                ] += 1

        coverage_matrices.append((allele, coverage))

    return coverage_matrices


def plot_coverage(
    outfile: str,
    coverage_matrices: list[tuple[str, np.ndarray]],
    allele_data: pd.DataFrame,
    features: pd.DataFrame,
    features_used: list[tuple[str, int]],
    columns: int = 2,
) -> None:
    """
    Generate coverage plot PDF.

    Args:
        outfile: Output PDF path.
        coverage_matrices: List of (allele_id, coverage_matrix) tuples.
        allele_data: Allele table DataFrame.
        features: Features DataFrame.
        features_used: List of features used.
        columns: Number of columns in the plot.
    """
    def start_end_zeros(cov_array):
        # Pad both ends with zeros so stackplot areas close at the x-limits.
        return np.append(np.append([0], cov_array), [0])

    def allele_sorter(allele_cov_mtx_tuple):
        allele, _ = allele_cov_mtx_tuple
        return allele_data.loc[allele.split("_")[0]]["type"]

    def get_allele_locus(allele):
        return allele_data.loc[allele.split("_")[0]]["locus"]

    number_of_loci = len(set(get_allele_locus(allele) for allele, _ in coverage_matrices))

    dpi = 50
    box_size = (7, 1)
    # Three grid rows per locus plus one row for the legend.
    subplot_rows = 3 * number_of_loci + 1

    # Order must match the stackplot argument order below.
    area_colors = [
        (0.26, 0.76, 0.26),  # perfect, paired, unique
        (0.40, 0.84, 0.40),  # perfect, paired, shared
        (0.99, 0.75, 0.20),  # perfect, unpaired, unique
        (0.99, 0.75, 0.20),  # perfect, mispaired, unique
        (0.99, 0.85, 0.35),  # perfect, unpaired, shared
        (0.99, 0.85, 0.35),  # perfect, mispaired, shared
        (0.99, 0.23, 0.23),  # mismatch, paired, unique
        (0.99, 0.49, 0.49),  # mismatch, paired, shared
        (0.14, 0.55, 0.72),  # mismatch, unpaired, unique
        (0.14, 0.55, 0.72),  # mismatch, mispaired, unique
        (0.33, 0.70, 0.88),  # mismatch, unpaired, shared
        (0.33, 0.70, 0.88),  # mismatch, mispaired, shared
    ]

    figure = plt.figure(figsize=(box_size[0] * columns, box_size[1] * subplot_rows), dpi=dpi)

    coverage_matrices = sorted(coverage_matrices, key=allele_sorter)
    prev_locus = ""
    i_locus = -1

    for allele, coverage in coverage_matrices:
        if "_" in allele:
            partial, complete = allele.split("_")
            plot_title = f"{allele_data.loc[partial]['type']} (introns from {allele_data.loc[complete]['type']})"
        else:
            plot_title = allele_data.loc[allele]["type"]

        # Two alleles per locus: first goes in column 0, second in column 1.
        if prev_locus != get_allele_locus(allele):
            i_locus += 1
            i_allele_in_locus = 0
        else:
            i_allele_in_locus = 1

        prev_locus = get_allele_locus(allele)

        plot = plt.subplot2grid(
            (subplot_rows, columns),
            (3 * i_locus, i_allele_in_locus),
            rowspan=3,
            adjustable="box",
        )

        _, _, max_ambig, seq_length = coverage.shape
        # Weight ambiguous hits by 1/multiplicity; index 0 (unique hits) is
        # zeroed because unique coverage is drawn separately.
        shared_weighting = np.reciprocal(np.arange(max_ambig) + 1.0)
        shared_weighting[0] = 0

        perfect_paired_unique = start_end_zeros(coverage[0][0][0])
        mismatch_paired_unique = start_end_zeros(coverage[1][0][0])
        perfect_unpaired_unique = start_end_zeros(coverage[0][1][0])
        mismatch_unpaired_unique = start_end_zeros(coverage[1][1][0])
        perfect_mispaired_unique = start_end_zeros(coverage[0][2][0])
        mismatch_mispaired_unique = start_end_zeros(coverage[1][2][0])

        perfect_paired_shared = start_end_zeros(shared_weighting.dot(coverage[0][0]))
        mismatch_paired_shared = start_end_zeros(shared_weighting.dot(coverage[1][0]))
        perfect_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][1]))
        mismatch_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][1]))
        perfect_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][2]))
        mismatch_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][2]))

        # Exon annotation
        i_start = 1
        for _, ft in get_features(allele, features, features_used).iterrows():
            if ft["feature"] == "exon":
                plot.axvspan(i_start, i_start + ft["length"], facecolor="black", alpha=0.1, linewidth=0, zorder=1)
            i_start += ft["length"]

        # Argument order corresponds 1:1 to area_colors above.
        areas = plot.stackplot(
            np.arange(seq_length + 2),
            perfect_paired_unique + 0.001,
            perfect_paired_shared,
            perfect_unpaired_unique,
            perfect_mispaired_unique,
            perfect_unpaired_shared,
            perfect_mispaired_shared,
            mismatch_paired_unique,
            mismatch_paired_shared,
            mismatch_unpaired_unique,
            mismatch_mispaired_unique,
            mismatch_unpaired_shared,
            mismatch_mispaired_shared,
            linewidth=0,
            colors=area_colors,
            zorder=5,
        )

        for aa in areas:
            aa.set_edgecolor(aa.get_facecolor())

        plot.tick_params(axis="both", labelsize=10, direction="out", which="both", top=False)
        plot.text(
            0.015, 0.97, plot_title,
            horizontalalignment="left",
            verticalalignment="top",
            transform=plot.transAxes,
            fontsize=10,
            zorder=6,
        )
        _, _, _, y2 = plot.axis()
        plot.axis((0, seq_length, 1, y2))
        plot.set_yscale("log")
        plot.set_ylim(bottom=0.5)

    # Legend
    legend = plt.subplot2grid((subplot_rows, columns), (subplot_rows - 1, 0), colspan=2, adjustable="box")
    legend.add_patch(mpatches.Rectangle((0, 2), 2, 2, color=area_colors[0]))
    legend.add_patch(mpatches.Rectangle((0, 0), 2, 2, color=area_colors[1]))
    legend.add_patch(mpatches.Rectangle((25, 2), 2, 2, color=area_colors[2]))
    legend.add_patch(mpatches.Rectangle((25, 0), 2, 2, color=area_colors[4]))
    legend.add_patch(mpatches.Rectangle((50, 2), 2, 2, color=area_colors[6]))
    legend.add_patch(mpatches.Rectangle((50, 0), 2, 2, color=area_colors[7]))
    legend.add_patch(mpatches.Rectangle((75, 2), 2, 2, color=area_colors[8]))
    legend.add_patch(mpatches.Rectangle((75, 0), 2, 2, color=area_colors[10]))
    legend.text(2.5, 3, "paired, no mismatches, unique", va="center", size="smaller")
    legend.text(2.5, 1, "paired, no mismatches, ambiguous", va="center", size="smaller")
    legend.text(27.5, 3, "unpaired, no mismatches, unique", va="center", size="smaller")
    legend.text(27.5, 1, "unpaired, no mismatches, ambiguous", va="center", size="smaller")
    legend.text(52.5, 3, "paired, mismatched, unique", va="center", size="smaller")
    legend.text(52.5, 1, "paired, mismatched, ambiguous", va="center", size="smaller")
    legend.text(77.5, 3, "unpaired, mismatched, unique", va="center", size="smaller")
    legend.text(77.5, 1, "unpaired, mismatched, ambiguous", va="center", size="smaller")
    legend.set_xlim(0, 100)
    legend.set_ylim(0, 4)
    legend.axison = False

    figure.tight_layout()
    figure.savefig(outfile)
    plt.close(figure)
diff --git a/src/optitype/io/__init__.py b/src/optitype/io/__init__.py
new file mode 100644
index 0000000..1d63698
--- /dev/null
+++ b/src/optitype/io/__init__.py
@@ -0,0 +1,12 @@
"""I/O utilities for OptiType."""

from optitype.io.data import get_data_path, load_reference_data
from optitype.io.readers import PYSAM_AVAILABLE, pysam_to_dataframe, sam_to_dataframe

__all__ = [
    "PYSAM_AVAILABLE",
    "get_data_path",
    "load_reference_data",
    "pysam_to_dataframe",
    "sam_to_dataframe",
]
diff --git a/src/optitype/io/data.py b/src/optitype/io/data.py
new file mode 100644
index 0000000..9456e49
--- /dev/null
+++ b/src/optitype/io/data.py
@@ -0,0 +1,114 @@
"""Reference data loading utilities."""

import os
import sys
from pathlib import Path

import pandas as pd


def get_data_path() -> Path:
    """
    Get the path to OptiType reference data.

    The data can be located in (checked in this order):
    1. Custom path via OPTITYPE_DATA environment variable
    2. Package installation directory (share/optitype/data)
    3. Development directory (data/)

    Returns:
        Path to the data directory.

    Raises:
        FileNotFoundError: If no valid data directory is found.
    """
    # Check environment variable first
    # NOTE(review): an OPTITYPE_DATA pointing at a nonexistent directory is
    # silently ignored and the search falls through — confirm this is wanted.
    if env_path := os.environ.get("OPTITYPE_DATA"):
        data_path = Path(env_path)
        if data_path.exists():
            return data_path

    # Check for installed package data
    # When installed via pip, data goes to share/optitype/data
    if sys.prefix != sys.base_prefix:  # In a virtual environment
        share_path = Path(sys.prefix) / "share" / "optitype" / "data"
        if share_path.exists():
            return share_path

    # Also check site-packages share location
    # (assumes the conventional <prefix>/lib/.../site-packages layout, so the
    # parent of each sys.path entry may contain a share/ directory)
    for site_packages in sys.path:
        share_path = Path(site_packages).parent / "share" / "optitype" / "data"
        if share_path.exists():
            return share_path

    # Development mode: check relative to package
    # (src/optitype/io/data.py -> repo root -> data/)
    dev_data_path = Path(__file__).parent.parent.parent.parent / "data"
    if dev_data_path.exists():
        return dev_data_path

    raise FileNotFoundError(
        "Could not find OptiType reference data. "
        "Set OPTITYPE_DATA environment variable to the data directory path, "
        "or ensure the package is properly installed."
    )


def load_reference_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load the allele reference data from Parquet files.

    Returns:
        Tuple of (table, features) DataFrames containing allele information.

    Raises:
        FileNotFoundError: If the data files are not found.
    """
    data_path = get_data_path()
    alleles_path = data_path / "alleles"

    if not alleles_path.exists():
        raise FileNotFoundError(
            f"Allele data directory not found at {alleles_path}. "
            "Ensure the data has been converted to Parquet format."
        )

    table_path = alleles_path / "table.parquet"
    features_path = alleles_path / "features.parquet"

    if not table_path.exists() or not features_path.exists():
        raise FileNotFoundError(
            f"Parquet files not found in {alleles_path}. "
            "Expected table.parquet and features.parquet."
        )

    table = pd.read_parquet(table_path)
    features = pd.read_parquet(features_path)

    return table, features


def get_reference_fasta(seq_type: str) -> Path:
    """
    Get the path to the HLA reference FASTA file.

    Args:
        seq_type: Either 'dna' or 'rna'.

    Returns:
        Path to the reference FASTA file.

    Raises:
        ValueError: If seq_type is not 'dna' or 'rna'.
        FileNotFoundError: If the reference file is not found.
    """
    if seq_type not in ("dna", "rna"):
        raise ValueError(f"seq_type must be 'dna' or 'rna', got {seq_type!r}")

    data_path = get_data_path()
    fasta_name = f"hla_reference_{seq_type}.fasta"
    fasta_path = data_path / fasta_name

    if not fasta_path.exists():
        raise FileNotFoundError(f"Reference FASTA not found: {fasta_path}")

    return fasta_path
diff --git a/src/optitype/io/readers.py b/src/optitype/io/readers.py
new file mode 100644
index 0000000..0a98e03
--- /dev/null
+++ b/src/optitype/io/readers.py
@@ -0,0 +1,198 @@
"""SAM/BAM file parsing utilities."""

import logging
import re
import sys
from collections import OrderedDict
from functools import lru_cache

import numpy as np
import pandas as pd

from optitype._logging import elapsed

# pysam is optional; sam_to_dataframe below is the pure-Python fallback.
try:
    import pysam
    PYSAM_AVAILABLE = True
except ImportError:
    PYSAM_AVAILABLE = False

logger = logging.getLogger(__name__)

# CIGAR pattern for calculating read span on reference
# (only M and D operations consume reference bases here)
CIGAR_SLICER = re.compile(r"[0-9]+[MD]")


@lru_cache(maxsize=None)
def _length_on_reference(cigar_string: str) -> int:
    """Calculate the length a read spans on the reference from its CIGAR string."""
    # Cached: identical CIGAR strings recur for reads of the same length.
    return sum(int(p[:-1]) for p in CIGAR_SLICER.findall(cigar_string))


def sam_to_dataframe(samfile: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Parse a SAM file into position and read detail DataFrames.

    This is a fallback parser when pysam is not available.

    Args:
        samfile: Path to the SAM file.

    Returns:
        Tuple of (matrix_pos, read_details) DataFrames.
+ """ + logger.debug("%s Loading alleles and read IDs from %s...", elapsed(), samfile) + + read_ids, allele_ids = [], [] + first_hit_row = True + total_hits = 0 + nm_index = None + + with open(samfile, "rb") as f: + last_read_id = None + for line in f: + line = line.decode("utf-8") + if line.startswith("@"): + if line.startswith("@SQ"): + allele_ids.append(line.split("\t")[1][3:]) # SN:HLA:HLA00001 + continue + + total_hits += 1 + read_id = line.split("\t")[0] + if last_read_id != read_id: + read_ids.append(read_id) + last_read_id = read_id + + if first_hit_row: + first_hit_row = False + columns = line.split() + try: + nm_index = [x.startswith("NM:") for x in columns].index(True) + except ValueError: + logger.warning("No NM-tag found in SAM file!") + nm_index = None + + logger.debug("%s %d alleles and %d reads found.", elapsed(), len(allele_ids), len(read_ids)) + logger.debug("%s Initializing mapping matrix...", elapsed()) + + matrix_pos = pd.DataFrame( + np.zeros((len(read_ids), len(allele_ids)), dtype=np.uint16), + columns=allele_ids, + index=read_ids, + ) + + read_details = OrderedDict() + + logger.debug( + "%s %dx%d mapping matrix initialized. Populating %d hits from SAM file...", + elapsed(), len(read_ids), len(allele_ids), total_hits, + ) + + milestones = [x * total_hits // 10 for x in range(1, 11)] + + with open(samfile, "rb") as f: + counter = 0 + percent = 0 + for line in f: + line = line.decode("utf-8") + if line.startswith("@"): + continue + + fields = line.strip().split("\t") + read_id, allele_id, position, cigar = fields[0], fields[2], fields[3], fields[5] + nm = fields[nm_index] if nm_index is not None else "NM:i:0" + + if read_id not in read_details: + read_details[read_id] = (int(nm[5:]), _length_on_reference(cigar)) + + matrix_pos.at[read_id, allele_id] = int(position) + + counter += 1 + if counter in milestones: + percent += 10 + logger.debug("\t%d%% completed", percent) + + logger.debug( + "%s %d elements filled. 
Matrix sparsity: 1 in %.2f", + elapsed(), counter, matrix_pos.shape[0] * matrix_pos.shape[1] / float(counter), + ) + + matrix_pos.rename(columns=lambda x: x.replace("HLA:", ""), inplace=True) + + details_df = pd.DataFrame.from_dict(read_details, orient="index") + details_df.columns = ["mismatches", "read_length"] + + return matrix_pos, details_df + + +def pysam_to_dataframe(samfile: str) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Parse a SAM/BAM file into position and read detail DataFrames using pysam. + + Args: + samfile: Path to the SAM or BAM file. + + Returns: + Tuple of (matrix_pos, read_details) DataFrames. + """ + if not PYSAM_AVAILABLE: + logger.warning("PySam not available. Falling back to primitive SAM parsing.") + return sam_to_dataframe(samfile) + + sam_or_bam = "rb" if samfile.endswith(".bam") else "r" + sam = pysam.AlignmentFile(samfile, sam_or_bam) + + # Check if yara produced the file (for XA tag handling) + is_yara = sam.header.get("PG", [{}])[0].get("ID", "").lower() == "yara" + xa_tag = is_yara and (" -os " not in sam.header.get("PG", [{}])[0].get("CL", "")) + + nref = sam.nreferences + hits = OrderedDict() + allele_id_to_index = {aa: ii for ii, aa in enumerate(sam.references)} + read_details = OrderedDict() + + logger.debug( + "%s Loading %s started. 
Number of HLA reads loaded (updated every thousand):", + elapsed(), samfile, + ) + + read_counter = 0 + hit_counter = 0 + + for aln in sam: + if aln.qname not in hits: + hits[aln.qname] = np.zeros(nref, dtype=np.uint16) + read_details[aln.qname] = (aln.get_tag("NM"), aln.query_length) + read_counter += 1 + + if logger.isEnabledFor(logging.DEBUG) and not (read_counter % 1000): + sys.stdout.write(f"{len(hits) // 1000}K...") + sys.stdout.flush() + + if xa_tag and aln.has_tag("XA"): + current_row = hits[aln.qname] + subtags = aln.get_tag("XA").split(";")[:-1] + hit_counter += len(subtags) + for subtag in subtags: + allele, pos = subtag.split(",")[:2] + current_row[allele_id_to_index[allele]] = int(pos) + + hits[aln.qname][aln.reference_id] = aln.reference_start + 1 + hit_counter += 1 + + logger.debug("\n %s %d reads loaded. Creating dataframe...", elapsed(), len(hits)) + + pos_df = pd.DataFrame.from_dict(hits, orient="index") + pos_df.columns = sam.references[:] + + details_df = pd.DataFrame.from_dict(read_details, orient="index") + details_df.columns = ["mismatches", "read_length"] + + logger.debug( + "%s Dataframes created. Shape: %d x %d, hits: %s (%d), sparsity: 1 in %.2f", + elapsed(), pos_df.shape[0], pos_df.shape[1], + np.sign(pos_df).sum().sum(), hit_counter, + pos_df.shape[0] * pos_df.shape[1] / float(hit_counter), + ) + + return pos_df, details_df diff --git a/src/optitype/model.py b/src/optitype/model.py new file mode 100644 index 0000000..4c52c21 --- /dev/null +++ b/src/optitype/model.py @@ -0,0 +1,387 @@ +""" +OptiType ILP model for HLA typing. + +This module implements the integer linear programming model for optimal +HLA allele selection using Pyomo. 
+""" + +import itertools +import logging +from collections import defaultdict + +import pandas as pd +from pyomo.environ import ( + Binary, + ConcreteModel, + Constraint, + ConstraintList, + Objective, + Param, + Set, + Var, + maximize, +) +from pyomo.common.errors import ApplicationError +from pyomo.opt import SolverFactory, TerminationCondition + +logger = logging.getLogger(__name__) + + +class OptiType: + """ + OptiType ILP model for HLA genotyping. + + Uses integer linear programming to select the optimal combination + of HLA alleles that best explain the observed read mappings. + """ + + def __init__( + self, + cov: dict, + occ: dict, + groups_4digit: dict, + allele_table: pd.DataFrame, + beta: float, + t_max_allele: int = 2, + solver: str = "glpk", + threads: int = 1, + verbosity: bool = False, + ): + """ + Initialize the OptiType model. + + Args: + cov: Coverage dictionary mapping (read, allele) to 1 for hits. + occ: Occurrence dictionary mapping read patterns to counts. + groups_4digit: Dictionary mapping 4-digit types to allele lists. + allele_table: DataFrame with allele information. + beta: Homozygosity detection parameter (0.0 to 0.1). + t_max_allele: Maximum alleles per locus (default 2). + solver: ILP solver to use ('glpk', 'cplex', 'cbc'). + threads: Number of solver threads. + verbosity: Whether to print verbose output. 
+ """ + self._allele_table = allele_table + self._beta = float(beta) + self._t_max_allele = t_max_allele + self._solver = SolverFactory(solver) + self._threads = threads + self._opts = {"threads": threads} if threads > 1 else {} + self._verbosity = verbosity + self._changed = True + self._ks = 1 + self._groups_4digit = groups_4digit + + # Group alleles by locus + loci_alleles = defaultdict(list) + for type_4digit, group_alleles in groups_4digit.items(): + loci_alleles[type_4digit.split("*")[0]].extend(group_alleles) + + self._allele_to_4digit = { + allele: type_4digit + for type_4digit, group in groups_4digit.items() + for allele in group + } + + # Build the ILP model + model = ConcreteModel() + + # Initialize Sets + model.LociNames = Set(initialize=list(loci_alleles.keys())) + model.Loci = Set(model.LociNames, initialize=lambda m, l: loci_alleles[l]) + + L = list(itertools.chain(*list(loci_alleles.values()))) + reconst = {allele_id: 0.01 for allele_id in L if "_" in allele_id} + R = set(r for (r, _) in cov.keys()) + model.L = Set(initialize=L) + model.R = Set(initialize=R) + + # Initialize Parameters + model.cov = Param(model.R, model.L, initialize=lambda model, r, a: cov.get((r, a), 0)) + model.reconst = Param(model.L, initialize=lambda model, a: reconst.get(a, 0)) + model.occ = Param(model.R, initialize=occ) + model.t_allele = Param(initialize=self._t_max_allele, mutable=True) + model.beta = Param( + initialize=self._beta, + validate=lambda val, model: 0.0 <= float(self._beta) <= 0.999, + mutable=True, + ) + model.nof_loci = Param(initialize=len(loci_alleles)) + + # Initialize Variables + model.x = Var(model.L, domain=Binary) + model.y = Var(model.R, domain=Binary) + model.re = Var(model.R, bounds=(0.0, None)) + model.hetero = Var(bounds=(0.0, model.nof_loci)) + + # Objective function + model.read_cov = Objective( + rule=lambda model: sum( + model.occ[r] * (model.y[r] - model.beta * model.re[r]) for r in model.R + ) + - sum(model.reconst[a] * model.x[a] for a in 
model.L), + sense=maximize, + ) + + # Constraints + model.max_allel_selection = Constraint( + model.LociNames, + rule=lambda model, l: sum(model.x[a] for a in model.Loci[l]) <= model.t_allele, + ) + model.min_allel_selection = Constraint( + model.LociNames, + rule=lambda model, l: sum(model.x[a] for a in model.Loci[l]) >= 1, + ) + model.is_read_cov = Constraint( + model.R, + rule=lambda model, r: sum(model.cov[r, a] * model.x[a] for a in model.L) + >= model.y[r], + ) + model.heterozygot_count = Constraint( + rule=lambda model: model.hetero >= sum(model.x[a] for a in model.L) - model.nof_loci + ) + + # Regularization constraints + model.reg1 = Constraint( + model.R, rule=lambda model, r: model.re[r] <= model.nof_loci * model.y[r] + ) + model.reg2 = Constraint(model.R, rule=lambda model, r: model.re[r] <= model.hetero) + model.reg3 = Constraint( + model.R, + rule=lambda model, r: model.re[r] + >= model.hetero - model.nof_loci * (1 - model.y[r]), + ) + + # Constraint list for solution enumeration + model.c = ConstraintList() + + self._instance = model + + def _append_result(self, selected: list, result_dict: dict, instance) -> None: + """Append a solution's allele assignments and scores to the result dict.""" + loci = [self._allele_to_4digit[x].split("*")[0] for x in selected] + locus_count = dict.fromkeys(loci, 1) + + for i, locus in enumerate(loci): + if loci.count(locus) < 2: + result_dict[locus + "1"].append(selected[i]) + result_dict[locus + "2"].append(selected[i]) + else: + result_dict[locus + str(locus_count[locus])].append(selected[i]) + locus_count[locus] += 1 + + nof_reads = sum(instance.occ[j] * instance.y[j].value for j in instance.y) + result_dict["obj"].append(instance.read_cov()) + result_dict["nof_reads"].append(nof_reads) + + def set_beta(self, beta: float) -> None: + """Set the beta parameter for homozygosity detection.""" + self._changed = True + self._instance.beta.set_value(float(beta)) + + def set_t_max_allele(self, t_max_allele: int) -> None: + 
"""Set the upper bound of alleles selected per locus.""" + self._changed = True + self._instance.t_allele.set_value(t_max_allele) + + def solve(self, ks: int) -> pd.DataFrame: + """ + Solve the ILP model and enumerate top k solutions. + + Args: + ks: Number of solutions to enumerate. + + Returns: + DataFrame with typing results. + """ + d = defaultdict(list) + + if self._changed or self._ks != ks: + self._ks = ks + + for k in range(ks): + expr = 0 + + try: + res = self._solver.solve( + self._instance, options=self._opts, tee=self._verbosity + ) + except (RuntimeError, ApplicationError): + logger.warning( + "Solver does not support multi-threading. " + "Falling back to single-threading." + ) + res = self._solver.solve(self._instance, options={}, tee=self._verbosity) + + self._instance.solutions.load_from(res) + + if res.solver.termination_condition != TerminationCondition.optimal: + logger.error("Optimal solution hasn't been obtained. This is a terminal problem.") + break + + selected = [] + indices = [] + encountered_4digit = [] + + for j in self._instance.x: + if self._allele_to_4digit[j][0] in "HJG": + if 0.99 <= self._instance.x[j].value <= 1.01: + selected.append(j) + indices.append(j) + continue + + if 0.99 <= self._instance.x[j].value <= 1.01: + selected.append(j) + exp_i = 0 + exp_i += self._instance.x[j] + + if self._allele_to_4digit[j] in encountered_4digit: + continue + + encountered_4digit.append(self._allele_to_4digit[j]) + for i_allele in self._groups_4digit[self._allele_to_4digit[j]]: + if self._instance.x[i_allele].value <= 0: + exp_i += self._instance.x[i_allele] + indices.append(i_allele) + expr += 1.0 - exp_i + + zero_indices = set(j for j in self._instance.x).difference(set(indices)) + for j in zero_indices: + expr += self._instance.x[j] + + self._instance.c.add(expr >= 1) + + self._append_result(selected, d, self._instance) + + self._instance.c.clear() + self._changed = False + self._enumeration = pd.DataFrame(d) + + return self._enumeration + 
else: + return self._enumeration + + def solve_for_k_alleles(self, k: int, ks: int = 1) -> pd.DataFrame: + """ + Solve with exactly k alleles (experimental). + + Args: + k: Exact number of alleles to select. + ks: Number of solutions to enumerate. + + Returns: + DataFrame with typing results. + """ + if k < int(self._instance.nof_loci.value) or k > int( + self._instance.nof_loci.value * self._t_max_allele + ): + raise ValueError( + f"k {k} is out of range [{self._instance.nof_loci.value}, " + f"{self._instance.nof_loci.value * self._t_max_allele}]" + ) + + inst = self._instance.clone() + inst.beta.set_value(0.0) + + inst.del_component("heterozygot_count") + inst.del_component("reg1") + inst.del_component("reg2") + inst.del_component("reg3") + + expr1 = sum(inst.x[j] for j in inst.x) + inst.c.add(expr1 == k) + + d = defaultdict(list) + + for _ in range(ks): + try: + res = self._solver.solve(inst, options=self._opts, tee=self._verbosity) + except (RuntimeError, ApplicationError): + logger.warning( + "Solver does not support multi-threading. " + "Falling back to single-threading." 
+ ) + res = self._solver.solve(inst, options={}, tee=self._verbosity) + + inst.solutions.load_from(res) + + if self._verbosity: + res.write(num=1) + + if res.solver.termination_condition != TerminationCondition.optimal: + logger.error("Optimal solution hasn't been obtained.") + break + + selected = [] + expr = 0 + indices = [] + encountered_4digit = [] + + for j in inst.x: + if 0.99 <= inst.x[j].value <= 1.01: + exp_i = 0 + selected.append(j) + exp_i += inst.x[j] + + if self._allele_to_4digit[j] in encountered_4digit: + continue + + encountered_4digit.append(self._allele_to_4digit[j]) + for i_allele in self._groups_4digit[self._allele_to_4digit[j]]: + if inst.x[i_allele].value <= 0: + exp_i += inst.x[i_allele] + indices.append(i_allele) + expr += 1 - exp_i + + zero_indices = set(j for j in inst.x).difference(set(indices)) + for j in zero_indices: + expr += inst.x[j] + + inst.c.add(expr >= 1) + + logger.debug("Selected alleles: %s", selected) + + self._append_result(selected, d, inst) + + return pd.DataFrame(d) + + def solve_enforced_zygosity(self, gosity_dict: dict) -> pd.DataFrame: + """ + Solve with enforced homo/heterozygosity per locus (experimental). + + Args: + gosity_dict: Dictionary mapping locus to number of alleles. + + Returns: + DataFrame with typing results. + """ + inst = self._instance.clone() + inst.beta.set_value(0.0) + + inst.del_component("heterozygot_count") + inst.del_component("reg1") + inst.del_component("reg2") + inst.del_component("reg3") + inst.del_component("max_allel_selection") + + for locus in inst.LociNames: + cons = sum(inst.x[a] for a in inst.Loci[locus]) + inst.c.add(cons <= gosity_dict.get(locus, 2)) + + d = defaultdict(list) + + try: + res = self._solver.solve(inst, options=self._opts, tee=self._verbosity) + except (RuntimeError, ApplicationError): + logger.warning( + "Solver does not support multi-threading. " + "Falling back to single-threading." 
+ ) + res = self._solver.solve(inst, options={}, tee=self._verbosity) + + inst.solutions.load_from(res) + + selected = [al for al in inst.x if 0.99 <= inst.x[al].value <= 1.01] + self._append_result(selected, d, inst) + + return pd.DataFrame(d) diff --git a/src/optitype/pipeline.py b/src/optitype/pipeline.py new file mode 100644 index 0000000..ad6d418 --- /dev/null +++ b/src/optitype/pipeline.py @@ -0,0 +1,381 @@ +""" +Core pipeline orchestration for OptiType. + +This module contains the main HLA typing pipeline logic, coordinating +read mapping, matrix construction, and ILP solving. +""" + +import datetime +import logging +import multiprocessing +import os +import shutil +import subprocess +import time +from collections import defaultdict +from configparser import ConfigParser +from dataclasses import dataclass +from pathlib import Path + +import numpy as np +import pandas as pd + +from optitype import hlatyper as ht +from optitype._logging import configure_logging, elapsed +from optitype.io.data import get_reference_fasta, load_reference_data +from optitype.io.readers import PYSAM_AVAILABLE, pysam_to_dataframe +from optitype.model import OptiType + +logger = logging.getLogger(__name__) + + +# Frequent alleles list for filtering +FREQ_ALLELES = """A*01:01 A*01:02 A*01:03 A*01:06 A*01:09 A*01:23 A*01:38 A*01:44 A*02:01 A*02:02 A*02:03 A*02:04 A*02:05 A*02:06 A*02:07 A*02:08 A*02:09 A*02:10 A*02:11 A*02:12 A*02:13 A*02:133 A*02:14 A*02:141 A*02:146 A*02:15N A*02:16 A*02:17 A*02:18 A*02:19 A*02:20 A*02:21 A*02:22 A*02:226N A*02:24 A*02:25 A*02:26 A*02:27 A*02:28 A*02:29 A*02:30 A*02:33 A*02:34 A*02:35 A*02:36 A*02:37 A*02:38 A*02:40 A*02:42 A*02:44 A*02:45 A*02:46 A*02:48 A*02:49 A*02:51 A*02:53N A*02:54 A*02:55 A*02:57 A*02:58 A*02:60 A*02:64 A*02:67 A*02:74 A*02:85 A*02:90 A*02:93 A*03:01 A*03:02 A*03:05 A*03:07 A*03:08 A*03:10 A*03:12 A*03:22 A*03:25 A*03:65 A*03:69N A*03:97 A*11:01 A*11:02 A*11:03 A*11:04 A*11:05 A*11:06 A*11:08 A*11:10 A*11:12 A*11:13 A*11:18 
A*11:19 A*11:20 A*11:29 A*11:40 A*23:01 A*23:02 A*23:03 A*23:04 A*23:05 A*23:09 A*24:02 A*24:03 A*24:04 A*24:05 A*24:06 A*24:07 A*24:08 A*24:09N A*24:10 A*24:13 A*24:14 A*24:15 A*24:17 A*24:18 A*24:20 A*24:21 A*24:22 A*24:23 A*24:24 A*24:25 A*24:26 A*24:27 A*24:28 A*24:29 A*24:31 A*24:35 A*24:46 A*24:51 A*24:56 A*24:63 A*24:93 A*25:01 A*25:02 A*25:04 A*26:01 A*26:02 A*26:03 A*26:04 A*26:05 A*26:06 A*26:07 A*26:08 A*26:09 A*26:10 A*26:11N A*26:12 A*26:14 A*26:15 A*26:16 A*26:17 A*26:18 A*26:20 A*26:49 A*29:01 A*29:02 A*29:03 A*29:04 A*29:10 A*29:12 A*30:01 A*30:02 A*30:03 A*30:04 A*30:06 A*30:08 A*30:09 A*30:10 A*30:11 A*30:12 A*30:16 A*31:01 A*31:02 A*31:03 A*31:04 A*31:05 A*31:06 A*31:08 A*31:09 A*31:12 A*32:01 A*32:02 A*32:03 A*32:04 A*32:05 A*32:06 A*32:08 A*32:13 A*32:20 A*32:22 A*33:01 A*33:03 A*33:04 A*33:05 A*33:10 A*33:26 A*34:01 A*34:02 A*34:03 A*34:05 A*36:01 A*36:03 A*43:01 A*66:01 A*66:02 A*66:03 A*68:01 A*68:02 A*68:03 A*68:04 A*68:05 A*68:06 A*68:07 A*68:08 A*68:12 A*68:13 A*68:15 A*68:16 A*68:17 A*68:18N A*68:23 A*68:24 A*68:38 A*69:01 A*74:01 A*74:02 A*74:03 A*74:04 A*74:06 A*74:09 A*74:11 A*80:01 A*80:02 B*07:02 B*07:03 B*07:04 B*07:05 B*07:06 B*07:07 B*07:08 B*07:09 B*07:10 B*07:12 B*07:13 B*07:14 B*07:15 B*07:17 B*07:20 B*07:22 B*07:26 B*07:33 B*07:36 B*07:47 B*07:53 B*07:85 B*08:01 B*08:02 B*08:03 B*08:04 B*08:05 B*08:09 B*08:12 B*08:18 B*08:23 B*13:01 B*13:02 B*13:03 B*13:04 B*13:07N B*13:09 B*13:11 B*13:13 B*14:01 B*14:02 B*14:03 B*14:04 B*14:05 B*14:06 B*15:01 B*15:02 B*15:03 B*15:04 B*15:05 B*15:06 B*15:07 B*15:08 B*15:09 B*15:10 B*15:108 B*15:11 B*15:12 B*15:123 B*15:125 B*15:13 B*15:135 B*15:15 B*15:153 B*15:16 B*15:17 B*15:18 B*15:20 B*15:21 B*15:23 B*15:24 B*15:25 B*15:27 B*15:28 B*15:29 B*15:30 B*15:31 B*15:32 B*15:33 B*15:34 B*15:35 B*15:36 B*15:37 B*15:38 B*15:39 B*15:40 B*15:42 B*15:45 B*15:46 B*15:47 B*15:48 B*15:50 B*15:52 B*15:53 B*15:54 B*15:55 B*15:56 B*15:58 B*15:61 B*15:63 B*15:67 B*15:68 B*15:70 B*15:71 B*15:73 B*15:82 
B*15:86 B*18:01 B*18:02 B*18:03 B*18:04 B*18:05 B*18:06 B*18:07 B*18:08 B*18:09 B*18:11 B*18:13 B*18:14 B*18:18 B*18:19 B*18:20 B*18:28 B*18:33 B*27:01 B*27:02 B*27:03 B*27:04 B*27:05 B*27:06 B*27:07 B*27:08 B*27:09 B*27:10 B*27:11 B*27:12 B*27:13 B*27:14 B*27:19 B*27:20 B*27:21 B*27:30 B*27:39 B*35:01 B*35:02 B*35:03 B*35:04 B*35:05 B*35:06 B*35:08 B*35:09 B*35:10 B*35:11 B*35:12 B*35:13 B*35:14 B*35:15 B*35:16 B*35:17 B*35:18 B*35:19 B*35:20 B*35:21 B*35:22 B*35:23 B*35:24 B*35:25 B*35:27 B*35:28 B*35:29 B*35:30 B*35:31 B*35:32 B*35:33 B*35:34 B*35:36 B*35:43 B*35:46 B*35:51 B*35:77 B*35:89 B*37:01 B*37:02 B*37:04 B*37:05 B*38:01 B*38:02 B*38:04 B*38:05 B*38:06 B*38:15 B*39:01 B*39:02 B*39:03 B*39:04 B*39:05 B*39:06 B*39:07 B*39:08 B*39:09 B*39:10 B*39:11 B*39:12 B*39:13 B*39:14 B*39:15 B*39:23 B*39:24 B*39:31 B*39:34 B*40:01 B*40:02 B*40:03 B*40:04 B*40:05 B*40:06 B*40:07 B*40:08 B*40:09 B*40:10 B*40:11 B*40:12 B*40:14 B*40:15 B*40:16 B*40:18 B*40:19 B*40:20 B*40:21 B*40:23 B*40:27 B*40:31 B*40:35 B*40:36 B*40:37 B*40:38 B*40:39 B*40:40 B*40:42 B*40:44 B*40:49 B*40:50 B*40:52 B*40:64 B*40:80 B*41:01 B*41:02 B*41:03 B*42:01 B*42:02 B*44:02 B*44:03 B*44:04 B*44:05 B*44:06 B*44:07 B*44:08 B*44:09 B*44:10 B*44:12 B*44:13 B*44:15 B*44:18 B*44:20 B*44:21 B*44:22 B*44:26 B*44:27 B*44:29 B*44:31 B*44:59 B*45:01 B*45:02 B*45:04 B*45:06 B*46:01 B*46:02 B*46:13 B*47:01 B*47:02 B*47:03 B*48:01 B*48:02 B*48:03 B*48:04 B*48:05 B*48:06 B*48:07 B*48:08 B*49:01 B*49:02 B*49:03 B*50:01 B*50:02 B*50:04 B*50:05 B*51:01 B*51:02 B*51:03 B*51:04 B*51:05 B*51:06 B*51:07 B*51:08 B*51:09 B*51:10 B*51:12 B*51:13 B*51:14 B*51:15 B*51:18 B*51:21 B*51:22 B*51:27N B*51:29 B*51:31 B*51:32 B*51:33 B*51:34 B*51:36 B*51:37 B*51:63 B*51:65 B*52:01 B*52:02 B*52:06 B*53:01 B*53:02 B*53:03 B*53:04 B*53:05 B*53:07 B*53:08 B*54:01 B*54:02 B*55:01 B*55:02 B*55:03 B*55:04 B*55:07 B*55:08 B*55:10 B*55:12 B*55:16 B*55:46 B*56:01 B*56:02 B*56:03 B*56:04 B*56:05 B*56:06 B*56:07 B*56:09 B*56:11 B*57:01 
B*57:02 B*57:03 B*57:04 B*57:05 B*57:06 B*57:10 B*58:01 B*58:02 B*58:06 B*59:01 B*67:01 B*67:02 B*73:01 B*78:01 B*78:02 B*78:03 B*78:05 B*81:01 B*81:02 B*82:01 B*82:02 B*83:01 C*01:02 C*01:03 C*01:04 C*01:05 C*01:06 C*01:08 C*01:14 C*01:17 C*01:30 C*01:32 C*02:02 C*02:03 C*02:04 C*02:06 C*02:08 C*02:09 C*02:10 C*02:19 C*02:20 C*02:27 C*03:02 C*03:03 C*03:04 C*03:05 C*03:06 C*03:07 C*03:08 C*03:09 C*03:10 C*03:13 C*03:14 C*03:15 C*03:16 C*03:17 C*03:19 C*03:21 C*03:32 C*03:42 C*03:43 C*03:56 C*03:67 C*03:81 C*04:01 C*04:03 C*04:04 C*04:05 C*04:06 C*04:07 C*04:10 C*04:11 C*04:14 C*04:15 C*04:24 C*04:29 C*04:33 C*04:37 C*05:01 C*05:03 C*05:04 C*05:07N C*05:09 C*06:02 C*06:03 C*06:04 C*06:06 C*06:08 C*06:09 C*06:17 C*06:24 C*06:53 C*07:01 C*07:02 C*07:03 C*07:04 C*07:05 C*07:06 C*07:07 C*07:08 C*07:09 C*07:10 C*07:109 C*07:123 C*07:13 C*07:14 C*07:17 C*07:18 C*07:19 C*07:21 C*07:22 C*07:27 C*07:29 C*07:32N C*07:37 C*07:43 C*07:46 C*07:49 C*07:56 C*07:66 C*07:67 C*07:68 C*07:80 C*07:95 C*08:01 C*08:02 C*08:03 C*08:04 C*08:05 C*08:06 C*08:13 C*08:15 C*08:20 C*08:21 C*08:27 C*12:02 C*12:03 C*12:04 C*12:05 C*12:07 C*12:12 C*12:15 C*12:16 C*14:02 C*14:03 C*14:04 C*15:02 C*15:03 C*15:04 C*15:05 C*15:06 C*15:07 C*15:08 C*15:09 C*15:11 C*15:12 C*15:13 C*15:17 C*16:01 C*16:02 C*16:04 C*16:08 C*16:09 C*17:01 C*17:02 C*17:03 C*17:04 C*18:01 C*18:02 A*30:07 B*15:64 B*18:12""".split() + + +@dataclass +class PipelineConfig: + """Configuration for the OptiType pipeline.""" + + razers3_path: str = "razers3" + mapping_threads: int = 4 + solver: str = "glpk" + ilp_threads: int = 1 + delete_bam: bool = True + unpaired_weight: float = 0.0 + use_discordant: bool = False + + +@dataclass +class PipelineResult: + """Results from the OptiType pipeline.""" + + result_df: pd.DataFrame + result_4digit: pd.DataFrame + output_csv: str + output_plot: str + + +def get_num_threads(configured_threads: int) -> int: + """Get the number of threads to use, capped by available CPUs.""" + try: + cpu_count = 
multiprocessing.cpu_count() + except (ImportError, NotImplementedError): + return 2 + + return min(cpu_count, configured_threads) + + +def _is_frequent(allele_id: str, table: pd.DataFrame) -> bool: + """Check if an allele is in the frequent alleles list.""" + allele_id = allele_id.split("_")[0] + return ( + table.loc[allele_id]["4digit"] in FREQ_ALLELES + and table.loc[allele_id]["flags"] == 0 + ) or (table.loc[allele_id]["locus"] in "HGJ") + + +def _get_4digit(allele_id: str, table: pd.DataFrame) -> str: + """Get the 4-digit type for an allele.""" + allele_id = allele_id.split("_")[0] + return table.loc[allele_id]["4digit"] + + +def _get_types(allele_id: str | float, table: pd.DataFrame) -> str | float: + """Get the 4-digit type, handling non-string inputs.""" + if not isinstance(allele_id, str): + return allele_id + aa = allele_id.split("_") + return table.loc[aa[0]]["4digit"] + + +def load_config(config_path: str | Path | None = None) -> PipelineConfig: + """ + Load configuration from an INI file. + + Args: + config_path: Path to config file. If None, uses environment or defaults. + + Returns: + PipelineConfig with loaded settings. 
+ """ + config = ConfigParser(os.environ) + + if config_path and Path(config_path).exists(): + config.read(config_path) + + return PipelineConfig( + razers3_path=config.get("mapping", "razers3", fallback="razers3"), + mapping_threads=config.getint("mapping", "threads", fallback=4), + solver=config.get("ilp", "solver", fallback="glpk"), + ilp_threads=config.getint("ilp", "threads", fallback=1), + delete_bam=config.getboolean("behavior", "deletebam", fallback=True), + unpaired_weight=config.getfloat("behavior", "unpaired_weight", fallback=0.0), + use_discordant=config.getboolean("behavior", "use_discordant", fallback=False), + ) + + +def run_pipeline( + input_files: list[str], + seq_type: str, + output_dir: str, + prefix: str | None = None, + beta: float = 0.009, + enumerate_count: int = 1, + config: PipelineConfig | None = None, + verbose: bool = False, +) -> PipelineResult: + """ + Run the OptiType HLA typing pipeline. + + Args: + input_files: List of input FASTQ or BAM files (1 or 2 files). + seq_type: Sequencing type ('dna' or 'rna'). + output_dir: Output directory path. + prefix: Output file prefix (defaults to timestamp). + beta: Homozygosity detection parameter (0.0 to 0.1). + enumerate_count: Number of solutions to enumerate. + config: Pipeline configuration. + verbose: Enable verbose output. + + Returns: + PipelineResult with typing results. 
+ """ + if config is None: + config = PipelineConfig() + + # Set verbosity + configure_logging(verbose) + + # Validate inputs + if len(input_files) not in (1, 2): + raise ValueError("Number of input files must be 1 (single-end) or 2 (paired-end)") + + if seq_type not in ("dna", "rna"): + raise ValueError("seq_type must be 'dna' or 'rna'") + + if not 0.0 <= beta < 0.1: + raise ValueError("beta must be between 0.0 and 0.1") + + if enumerate_count <= 0: + raise ValueError("enumerate_count must be > 0") + + # Check input file extensions + input_extension = input_files[0].split(".")[-1].lower() + bam_input = input_extension in ("sam", "bam") + + # Create output directory + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Set up paths + if prefix is None: + prefix = datetime.datetime.fromtimestamp(time.time()).strftime("%Y_%m_%d_%H_%M_%S") + out_dir = os.path.join(output_dir, prefix) + else: + out_dir = output_dir + + Path(out_dir).mkdir(parents=True, exist_ok=True) + + extension = "bam" if PYSAM_AVAILABLE else "sam" + bam_paths = ( + input_files + if bam_input + else [os.path.join(out_dir, f"{prefix}_{i + 1}.{extension}") for i in range(len(input_files))] + ) + + ref_type = "nuc" if seq_type == "rna" else "gen" + is_paired = len(input_files) > 1 + + out_csv = os.path.join(out_dir, f"{prefix}_result.tsv") + out_plot = os.path.join(out_dir, f"{prefix}_coverage_plot.pdf") + + # Run mapping if needed + if not bam_input: + if not shutil.which(config.razers3_path): + raise FileNotFoundError( + f"RazerS3 not found: {config.razers3_path!r}. 
" + "Install with: conda install -c bioconda razers3" + ) + + threads = get_num_threads(config.mapping_threads) + logger.debug("Mapping with %d threads...", threads) + + mapping_ref = get_reference_fasta(seq_type) + for sample, outbam in zip(input_files, bam_paths): + logger.debug("%s Mapping %s to %s reference...", elapsed(), os.path.basename(sample), ref_type.upper()) + + cmd = [ + config.razers3_path, + "-i", "97", + "-m", "99999", + "--distance-range", "0", + "-pa", + "-tc", str(threads), + "-o", str(outbam), + str(mapping_ref), + str(sample), + ] + subprocess.run(cmd, check=True) + + # Load reference data + table, features = load_reference_data() + + logger.debug("%s Generating binary hit matrix.", elapsed()) + + # Process alignments + if is_paired: + pos, read_details = pysam_to_dataframe(bam_paths[0]) + binary1 = np.sign(pos) + + pos2, read_details2 = pysam_to_dataframe(bam_paths[1]) + binary2 = np.sign(pos2) + + if not bam_input and config.delete_bam: + os.remove(bam_paths[0]) + os.remove(bam_paths[1]) + + id1 = set(binary1.index) + id2 = set(binary2.index) + + # Handle read ID suffixes + if len(set(r[-1] for r in id1)) == 1 and len(set(r[-1] for r in id2)) == 1: + cut_last_char = lambda x: x[:-1] + binary1.index = list(map(cut_last_char, binary1.index)) + binary2.index = list(map(cut_last_char, binary2.index)) + pos.index = list(map(cut_last_char, pos.index)) + pos2.index = list(map(cut_last_char, pos2.index)) + read_details.index = list(map(cut_last_char, read_details.index)) + read_details2.index = list(map(cut_last_char, read_details2.index)) + + binary_p, binary_mis, binary_un = ht.create_paired_matrix(binary1, binary2) + + if binary_p.shape[0] < len(id1) * 0.1: + logger.warning( + "Less than 10%% of reads could be paired. 
Consider an appropriate " + "unpaired_weight setting in your config file (currently %.3f), " + "because you may need to resort to using unpaired reads.", + config.unpaired_weight, + ) + + if config.unpaired_weight > 0: + if config.use_discordant: + binary = pd.concat([binary_p, binary_un, binary_mis]) + else: + binary = pd.concat([binary_p, binary_un]) + else: + binary = binary_p + else: + pos, read_details = pysam_to_dataframe(bam_paths[0]) + + if not bam_input and config.delete_bam: + os.remove(bam_paths[0]) + + binary = np.sign(pos) + + # Filter to frequent alleles + alleles_to_keep = [col for col in binary.columns if _is_frequent(col, table)] + binary = binary[alleles_to_keep] + + logger.debug("%s Temporary pruning of identical rows and columns", elapsed()) + + unique_col, representing = ht.prune_identical_alleles(binary, report_groups=True) + representing_df = pd.DataFrame( + [[a1, a2] for a1, a_l in representing.items() for a2 in a_l], + columns=["representative", "represented"], + ) + + temp_pruned = ht.prune_identical_reads(unique_col) + + logger.debug("%s Size of mtx with unique rows and columns: %s", elapsed(), temp_pruned.shape) + logger.debug("%s Determining minimal set of non-overshadowed alleles", elapsed()) + + minimal_alleles = ht.prune_overshadowed_alleles(temp_pruned) + + logger.debug("%s Keeping only the minimal number of required alleles %s", elapsed(), minimal_alleles.shape) + + binary = binary[minimal_alleles] + + logger.debug("%s Creating compact model...", elapsed()) + + if is_paired and config.unpaired_weight > 0: + if config.use_discordant: + compact_mtx, compact_occ = ht.get_compact_model( + binary_p[minimal_alleles], + pd.concat([binary_un, binary_mis])[minimal_alleles], + weight=config.unpaired_weight, + ) + else: + compact_mtx, compact_occ = ht.get_compact_model( + binary_p[minimal_alleles], + binary_un[minimal_alleles], + weight=config.unpaired_weight, + ) + else: + compact_mtx, compact_occ = ht.get_compact_model(binary) + + 
allele_ids = binary.columns + + groups_4digit = defaultdict(list) + for allele in allele_ids: + type_4digit = _get_4digit(allele, table) + groups_4digit[type_4digit].append(allele) + + sparse_dict = ht.mtx_to_sparse_dict(compact_mtx) + threads = get_num_threads(config.ilp_threads) + + logger.debug("Starting ILP solver with %d threads...", threads) + logger.debug("%s Initializing OptiType model...", elapsed()) + + op = OptiType( + sparse_dict, + compact_occ, + groups_4digit, + table, + beta, + 2, + config.solver, + threads, + verbosity=verbose, + ) + result = op.solve(enumerate_count) + + logger.debug("%s Result dataframe has been constructed...", elapsed()) + + result_4digit = result.map(lambda x: _get_types(x, table)) + for col in ["A1", "A2", "B1", "B2", "C1", "C2"]: + if col not in result_4digit: + result_4digit[col] = None + + r = result_4digit[["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"]] + + # Write results + r.to_csv( + out_csv, + sep="\t", + columns=["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"], + header=["A1", "A2", "B1", "B2", "C1", "C2", "Reads", "Objective"], + ) + + # Generate coverage plot + hlatype = result.iloc[0].reindex(["A1", "A2", "B1", "B2", "C1", "C2"]).drop_duplicates().dropna() + features_used = ( + [("intron", 1), ("exon", 2), ("intron", 2), ("exon", 3), ("intron", 3)] + if seq_type == "dna" + else [("exon", 2), ("exon", 3)] + ) + + plot_variables = ( + [pos, read_details, pos2, read_details2, (binary_p, binary_un, binary_mis)] + if is_paired + else [pos, read_details] + ) + + coverage_mat = ht.calculate_coverage(plot_variables, features, hlatype, features_used) + ht.plot_coverage(out_plot, coverage_mat, table, features, features_used) + + return PipelineResult( + result_df=result, + result_4digit=r, + output_csv=out_csv, + output_plot=out_plot, + ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..f9849c8 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""OptiType test 
suite."""
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..158df44
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,48 @@
+"""Tests for CLI interface."""
+
+from click.testing import CliRunner
+
+from optitype.cli import main
+
+
+def test_cli_help():
+    """Test that help command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["--help"])
+    assert result.exit_code == 0
+    assert "OptiType" in result.output
+
+
+def test_cli_version():
+    """Test that version command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["--version"])
+    assert result.exit_code == 0
+    assert "1.5.0" in result.output
+
+
+def test_cli_run_help():
+    """Test that run subcommand help works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["run", "--help"])
+    assert result.exit_code == 0
+    assert "--input" in result.output
+    assert "--dna" in result.output
+    assert "--rna" in result.output
+
+
+def test_cli_check_deps():
+    """Test that check-deps command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["check-deps"])
+    # Exit code may vary based on installed dependencies
+    assert "RazerS3" in result.output
+    assert "GLPK" in result.output
+
+
+def test_cli_info():
+    """Test that info command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["info"])
+    assert result.exit_code == 0
+    assert "OptiType version" in result.output
diff --git a/tests/test_io.py b/tests/test_io.py
new file mode 100644
index 0000000..d1b1a77
--- /dev/null
+++ b/tests/test_io.py
@@ -0,0 +1,50 @@
+"""Tests for I/O utilities."""
+
+import pytest
+import pandas as pd
+
+from optitype.io.data import get_data_path, load_reference_data, get_reference_fasta
+
+
+def test_get_data_path():
+    """Test that data path can be found."""
+    data_path = get_data_path()
+    assert data_path.exists()
+    assert data_path.is_dir()
+
+
+def test_load_reference_data():
+    """Test loading reference data from Parquet files."""
+    table, features = load_reference_data()
+
+    assert isinstance(table, pd.DataFrame)
+    assert isinstance(features, pd.DataFrame)
+
+    # Check table structure
+    assert "id" in table.columns
+    assert "4digit" in table.columns
+    assert "locus" in table.columns
+    assert len(table) > 0
+
+    # Check features structure
+    assert "id" in features.columns
+    assert "feature" in features.columns
+    assert "number" in features.columns
+    assert len(features) > 0
+
+
+def test_get_reference_fasta():
+    """Test getting reference FASTA paths."""
+    dna_ref = get_reference_fasta("dna")
+    assert dna_ref.exists()
+    assert "dna" in dna_ref.name
+
+    rna_ref = get_reference_fasta("rna")
+    assert rna_ref.exists()
+    assert "rna" in rna_ref.name
+
+
+def test_get_reference_fasta_invalid():
+    """Test that invalid seq_type raises ValueError."""
+    with pytest.raises(ValueError):
+        get_reference_fasta("invalid")
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
new file mode 100644
index 0000000..dc7bf1a
--- /dev/null
+++ b/tests/test_pipeline.py
@@ -0,0 +1,175 @@
+"""Smoke tests for the pipeline and API."""
+
+import shutil
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from optitype import hlatyper as ht
+from optitype.io.data import load_reference_data
+from optitype.pipeline import PipelineConfig, run_pipeline, load_config, _is_frequent, _get_4digit, _get_types
+
+# Check for external dependencies
+HAS_RAZERS3 = shutil.which("razers3") is not None
+HAS_SOLVER = shutil.which("glpsol") is not None or shutil.which("cbc") is not None
+
+# Test data paths
+TEST_RNA_1 = Path(__file__).parent.parent / "test" / "rna" / "CRC_81_N_1_fished.fastq"
+TEST_RNA_2 = Path(__file__).parent.parent / "test" / "rna" / "CRC_81_N_2_fished.fastq"
+
+
+class TestPipelineConfig:
+    """Tests for PipelineConfig and load_config."""
+
+    def test_default_config(self):
+        config = PipelineConfig()
+        assert config.razers3_path == "razers3"
+        assert config.solver == "glpk"
+        assert config.mapping_threads == 4
+        assert config.ilp_threads == 1
+        assert config.delete_bam is True
+
+    def test_load_config_no_file(self):
+        config = load_config(None)
+        assert config.razers3_path == "razers3"
+        assert config.solver == "glpk"
+
+    def test_load_config_missing_file(self):
+        config = load_config("/nonexistent/config.ini")
+        assert config.razers3_path == "razers3"
+
+
+class TestHelperFunctions:
+    """Tests for pipeline helper functions."""
+
+    def test_is_frequent(self):
+        table, _ = load_reference_data()
+        # Collect matches explicitly so the test fails loudly instead of passing vacuously
+        matches = [i for i in table.index
+                   if table.loc[i]["4digit"] == "A*02:01" and table.loc[i]["flags"] == 0]
+        assert matches, "frequent allele A*02:01 (flags == 0) missing from reference table"
+        assert _is_frequent(matches[0], table)
+
+    def test_get_4digit(self):
+        table, _ = load_reference_data()
+        # Pick any allele and check it returns a string with '*'
+        idx = table.index[0]
+        result = _get_4digit(idx, table)
+        assert isinstance(result, str)
+        assert "*" in result
+
+    def test_get_types_with_string(self):
+        table, _ = load_reference_data()
+        idx = table.index[0]
+        result = _get_types(idx, table)
+        assert isinstance(result, str)
+
+    def test_get_types_with_non_string(self):
+        table, _ = load_reference_data()
+        assert _get_types(3.14, table) == 3.14
+        assert np.isnan(_get_types(float("nan"), table))
+
+
+class TestHlatyper:
+    """Tests for hlatyper matrix operations."""
+
+    def test_get_compact_model(self):
+        df = pd.DataFrame(
+            [[1, 0, 1], [1, 0, 1], [0, 1, 0], [0, 0, 0]],
+            columns=["A", "B", "C"],
+            index=["r1", "r2", "r3", "r4"],
+        )
+        unique_mtx, occ = ht.get_compact_model(df)
+        # r4 is all-zero, removed. r1 and r2 are duplicates.
+        assert unique_mtx.shape[0] == 2
+        assert sum(occ.values()) == 3  # 2 for r1/r2 + 1 for r3
+
+    def test_mtx_to_sparse_dict(self):
+        df = pd.DataFrame(
+            [[1, 0], [0, 1]],
+            columns=["A", "B"],
+            index=["r1", "r2"],
+        )
+        sparse = ht.mtx_to_sparse_dict(df)
+        assert ("r1", "A") in sparse
+        assert ("r2", "B") in sparse
+        assert ("r1", "B") not in sparse
+
+    def test_prune_identical_reads(self):
+        df = pd.DataFrame(
+            [[1, 0], [1, 0], [0, 1]],
+            columns=["A", "B"],
+            index=["r1", "r2", "r3"],
+        )
+        pruned = ht.prune_identical_reads(df)
+        assert pruned.shape[0] == 2
+
+    def test_create_paired_matrix(self):
+        b1 = pd.DataFrame(
+            [[1, 0], [1, 1]],
+            columns=["A", "B"],
+            index=["r1", "r2"],
+        )
+        b2 = pd.DataFrame(
+            [[1, 0], [0, 1]],
+            columns=["A", "B"],
+            index=["r1", "r3"],
+        )
+        paired, mispaired, unpaired = ht.create_paired_matrix(b1, b2)
+        assert "r1" in paired.index  # r1 is in both
+        assert "r2" in unpaired.index or "r3" in unpaired.index
+
+
+class TestPipelineValidation:
+    """Tests for pipeline input validation."""
+
+    def test_invalid_file_count(self, tmp_path):
+        f1 = tmp_path / "a.fq"
+        f2 = tmp_path / "b.fq"
+        f3 = tmp_path / "c.fq"
+        for f in (f1, f2, f3):
+            f.write_text("")
+        with pytest.raises(ValueError, match="1.*or 2"):
+            run_pipeline([str(f1), str(f2), str(f3)], "dna", str(tmp_path))
+
+    def test_invalid_seq_type(self, tmp_path):
+        f = tmp_path / "a.fq"
+        f.write_text("")
+        with pytest.raises(ValueError, match="dna.*rna"):
+            run_pipeline([str(f)], "invalid", str(tmp_path))
+
+    def test_invalid_beta(self, tmp_path):
+        f = tmp_path / "a.fq"
+        f.write_text("")
+        with pytest.raises(ValueError, match="beta"):
+            run_pipeline([str(f)], "dna", str(tmp_path), beta=0.5)
+
+    def test_invalid_enumerate(self, tmp_path):
+        f = tmp_path / "a.fq"
+        f.write_text("")
+        with pytest.raises(ValueError, match="enumerate"):
+            run_pipeline([str(f)], "dna", str(tmp_path), enumerate_count=0)
+
+
+@pytest.mark.skipif(
+    not (HAS_RAZERS3 and HAS_SOLVER and TEST_RNA_1.exists()),
+    reason="Requires razers3, ILP solver, and test data",
+)
+class TestPipelineIntegration:
+    """Integration tests that run the full pipeline (require external tools)."""
+
+    def test_rna_paired_end(self, tmp_path):
+        result = run_pipeline(
+            input_files=[str(TEST_RNA_1), str(TEST_RNA_2)],
+            seq_type="rna",
+            output_dir=str(tmp_path),
+            verbose=False,
+        )
+        assert result.result_df is not None
+        assert Path(result.output_csv).exists()
+        assert Path(result.output_plot).exists()
+        # Check result has expected columns
+        for col in ["A1", "A2", "B1", "B2", "C1", "C2"]:
+            assert col in result.result_4digit.columns