diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..fb3f7e1 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,83 @@ +name: CI + +on: + push: + branches: [master, main] + pull_request: + branches: [master, main] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff mypy + + - name: Lint with ruff + run: ruff check src/ + + - name: Type check with mypy + run: mypy src/optitype --ignore-missing-imports + + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y glpk-utils + + - name: Install package + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: Run tests + run: pytest tests/ -v + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build twine + + - name: Build package + run: python -m build + + - name: Check package + run: twine check dist/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..68375f2 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,70 @@ +name: Publish to PyPI + +on: + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: 
actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build package + run: python -m build + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish-testpypi: + needs: build + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/optitype + permissions: + id-token: write + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + + publish-pypi: + needs: publish-testpypi + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/optitype + permissions: + id-token: write + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.gitignore b/.gitignore index bc6e23a..88b8022 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,57 @@ -# Created by .gitignore support plugin (hsz.mobi) +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg -*.pyc -.gitignore \ No newline at end of file +# Virtual environments +venv/ +ENV/ +env/ +.venv/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.nox/ + +# IDEs +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Project specific +config.ini +test/**/baseline/ +test/**/new/ +test/expected_*.tsv +*.bam +*.sam + +# Claude Code +CLAUDE.md +.claude/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 68b17f2..0000000 --- a/.travis.yml +++ /dev/null @@ -1,37 
+0,0 @@ -dist: bionic -sudo: false - -language: python -python: -- '2.7' -- '3.6' -- '3.7' -- '3.8' -env: COMPILER_NAME=gcc CXX=g++ CC=gcc -addons: - apt: - packages: - - coinor-cbc - -before_script: -- git clone https://github.com/seqan/seqan.git seqan-src - && cd seqan-src - && cmake -DCMAKE_BUILD_TYPE=Release - && make razers3 - && cd .. -- pip install six -- pip install future -- pip install numpy -- pip install pyomo -- pip install --upgrade pip -- pip install pysam -- pip install matplotlib -- wget https://support.hdfgroup.org/ftp/HDF5/current18/bin/hdf5-1.8.21-Std-centos7-x86_64-shared_64.tar.gz -- tar xf hdf5-1.8.21-Std-centos7-x86_64-shared_64.tar.gz -- export LD_LIBRARY_PATH=${PWD}/hdf5-1.8.21-Std-centos7-x86_64-shared_64/lib:$LD_LIBRARY_PATH -- export HDF5_DIR=${PWD}/hdf5-1.8.21-Std-centos7-x86_64-shared_64/ -- pip install tables -- pip install cython -- pip install pandas - -script: python OptiTypePipeline.py -i ./test/rna/CRC_81_N_2_fished.fastq --rna -v -o ./test/rna/ -c ./test/test_config.ini diff --git a/OptiTypePipeline.py b/OptiTypePipeline.py deleted file mode 100755 index 8cfbc74..0000000 --- a/OptiTypePipeline.py +++ /dev/null @@ -1,437 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -""" -################################################################### - -OptiType: precision HLA typing from next-generation sequencing data - -################################################################### - -Authors: András Szolek, Benjamin Schubert, Christopher Mohr -Date: August 2017 -Version: 1.3.1 -License: OptiType is released under a three-clause BSD license - - -Introduction: -------------- -OptiType, is a novel HLA genotyping algorithm based on integer linear -programming, capable of producing accurate 4-digit HLA genotyping predictions -from NGS data by simultaneously selecting all minor and major HLA-I alleles. 
- - -Requirements: -------------- -OptiType uses the following software and libraries: -1) Python 2.7 -2) Biopython 1.63 -3) Pyomo 4.1 -4) Matplotlib 1.3.1 -5) Pandas 0.12 (with HDF5 support) -6) HDF5 1.8.11 -7) RazerS 3.1 -8) Cplex 12.5 - -Please make sure you have installed said software/libraries -and their dependencies. - - -Installation: -------------- -First install all required software and libraries and register the static path -in the configuration file for RazerS 3.1. CPLEX should be globally executable -via command line. Alternative ILP solver supported by Cooper are also usable. -Please do not change the folder structure or make sure you changed the necessary -entries in the config file. - - -Usage: -------------- -1) First filter the read files with the following settings: - ->razers3 --percent-identity 90 --max-hits 1 --distance-range 0 - --output-format sam --output sample_fished.sam - ./data/hla_reference.fasta sample.fastq - -where reference.fasta is either nuc_reference.fasta or gen_reference.fasta -depending on the type of NGS data. The references can be found in the ./data -sub-folder or in the supplementary material. To use the results as input -for OptiType the sam-files have to be converted into fastq format. On Unix- -based operating system you can convert from sam to fastq with the following -command: - ->cat sample_fished.sam | grep -v ^@ - | awk '{print "@"$1"\n"$10"\n+\n"$11}' > sample_fished.fastq - -For paired-end data pre-process each file individually. - -2) After pre-filtering, OptiType can be called as follows: - ->python OptiTypePipeline.py -i sample_fished_1.fastq [sample_fished_2.fastq] - (--rna | --dna) [--beta BETA] [--enumerate ENUMERATE] - --o ./out_dir/ - -This will produce a CSV with the optimal typing and possible sub-optimal -typings if specified, as well as a coverage plot of the genotype for -diagnostic purposes and a HTML file containing a summary of the results. 
- ->python OptiTypePipeline.py --help -usage: OptiType [-h] --input INPUT [INPUT ...] (--rna | --dna) [--beta BETA] - [--enumerate ENUMERATE] --outdir OUTDIR [--verbose] - -OptiType: 4-digit HLA typer - -optional arguments: - -h, --help show this help message and exit - --input INPUT [INPUT ...], -i INPUT [INPUT ...] - Fastq files with fished HLA reads. Max two files (for - paired-end) - --rna, -r Specifiying the mapped data as RNA. - --dna, -d Specifiying the mapped data as DNA. - --beta BETA, -b BETA The beta value for for homozygosity detection. - --enumerate ENUMERATE, -e ENUMERATE - The number of enumerations. - --outdir OUTDIR, -o OUTDIR - Specifies the out directory to which all files should - be written - --verbose, -v Set verbose mode on. -""" -from __future__ import print_function -from future import standard_library -standard_library.install_aliases() -from builtins import zip -from builtins import filter -from builtins import map -from builtins import range - -## eliminate dependency on an X11 server -import matplotlib -matplotlib.use('Agg') - -import sys -import subprocess -import os -import argparse -if sys.version_info > (3,0): - import configparser -else: - import ConfigParser as configparser -import time -import datetime -import pandas as pd -import hlatyper as ht - -import numpy as np -from model import OptiType -from collections import defaultdict - -# Try to import pysam, important for RazerS 3 output format -try: - import pysam - PYSAM_AVAILABLE = True -except ImportError: - PYSAM_AVAILABLE = False - - -freq_alleles = '''A*01:01 A*01:02 A*01:03 A*01:06 A*01:09 A*01:23 A*01:38 A*01:44 A*02:01 A*02:02 A*02:03 A*02:04 A*02:05 A*02:06 A*02:07 A*02:08 A*02:09 A*02:10 A*02:11 A*02:12 A*02:13 A*02:133 A*02:14 A*02:141 A*02:146 A*02:15N A*02:16 A*02:17 A*02:18 A*02:19 A*02:20 A*02:21 A*02:22 A*02:226N A*02:24 A*02:25 A*02:26 A*02:27 A*02:28 A*02:29 A*02:30 A*02:33 A*02:34 A*02:35 A*02:36 A*02:37 A*02:38 A*02:40 A*02:42 A*02:44 A*02:45 A*02:46 A*02:48 
A*02:49 A*02:51 A*02:53N A*02:54 A*02:55 A*02:57 A*02:58 A*02:60 A*02:64 A*02:67 A*02:74 A*02:85 A*02:90 A*02:93 A*03:01 A*03:02 A*03:05 A*03:07 A*03:08 A*03:10 A*03:12 A*03:22 A*03:25 A*03:65 A*03:69N A*03:97 A*11:01 A*11:02 A*11:03 A*11:04 A*11:05 A*11:06 A*11:08 A*11:10 A*11:12 A*11:13 A*11:18 A*11:19 A*11:20 A*11:29 A*11:40 A*23:01 A*23:02 A*23:03 A*23:04 A*23:05 A*23:09 A*24:02 A*24:03 A*24:04 A*24:05 A*24:06 A*24:07 A*24:08 A*24:09N A*24:10 A*24:13 A*24:14 A*24:15 A*24:17 A*24:18 A*24:20 A*24:21 A*24:22 A*24:23 A*24:24 A*24:25 A*24:26 A*24:27 A*24:28 A*24:29 A*24:31 A*24:35 A*24:46 A*24:51 A*24:56 A*24:63 A*24:93 A*25:01 A*25:02 A*25:04 A*26:01 A*26:02 A*26:03 A*26:04 A*26:05 A*26:06 A*26:07 A*26:08 A*26:09 A*26:10 A*26:11N A*26:12 A*26:14 A*26:15 A*26:16 A*26:17 A*26:18 A*26:20 A*26:49 A*29:01 A*29:02 A*29:03 A*29:04 A*29:10 A*29:12 A*30:01 A*30:02 A*30:03 A*30:04 A*30:06 A*30:08 A*30:09 A*30:10 A*30:11 A*30:12 A*30:16 A*31:01 A*31:02 A*31:03 A*31:04 A*31:05 A*31:06 A*31:08 A*31:09 A*31:12 A*32:01 A*32:02 A*32:03 A*32:04 A*32:05 A*32:06 A*32:08 A*32:13 A*32:20 A*32:22 A*33:01 A*33:03 A*33:04 A*33:05 A*33:10 A*33:26 A*34:01 A*34:02 A*34:03 A*34:05 A*36:01 A*36:03 A*43:01 A*66:01 A*66:02 A*66:03 A*68:01 A*68:02 A*68:03 A*68:04 A*68:05 A*68:06 A*68:07 A*68:08 A*68:12 A*68:13 A*68:15 A*68:16 A*68:17 A*68:18N A*68:23 A*68:24 A*68:38 A*69:01 A*74:01 A*74:02 A*74:03 A*74:04 A*74:06 A*74:09 A*74:11 A*80:01 A*80:02 B*07:02 B*07:03 B*07:04 B*07:05 B*07:06 B*07:07 B*07:08 B*07:09 B*07:10 B*07:12 B*07:13 B*07:14 B*07:15 B*07:17 B*07:20 B*07:22 B*07:26 B*07:33 B*07:36 B*07:47 B*07:53 B*07:85 B*08:01 B*08:02 B*08:03 B*08:04 B*08:05 B*08:09 B*08:12 B*08:18 B*08:23 B*13:01 B*13:02 B*13:03 B*13:04 B*13:07N B*13:09 B*13:11 B*13:13 B*14:01 B*14:02 B*14:03 B*14:04 B*14:05 B*14:06 B*15:01 B*15:02 B*15:03 B*15:04 B*15:05 B*15:06 B*15:07 B*15:08 B*15:09 B*15:10 B*15:108 B*15:11 B*15:12 B*15:123 B*15:125 B*15:13 B*15:135 B*15:15 B*15:153 B*15:16 B*15:17 B*15:18 B*15:20 B*15:21 
B*15:23 B*15:24 B*15:25 B*15:27 B*15:28 B*15:29 B*15:30 B*15:31 B*15:32 B*15:33 B*15:34 B*15:35 B*15:36 B*15:37 B*15:38 B*15:39 B*15:40 B*15:42 B*15:45 B*15:46 B*15:47 B*15:48 B*15:50 B*15:52 B*15:53 B*15:54 B*15:55 B*15:56 B*15:58 B*15:61 B*15:63 B*15:67 B*15:68 B*15:70 B*15:71 B*15:73 B*15:82 B*15:86 B*18:01 B*18:02 B*18:03 B*18:04 B*18:05 B*18:06 B*18:07 B*18:08 B*18:09 B*18:11 B*18:13 B*18:14 B*18:18 B*18:19 B*18:20 B*18:28 B*18:33 B*27:01 B*27:02 B*27:03 B*27:04 B*27:05 B*27:06 B*27:07 B*27:08 B*27:09 B*27:10 B*27:11 B*27:12 B*27:13 B*27:14 B*27:19 B*27:20 B*27:21 B*27:30 B*27:39 B*35:01 B*35:02 B*35:03 B*35:04 B*35:05 B*35:06 B*35:08 B*35:09 B*35:10 B*35:11 B*35:12 B*35:13 B*35:14 B*35:15 B*35:16 B*35:17 B*35:18 B*35:19 B*35:20 B*35:21 B*35:22 B*35:23 B*35:24 B*35:25 B*35:27 B*35:28 B*35:29 B*35:30 B*35:31 B*35:32 B*35:33 B*35:34 B*35:36 B*35:43 B*35:46 B*35:51 B*35:77 B*35:89 B*37:01 B*37:02 B*37:04 B*37:05 B*38:01 B*38:02 B*38:04 B*38:05 B*38:06 B*38:15 B*39:01 B*39:02 B*39:03 B*39:04 B*39:05 B*39:06 B*39:07 B*39:08 B*39:09 B*39:10 B*39:11 B*39:12 B*39:13 B*39:14 B*39:15 B*39:23 B*39:24 B*39:31 B*39:34 B*40:01 B*40:02 B*40:03 B*40:04 B*40:05 B*40:06 B*40:07 B*40:08 B*40:09 B*40:10 B*40:11 B*40:12 B*40:14 B*40:15 B*40:16 B*40:18 B*40:19 B*40:20 B*40:21 B*40:23 B*40:27 B*40:31 B*40:35 B*40:36 B*40:37 B*40:38 B*40:39 B*40:40 B*40:42 B*40:44 B*40:49 B*40:50 B*40:52 B*40:64 B*40:80 B*41:01 B*41:02 B*41:03 B*42:01 B*42:02 B*44:02 B*44:03 B*44:04 B*44:05 B*44:06 B*44:07 B*44:08 B*44:09 B*44:10 B*44:12 B*44:13 B*44:15 B*44:18 B*44:20 B*44:21 B*44:22 B*44:26 B*44:27 B*44:29 B*44:31 B*44:59 B*45:01 B*45:02 B*45:04 B*45:06 B*46:01 B*46:02 B*46:13 B*47:01 B*47:02 B*47:03 B*48:01 B*48:02 B*48:03 B*48:04 B*48:05 B*48:06 B*48:07 B*48:08 B*49:01 B*49:02 B*49:03 B*50:01 B*50:02 B*50:04 B*50:05 B*51:01 B*51:02 B*51:03 B*51:04 B*51:05 B*51:06 B*51:07 B*51:08 B*51:09 B*51:10 B*51:12 B*51:13 B*51:14 B*51:15 B*51:18 B*51:21 B*51:22 B*51:27N B*51:29 B*51:31 B*51:32 B*51:33 
B*51:34 B*51:36 B*51:37 B*51:63 B*51:65 B*52:01 B*52:02 B*52:06 B*53:01 B*53:02 B*53:03 B*53:04 B*53:05 B*53:07 B*53:08 B*54:01 B*54:02 B*55:01 B*55:02 B*55:03 B*55:04 B*55:07 B*55:08 B*55:10 B*55:12 B*55:16 B*55:46 B*56:01 B*56:02 B*56:03 B*56:04 B*56:05 B*56:06 B*56:07 B*56:09 B*56:11 B*57:01 B*57:02 B*57:03 B*57:04 B*57:05 B*57:06 B*57:10 B*58:01 B*58:02 B*58:06 B*59:01 B*67:01 B*67:02 B*73:01 B*78:01 B*78:02 B*78:03 B*78:05 B*81:01 B*81:02 B*82:01 B*82:02 B*83:01 C*01:02 C*01:03 C*01:04 C*01:05 C*01:06 C*01:08 C*01:14 C*01:17 C*01:30 C*01:32 C*02:02 C*02:03 C*02:04 C*02:06 C*02:08 C*02:09 C*02:10 C*02:19 C*02:20 C*02:27 C*03:02 C*03:03 C*03:04 C*03:05 C*03:06 C*03:07 C*03:08 C*03:09 C*03:10 C*03:13 C*03:14 C*03:15 C*03:16 C*03:17 C*03:19 C*03:21 C*03:32 C*03:42 C*03:43 C*03:56 C*03:67 C*03:81 C*04:01 C*04:03 C*04:04 C*04:05 C*04:06 C*04:07 C*04:10 C*04:11 C*04:14 C*04:15 C*04:24 C*04:29 C*04:33 C*04:37 C*05:01 C*05:03 C*05:04 C*05:07N C*05:09 C*06:02 C*06:03 C*06:04 C*06:06 C*06:08 C*06:09 C*06:17 C*06:24 C*06:53 C*07:01 C*07:02 C*07:03 C*07:04 C*07:05 C*07:06 C*07:07 C*07:08 C*07:09 C*07:10 C*07:109 C*07:123 C*07:13 C*07:14 C*07:17 C*07:18 C*07:19 C*07:21 C*07:22 C*07:27 C*07:29 C*07:32N C*07:37 C*07:43 C*07:46 C*07:49 C*07:56 C*07:66 C*07:67 C*07:68 C*07:80 C*07:95 C*08:01 C*08:02 C*08:03 C*08:04 C*08:05 C*08:06 C*08:13 C*08:15 C*08:20 C*08:21 C*08:27 C*12:02 C*12:03 C*12:04 C*12:05 C*12:07 C*12:12 C*12:15 C*12:16 C*14:02 C*14:03 C*14:04 C*15:02 C*15:03 C*15:04 C*15:05 C*15:06 C*15:07 C*15:08 C*15:09 C*15:11 C*15:12 C*15:13 C*15:17 C*16:01 C*16:02 C*16:04 C*16:08 C*16:09 C*17:01 C*17:02 C*17:03 C*17:04 C*18:01 C*18:02 A*30:07 B*15:64 B*18:12'''.split(' ') - - -def is_frequent(allele_id): - # prepare for HLA12345_HLA67890 and use the first part - allele_id = allele_id.split('_')[0] - return table.loc[allele_id]['4digit'] in freq_alleles and table.loc[allele_id]['flags'] == 0 or (table.loc[allele_id]['locus'] in 'HGJ') - - -def get_4digit(allele_id): - 
allele_id = allele_id.split('_')[0] # for reconstructed IDs like HLA12345_HLA67890 return HLA12345's 4-digit type - return table.loc[allele_id]['4digit'] - - -def get_types(allele_id): - if not isinstance(allele_id, str): - return allele_id - else: - aa = allele_id.split('_') - if len(aa) == 1: - return table.loc[aa[0]]['4digit'] - else: - return table.loc[aa[0]]['4digit'] #+ '/' + table.loc[aa[1]]['4digit'] - -def get_num_threads(configured_threads): - try: - import multiprocessing - except (ImportError, NotImplementedError): - return 2 - if(multiprocessing.cpu_count() < configured_threads): - return multiprocessing.cpu_count() - return configured_threads - - -if __name__ == '__main__': - - this_dir = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - config_default = os.path.join(this_dir, 'config.ini') - - parser = argparse.ArgumentParser(description=' OptiType: 4-digit HLA typer', prog='OptiType') - parser.add_argument('--input','-i', - nargs='+', - required=True, - metavar='FQ', - help=(".fastq file(s) (fished or raw) or .bam files stored for re-use, generated by " - "an earlier OptiType run. One file: single-end mode, two files: paired-end mode.") - ) - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--rna','-r', - action="store_true", - help="Use with RNA sequencing data." - ) - group.add_argument('--dna','-d', - action="store_true", - help="Use with DNA sequencing data." - ) - parser.add_argument('--beta','-b', - type=float, - metavar='B', - default=0.009, - help="The beta value for for homozygosity detection (see paper). Default: 0.009. Handle with care." - ) - parser.add_argument('--enumerate','-e', - type=int, - default=1, - metavar='N', - help=("Number of enumerations. OptiType will output the optimal solution and " - "the top N-1 suboptimal solutions in the results CSV. 
Default: 1") - ) - parser.add_argument('--outdir','-o', - required=True, - help="Specifies the out directory to which all files should be written." - ) - parser.add_argument('--prefix', '-p', - default=None, dest="prefix", type=str, - help="Specifies prefix of output files" - ) - parser.add_argument('--verbose','-v', - required=False, - action="store_true", - help="Set verbose mode on." - ) - parser.add_argument('--config', '-c', - type=argparse.FileType('r'), - default=config_default, - help="Path to config file. Default: config.ini in the same directory as this script" - ) - - args = parser.parse_args() - - if not os.path.isfile(args.config.name): - print ("Config file not found. Place config.ini either alongside this script or use the -c option. " - "See config.ini.example and note that its fields have changed recently.") - sys.exit(-1) - - config = configparser.ConfigParser(os.environ) - config.read(args.config.name) - - unpaired_weight = config.getfloat('behavior', 'unpaired_weight') - use_discordant = config.getboolean('behavior', 'use_discordant') - - if not os.path.exists(args.outdir): - os.makedirs(args.outdir) - - # test if inputs are legit: - if args.beta < 0.0 or args.beta >= 0.1: - print("Beta value is not correctly chosen. Please choose another beta value between [0,0.1]") - sys.exit(-1) - - if args.enumerate <= 0: - print("The specified number of enumerations must be bigger than %i"%args.enumeration) - sys.exit(-1) - - if len(args.input) not in (1, 2): - print("Number of input files can only be 1 (single-end) or 2 (paired-end)") - sys.exit(-1) - - input_extension = args.input[0].split('.')[-1] - assert all(ii.endswith('.' 
+ input_extension) for ii in args.input), 'Mixed input file extensions' - - bam_input = (input_extension in ('sam', 'bam', 'SAM', 'BAM')) # otherwise treated as fastq - - # Constants - VERBOSE = ht.VERBOSE = bool(args.verbose) # set verbosity setting in hlatyper too - COMMAND = "-i 97 -m 99999 --distance-range 0 -pa -tc %d -o %s %s %s" - ALLELE_HDF = os.path.join(this_dir, 'data/alleles.h5') - MAPPING_REF = {'gen': os.path.join(this_dir, 'data/hla_reference_dna.fasta'), - 'nuc': os.path.join(this_dir, 'data/hla_reference_rna.fasta')} - MAPPING_CMD = config.get("mapping", "razers3") + " " + COMMAND - date = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S') - if args.prefix == None: - prefix = date - out_dir = os.path.join(args.outdir, date) - else: - prefix = args.prefix - out_dir = args.outdir - - if not os.path.exists(out_dir): - os.makedirs(out_dir) - - if PYSAM_AVAILABLE: - extension = 'bam' - else: - extension = 'sam' - - bam_paths = args.input if bam_input else [os.path.join(out_dir, ("%s_%i.%s" % (prefix, i+1, extension))) for i in range(len(args.input))] - - # SETUP variables and OUTPUT samples - ref_type = "nuc" if args.rna else "gen" - is_paired = len(args.input) > 1 - - out_csv = os.path.join(out_dir, ("%s_result.tsv" % prefix)) - out_plot = os.path.join(out_dir, ("%s_coverage_plot.pdf" % prefix)) - - # mapping fished file to reference - if not bam_input: - threads = get_num_threads(config.getint("mapping", "threads")) - if VERBOSE: - print("\nmapping with %s threads..." % threads) - for (i, sample), outbam in zip(enumerate(args.input), bam_paths): - if VERBOSE: - print("\n", ht.now(), "Mapping %s to %s reference..." 
% (os.path.basename(sample), ref_type.upper())) - - subprocess.call(MAPPING_CMD % (threads, outbam, - MAPPING_REF[ref_type], sample), shell=True) - - # sam-to-hdf5 - table, features = ht.load_hdf(ALLELE_HDF, False, 'table', 'features') - if VERBOSE: - print("\n", ht.now(), "Generating binary hit matrix.") - - if is_paired: - # combine matrices for paired-end mapping - pos, read_details = ht.pysam_to_hdf(bam_paths[0]) - binary1 = np.sign(pos) # dtype=np.uint16 - - pos2, read_details2 = ht.pysam_to_hdf(bam_paths[1]) - binary2 = np.sign(pos2) # dtype=np.uint16 - - if not bam_input and config.getboolean('behavior', 'deletebam'): - os.remove(bam_paths[0]) - os.remove(bam_paths[1]) - - id1 = set(binary1.index) - id2 = set(binary2.index) - - ''' - test if we actually can do paired-end mapping - 1) look at the last character 2-1 character if they are always the same if so -> proon them away and do - paired-end - 2) if not test if the intersection of ID-binary1 and ID-binary2 has at least 10% of the former read - number -> do paired-end - 3) if nothing worked ( perhaps pair-end ID was in the middle or something) raise flag and do single-end - mapping on first input - ''' - if len(set([r[-1] for r in id1])) == 1 and len(set([r[-1] for r in id2])) == 1: - # if this case is true you have to edit also all pos,etc,desc indices such that the plotting works correctly - # again .. 
maybe it is also neccessary to test for the last two characters - cut_last_char = lambda x: x[:-1] - binary1.index = list(map(cut_last_char, binary1.index)) - binary2.index = list(map(cut_last_char, binary2.index)) - pos.index = list(map(cut_last_char, pos.index)) - pos2.index = list(map(cut_last_char, pos2.index)) - read_details.index = list(map(cut_last_char, read_details.index)) - read_details2.index = list(map(cut_last_char, read_details2.index)) - - binary_p, binary_mis, binary_un = ht.create_paired_matrix(binary1, binary2) - - if binary_p.shape[0] < len(id1) * 0.1: - print(("\nWARNING: Less than 10%% of reads could be paired. Consider an appropriate unpaired_weight setting " - "in your config file (currently %.3f), because you may need to resort to using unpaired reads.") % unpaired_weight) - - if unpaired_weight > 0: - if use_discordant: - binary = pd.concat([binary_p, binary_un, binary_mis]) - else: - binary = pd.concat([binary_p, binary_un]) - else: - binary = binary_p - - else: - pos, read_details = ht.pysam_to_hdf(bam_paths[0]) - - if not bam_input and config.getboolean('behavior', 'deletebam'): - os.remove(bam_paths[0]) - - binary = np.sign(pos) # dtype=np.uint16 - - # dimensionality reduction and typing - - alleles_to_keep = list(filter(is_frequent, binary.columns)) - binary = binary[alleles_to_keep] - - if VERBOSE: - print("\n", ht.now(), 'temporary pruning of identical rows and columns') - unique_col, representing = ht.prune_identical_alleles(binary, report_groups=True) - representing_df = pd.DataFrame([[a1, a2] for a1, a_l in representing.items() for a2 in a_l], - columns=['representative', 'represented']) - - temp_pruned = ht.prune_identical_reads(unique_col) - - if VERBOSE: - print("\n", ht.now(), 'Size of mtx with unique rows and columns:', temp_pruned.shape) - print(ht.now(), 'determining minimal set of non-overshadowed alleles') - - minimal_alleles = ht.prune_overshadowed_alleles(temp_pruned) - - if VERBOSE: - print("\n", ht.now(), 'Keeping 
only the minimal number of required alleles', minimal_alleles.shape) - - binary = binary[minimal_alleles] - - if VERBOSE: - print("\n", ht.now(), 'Creating compact model...') - - if is_paired and unpaired_weight > 0: - if use_discordant: - compact_mtx, compact_occ = ht.get_compact_model(binary_p[minimal_alleles], - pd.concat([binary_un, binary_mis])[minimal_alleles], weight=unpaired_weight) - else: - compact_mtx, compact_occ = ht.get_compact_model(binary_p[minimal_alleles], - binary_un[minimal_alleles], weight=unpaired_weight) - else: - compact_mtx, compact_occ = ht.get_compact_model(binary) - - allele_ids = binary.columns - - groups_4digit = defaultdict(list) - for allele in allele_ids: - type_4digit = get_4digit(allele) - groups_4digit[type_4digit].append(allele) - - sparse_dict = ht.mtx_to_sparse_dict(compact_mtx) - threads = get_num_threads(config.getint("ilp", "threads")) - if VERBOSE: - print("\nstarting ilp solver with %s threads..." % threads) - print("\n", ht.now(), 'Initializing OptiType model...') - - op = OptiType(sparse_dict, compact_occ, groups_4digit, table, args.beta, 2, - config.get("ilp", "solver"), threads, verbosity=VERBOSE) - result = op.solve(args.enumerate) - - if VERBOSE: - print("\n", ht.now(), 'Result dataframe has been constructed...') - - result_4digit = result.applymap(get_types) - for iii in ["A1", "A2", "B1", "B2", "C1", "C2"]: - if not iii in result_4digit: - result_4digit[iii] = None - r = result_4digit[["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"]] - - # write CSV to out. 
And generate plots - r.to_csv(out_csv, sep="\t", - columns=["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"], - header=["A1", "A2", "B1", "B2", "C1", "C2", "Reads", "Objective"]) - - hlatype = result.iloc[0].reindex(["A1", "A2", "B1", "B2", "C1", "C2"]).drop_duplicates().dropna() - features_used = [('intron', 1), ('exon', 2), ('intron', 2), ('exon', 3), ('intron', 3)] \ - if not args.rna else [('exon',2),('exon',3)] - plot_variables = [pos, read_details, pos2, read_details2, (binary_p, binary_un, binary_mis)] if is_paired else [pos, read_details] - coverage_mat = ht.calculate_coverage(plot_variables, features, hlatype, features_used) - ht.plot_coverage(out_plot, coverage_mat, table, features, features_used) diff --git a/README.md b/README.md index 7bcd94b..29db5d3 100644 --- a/README.md +++ b/README.md @@ -1,213 +1,220 @@ -[![Build Status](https://travis-ci.org/FRED-2/OptiType.svg?branch=master)](https://travis-ci.org/FRED-2/OptiType) -OptiType -======== +[![CI](https://github.com/FRED-2/OptiType/actions/workflows/ci.yml/badge.svg)](https://github.com/FRED-2/OptiType/actions/workflows/ci.yml) +[![PyPI version](https://badge.fury.io/py/optitype.svg)](https://badge.fury.io/py/optitype) + +# OptiType Precision HLA typing from next-generation sequencing data -Authors: András Szolek, Benjamin Schubert, Christopher Mohr -Date: April 2014 -Version: 1.3.3 -License: OptiType is released under a three-clause BSD license +**Authors:** András Szolek, Benjamin Schubert, Christopher Mohr, Jonas Scheid +**Version:** 1.5.0 +**License:** BSD-3-Clause +## Introduction -Introduction -------------- OptiType is a novel HLA genotyping algorithm based on integer linear programming, capable of producing accurate 4-digit HLA genotyping predictions from NGS data by simultaneously selecting all major and minor HLA Class I alleles. +## Installation -Requirements -------------- -OptiType uses the following software and libraries: - -1. [Python 2.7](https://www.python.org/) -2. 
[RazerS 3.4](http://www.seqan.de/projects/razers/) -3. [SAMtools 1.2](http://www.htslib.org/) -4. [HDF5 1.8.15](https://www.hdfgroup.org/HDF5/) -5. [CPLEX 12.5](http://www-01.ibm.com/software/commerce/optimization/cplex-optimizer/) - or other Pyomo-supported ILP solver ([GLPK](https://www.gnu.org/software/glpk/), - [CBC](https://projects.coin-or.org/Cbc), ...) +### Via pip (Recommended) -And the following Python modules: +```bash +pip install optitype +``` -1. NumPy 1.9.3 -2. Pyomo 4.2 -3. PyTables 3.2.2 -4. Pandas 0.16.2 -5. Pysam 0.8.3 -6. Matplotlib 1.4.3 -7. Future 0.15.2 +### From source -Note: CPLEX has a proprietary license but is free for academic use. See IBM's -[academic initiative.](http://www-304.ibm.com/ibm/university/academic/pub/page/academic_initiative) +```bash +git clone https://github.com/FRED-2/OptiType.git +cd OptiType +pip install -e . +``` -Installation via Docker ------------------------ +### External Dependencies -1. Install Docker on your computer and make sure it works. +OptiType requires external tools that cannot be installed via pip: -2. Call `docker pull fred2/optitype` which will download the Docker image. +1. **RazerS3** - Read mapper + ```bash + # Via conda/bioconda + conda install -c bioconda razers3 -3. You can use the image as followes: + # Or build from source: https://github.com/seqan/seqan + ``` -`docker run -v /path/to/data/folder:/data/ -t fred2/optitype -i input1 [input2] (-r|-d) -o /data/` +2. **ILP Solver** - At least one of: + - **GLPK** (open source) + ```bash + apt install glpk-utils # Debian/Ubuntu + conda install -c conda-forge glpk # Conda + ``` + - **CBC** (open source) + ```bash + apt install coinor-cbc # Debian/Ubuntu + conda install -c conda-forge coincbc # Conda + ``` + - **CPLEX** (commercial, free for academia) -OptiType uses the CBC-Solver and RazerS3 internally with one thread if no other configuration file is provided. RazerS3's binary can be found at `/usr/local/bin` within the Docker image. 
+### Verify Installation -Installation from Source ------------------------- -1. Install all required software and libraries from the first list. +```bash +optitype check-deps +``` -2. Include SAMtools and your ILP solver in your `PATH` environment variable. -They both have to be globally accessible every time you run OptiType, so make -them permanent (put in in your `.bashrc` or similar shell startup script). +## Quick Start -3. Add HDF5's `lib` directory to your `LD_LIBRARY_PATH`. Make sure it's -permanent too. +### Command Line -4. Create and activate a Python virtual environment with the package -[virtualenv](https://virtualenv.pypa.io/en/latest/). This will automatically -install the package manager `pip` which you will need for the next steps. -Always run OptiType from this virtual environment. +```bash +# DNA sequencing (paired-end) +optitype run -i reads_1.fastq -i reads_2.fastq --dna -o results/ -5. Install NumPy, Pyomo, Pysam and Matplotlib with -the following commands: +# RNA sequencing (single-end) +optitype run -i sample.fastq --rna -o results/ - ``` - pip install numpy - pip install pyomo - pip install pysam - pip install matplotlib - ``` +# Re-analyze from BAM +optitype run -i mapped.bam --dna -o results/ +``` -6. Create a new environment variable containing the path to your HDF5 -installation. It doesn't have to be permanent, but it has to be accessible -when you install PyTables. On the bash shell it would be -`export HDF5_DIR=/path/to/hdf5-1.8.15` +### Python API -7. Install PyTables, Pandas and Future with +```python +from optitype import run_hla_typing, HLATypingConfig - ``` - pip install tables - pip install pandas - pip install future - ``` +result = run_hla_typing( + fastq_files=["sample_1.fastq", "sample_2.fastq"], + seq_type="dna", + config=HLATypingConfig(solver="cbc", threads=4) +) -8. Create a configuration file following `config.ini.example`. You will find -all necessary instructions in there. 
OptiType will look for the configuration -file at `config.ini` in the same directory by default, but you can put it -anywhere and pass it with the `-c` option when running OptiType. +print(result.best_result) +# {'A1': 'A*02:01', 'A2': 'A*03:01', 'B1': 'B*07:02', ...} +``` +## Usage -Usage -------------- +``` +optitype run --help + +Usage: optitype run [OPTIONS] + + Run HLA typing analysis. + +Options: + -i, --input PATH Input FASTQ or BAM files (use multiple times for paired-end) + -r, --rna Input data is RNA sequencing + -d, --dna Input data is DNA sequencing (default) + -o, --outdir PATH Output directory for results (required) + -p, --prefix TEXT Output filename prefix (default: timestamp) + -b, --beta FLOAT Homozygosity detection parameter (0.0-0.1) + -e, --enumerate INT Number of solutions to enumerate + --solver [glpk|cbc|cplex] ILP solver to use + --razers3 PATH Path to RazerS3 binary + --threads INT Number of threads for mapping + -v, --verbose Enable verbose output + -c, --config PATH Path to config.ini file + --help Show this message and exit +``` -Optional step zero: you might want to filter your sequencing data for -HLA reads. Should you have to re-run OptiType multiple times on the same sample -(different settings, etc.) it could save you time if instead of giving OptiType -the full, multi-gigabyte sequencing data multiple times, you would rather give -it the relevant reads only, on the order of megabytes. +### Additional Commands -You can use any read mapper to do this step, although we suggest you use RazerS3. -Its only drawback is that due to way RazerS3 was designed, it loads all reads -into memory, which could be a problem on older, low-memory computing nodes. +```bash +# Check dependencies +optitype check-deps -Make sure to filter your files correctly depending on whether you have DNA -(exome, WGS) or RNA-Seq data. The reference fasta files are -`data/hla_reference_dna.fasta` and `data/hla_reference_rna.fasta` respectively. 
-Below is an example for DNA sequencing data: +# Generate config file +optitype init-config +# Show installation info +optitype info ``` ->razers3 -i 95 -m 1 -dr 0 -o fished_1.bam /path/to/OptiType/data/hla_reference_dna.fasta sample_1.fastq ->samtools bam2fq fished_1.bam > sample_1_fished.fastq +## Configuration ->rm fished_1.bam +Generate a config file with: + +```bash +optitype init-config -o config.ini ``` -If you have paired-end data, repeat this with the second ends' fastq file as well. -Note: it's important that you filter the two ends individually. Don't use the -read mapper's paired-end capabilities. +Key settings: -After the optional filtering, OptiType can be called as follows: -``` ->python /path/to/OptiTypePipeline.py -i sample_fished_1.fastq [sample_fished_2.fastq] - (--rna | --dna) [--beta BETA] [--enumerate N] - [-c CONFIG] [--verbose] --outdir /path/to/out_dir/ +```ini +[mapping] +threads=4 # Threads for read mapping + +[ilp] +solver=glpk # ILP solver: glpk, cbc, or cplex +threads=1 # Threads for ILP solver + +[behavior] +deletebam=true # Delete intermediate BAM files +unpaired_weight=0 # Weight for unpaired reads (0-1) ``` -This will produce a time-stamped directory inside the specified output directory -containing a CSV file with the predicted optimal (and if enumerated, sub-optimal) -HLA genotype, and a pdf file containing a coverage plot of the predicted alleles -for diagnostic purposes. 
+## Docker +```bash +docker pull fred2/optitype +docker run -v /path/to/data:/data -t fred2/optitype \ + -i /data/reads_1.fastq -i /data/reads_2.fastq --dna -o /data/results/ ``` ->python OptiTypePipeline.py --help - -usage: OptiType [-h] --input FQ [FQ] (--rna | --dna) [--beta B] - [--enumerate N] --outdir OUTDIR [--verbose] [--config CONFIG] - -OptiType: 4-digit HLA typer - -optional arguments: - -h, --help show this help message and exit - --input FQ [FQ], -i FQ [FQ] - .fastq file(s) (fished or raw) or .bam files stored - for re-use, generated by an earlier OptiType run. One - file: single-end mode, two files: paired-end mode. - --rna, -r Use with RNA sequencing data. - --dna, -d Use with DNA sequencing data. - --beta B, -b B The beta value for for homozygosity detection (see - paper). Default: 0.009. Handle with care. - --enumerate N, -e N Number of enumerations. OptiType will output the - optimal solution and the top N-1 suboptimal solutions - in the results CSV. Default: 1 - --outdir OUTDIR, -o OUTDIR - Specifies the out directory to which all files should - be written. - --prefix PREFIX, -p PREFIX - Specifies prefix of output files - --verbose, -v Set verbose mode on. - --config CONFIG, -c CONFIG - Path to config file. Default: config.ini in the same - directory as this script + +## Test Examples + +```bash +# DNA (paired-end) +optitype run \ + -i ./test/exome/NA11995_SRR766010_1_fished.fastq \ + -i ./test/exome/NA11995_SRR766010_2_fished.fastq \ + --dna -v -o ./test/exome/ + +# RNA (paired-end) +optitype run \ + -i ./test/rna/CRC_81_N_1_fished.fastq \ + -i ./test/rna/CRC_81_N_2_fished.fastq \ + --rna -v -o ./test/rna/ ``` -Furthermore, depending on your settings in `config.ini` you can choose to keep -the bam files OptiType produces when all-mapping reads against the reference: -these will be stored in the output directory of your current run. 
+## Output -Then, if you want to re-run OptiType on the same sample, you can give it those -intermediate `.bam` files as input instead of `.fastq` files, and spare on the -mapping part of the pipeline. Note: these `.bam` files have nothing to do with -those produced during the filtering/fishing step. +OptiType produces: +- `*_result.tsv` - HLA typing results +- `*_coverage_plot.pdf` - Coverage visualization +Example output: -Test examples -------------- -DNA data (paired end): ``` -python OptiTypePipeline.py -i ./test/exome/NA11995_SRR766010_1_fished.fastq ./test/exome/NA11995_SRR766010_2_fished.fastq --dna -v -o ./test/exome/ + A1 A2 B1 B2 C1 C2 Reads Objective +0 A*02:01 A*03:01 B*07:02 B*44:02 C*07:02 C*05:01 1234 1156.5 ``` -RNA data (paired end): -``` -python OptiTypePipeline.py -i ./test/rna/CRC_81_N_1_fished.fastq ./test/rna/CRC_81_N_2_fished.fastq --rna -v -o ./test/rna/ -``` +## Migration from v1.x -Contact -------------- -András Szolek -szolek@informatik.uni-tuebingen.de -University of Tübingen, Applied Bioinformatics, -Center for Bioinformatics, Quantitative Biology Center, -and Dept. of Computer Science, -Sand 14, 72076 Tübingen, Germany +Version 1.5 introduces a modernized CLI. Main changes: +- Install with `pip install optitype` +- Use `optitype run` instead of `python OptiTypePipeline.py` +- Multiple input files: use `-i file1 -i file2` instead of `-i file1 file2` +- Data bundled with package (no need to set paths) + +The core algorithm and output format remain unchanged. + +## Requirements + +- Python 3.10+ +- External: RazerS3, ILP solver (GLPK/CBC/CPLEX) + +## Reference -Reference -------------- Szolek, A, Schubert, B, Mohr, C, Sturm, M, Feldhahn, M, and Kohlbacher, O (2014). -OptiType: precision HLA typing from next-generation sequencing data -Bioinformatics, 30(23):3310-6. +**OptiType: precision HLA typing from next-generation sequencing data** +*Bioinformatics*, 30(23):3310-6. 
+[doi:10.1093/bioinformatics/btu548](https://doi.org/10.1093/bioinformatics/btu548) + +## Contact + +András Szolek +szolek@informatik.uni-tuebingen.de +University of Tübingen, Applied Bioinformatics diff --git a/data/alleles.h5 b/data/alleles.h5 deleted file mode 100644 index 0710a20..0000000 Binary files a/data/alleles.h5 and /dev/null differ diff --git a/data/alleles/features.parquet b/data/alleles/features.parquet new file mode 100644 index 0000000..92f9503 Binary files /dev/null and b/data/alleles/features.parquet differ diff --git a/data/alleles/table.parquet b/data/alleles/table.parquet new file mode 100644 index 0000000..cfb2f62 Binary files /dev/null and b/data/alleles/table.parquet differ diff --git a/hlatyper.py b/hlatyper.py deleted file mode 100644 index 0ff466a..0000000 --- a/hlatyper.py +++ /dev/null @@ -1,778 +0,0 @@ -from __future__ import division -from __future__ import print_function -from builtins import zip -from builtins import str -from builtins import map -from builtins import range -from past.utils import old_div -import pandas as pd -import numpy as np -import re -import pylab -import warnings -from collections import OrderedDict -from datetime import datetime -import sys - -try: - import pysam - PYSAM_AVAILABLE = True -except ImportError: - PYSAM_AVAILABLE = False - -VERBOSE = False - -def now(start=datetime.now()): - # function argument NOT to be ever set! It gets initialized at function definition time - # so we can calculate difference without further hassle. - return str(datetime.now()-start)[:-4] - - -def memoize(f): # from http://code.activestate.com/recipes/578231-probably-the-fastest-memoization-decorator-in-the-/ - class MemoDict(dict): - def __missing__(self, key): - ret = self[key] = f(key) - return ret - return MemoDict().__getitem__ - -# if we used MDINSHP=X chars this could be a full cigar parser, however, we only need M and D to -# determine a read's span on the reference sequence to create a coverage plot. 
-CIGAR_SLICER = re.compile(r'[0-9]+[MD]') - -@memoize -def length_on_reference(cigar_string): - return sum([int(p[:-1]) for p in re.findall(CIGAR_SLICER, cigar_string)]) - - -def feature_order(feature): - # to use it in sorted(..., key=feature_order) this returns: - # 0 for ('UTR', 1) - # 2 for exon, 1 - # 3 for intron, 1 - # 4 for exon, 2 - # 5 for intron, 2 - # ... - # 999 for UTR, 2 - feat_type, feat_number = feature - assert feat_type in ('intron', 'exon', 'UTR'), 'Unknown feature in list (function accepts intron/exon/UTR)' - assert isinstance(feat_number, int), 'Feature number has to be integer' - return 0 if feature==('UTR', 1) else 999 if feature==('UTR', 2) else (feat_number*2 + (feat_type=='intron')) - - -def store_dataframes(out_hdf, **kwargs): - # DataFrames to serialize have to be passed by keyword arguments. An argument matrix1=DataFrame(...) - # will be written into table 'matrix1' in the HDF file. - - complevel = kwargs.pop('complevel', 9) # default complevel & complib values if - complib = kwargs.pop('complib', 'zlib') # not explicitly asked for as arguments - - if VERBOSE: - print(now(), 'Storing %d DataFrames in file %s with compression settings %d %s...' % (len(kwargs), out_hdf, complevel, complib)) - - store = pd.HDFStore(out_hdf, complevel=complevel, complib=complib) # TODO: WRITE ONLY? it probably appends now - for table_name, dataframe in kwargs.items(): - store[table_name] = dataframe - store.close() - - if VERBOSE: - print(now(), 'DataFrames stored in file.') - - -def load_hdf(in_hdf, as_dict=False, *args): # isn't really neccesary, but makes a read-only flag on it to be sure - - store = pd.HDFStore(in_hdf, 'r') - if len(args): - if as_dict: - to_return = {table: store[table] for table in args} - else: - to_return = tuple((store[table] for table in args)) - store.close() - return to_return - else: - return store # store doesn't get closed! 
Either the user closes it manually or gets closed on exit - - -def sam_to_hdf(samfile): - if VERBOSE: - print(now(), 'Loading alleles and read IDs from %s...' % samfile) - - # run through the SAM file once to see how many reads and alleles we are dealing with - # for a one-step DataFrame initialization instead of slow growing - - read_ids, allele_ids = [], [] - first_hit_row = True - total_hits = 0 - - with open(samfile, 'rb') as f: - last_read_id = None - for line in f: - line = line.decode('utf-8') - if line.startswith('@'): - if line.startswith('@SQ'): - allele_ids.append(line.split('\t')[1][3:]) # SN:HLA:HLA00001 - continue - - total_hits += 1 - read_id = line.split('\t')[0] - if last_read_id != read_id: - read_ids.append(read_id) - last_read_id = read_id - - if first_hit_row: # analyze SAM file structure and find MD tag column (describes mismatches). - first_hit_row = False - columns = line.split() - try: - nm_index = list(map(lambda x: x.startswith('NM:'), columns)).index(True) - except ValueError: - # TODO: we don't really handle the case if NM-tag is not present, code will fail later - print('\tNo NM-tag found in SAM file!') - nm_index = None - - if VERBOSE: - print(now(), '%d alleles and %d reads found.' % (len(allele_ids), len(read_ids))) - print(now(), 'Initializing mapping matrix...') - - # major performance increase if we initialize a numpy zero-matrix and pass that in the constructor - # than if we just let pandas initialize its default NaN matrix - matrix_pos = pd.DataFrame(np.zeros((len(read_ids), len(allele_ids)), dtype=np.uint16), columns=allele_ids, index=read_ids) - - # read_details contains NM and read length tuples, calculated from the first encountered CIGAR string. - read_details = OrderedDict() - - if VERBOSE: - print(now(), '%dx%d mapping matrix initialized. Populating %d hits from SAM file...' 
% (len(read_ids), len(allele_ids), total_hits)) - - milestones = [x * total_hits / 10 for x in range(1, 11)] # for progress bar - - with open(samfile, 'rb') as f: - counter = 0 - percent = 0 - for line in f: - line = line.decode('utf-8') - if line.startswith('@'): - continue - - fields = line.strip().split('\t') - read_id, allele_id, position, cigar, nm = (fields[i] for i in (0, 2, 3, 5, nm_index)) - - if read_id not in read_details: - read_details[read_id] = (int(nm[5:]), length_on_reference(cigar)) - - matrix_pos[allele_id][read_id] = int(position) # SAM indexes from 1, so null elements are not hits - - counter += 1 - if counter in milestones: - percent += 10 - if VERBOSE: - print('\t%d%% completed' % percent) - if VERBOSE: - print(now(), '%d elements filled. Matrix sparsity: 1 in %.2f' % (counter, matrix_pos.shape[0]*matrix_pos.shape[1]/float(counter))) - - # convert HLA:HLA00001 identifiers to HLA00001 - matrix_pos.rename(columns=lambda x: x.replace('HLA:', ''), inplace=True) - - details_df = pd.DataFrame.from_dict(read_details, orient='index') - details_df.columns = ['mismatches', 'read_length'] - - return matrix_pos, details_df - - -def pysam_to_hdf(samfile): - if not PYSAM_AVAILABLE: - print("Warning: PySam not available on the system. Falling back to primitive SAM parsing.") - return sam_to_hdf(samfile) - - sam_or_bam = 'rb' if samfile.endswith('.bam') else 'r' - sam = pysam.AlignmentFile(samfile, sam_or_bam) - is_yara = (sam.header['PG'][0]['ID'] in ('Yara', 'yara')) - # If yara produced the sam/bam file, we need to know in what form to expect secondary alignments. - # If the -os flag was used in the call, they are proper secondary alignments. Otherwise, they are - # one line per read with a long XA custom tag containing alternative hits. 
- xa_tag = is_yara and (' -os ' not in sam.header['PG'][0]['CL']) - - nref = sam.nreferences - hits = OrderedDict() - - allele_id_to_index = {aa: ii for ii, aa in enumerate(sam.references)} - - # read_details contains NM and read length tuples. We ignore length on reference from now on. - # Not worth the effort and calculation. Indels are rare and they have to be dealt with differently. The coverage - # plot wil still be fine and +-1 bp regarding read end isn't concerning. - read_details = OrderedDict() - - if VERBOSE: - print(now(), 'Loading %s started. Number of HLA reads loaded (updated every thousand):' % samfile) - - read_counter = 0 - hit_counter = 0 - - for aln in sam: - # TODO: we could spare on accessing hits[aln.qname] if we were guaranteed alignments of one read come in batches - # and never mix. Most aligners behave like this but who knows... - if aln.qname not in hits: # for primary alignments (first hit of read) - # not using defaultdict because we have other one-time things to do when new reads come in anyway - hits[aln.qname] = np.zeros(nref, dtype=np.uint16) # 16 bits are enough for 65K positions, enough for HLA. - # TODO: as soon as we don't best-map but let in suboptimal alignments, this below is not good enough, - # as we need to store NM for every read-allele pair. - # and then there are cases (usually 1D/1I artefacts at the end) where aligned reference length isn't the same - # for all alignments of a read. How to handle that? Maybe needless if we re-map reads anyway. - read_details[aln.qname] = (aln.get_tag('NM'), aln.query_length) # aln.reference_length it used to be. Soft-trimming is out of question now. - read_counter += 1 - if VERBOSE and not (read_counter % 1000): - sys.stdout.write('%dK...' 
% (old_div(len(hits),1000))) - sys.stdout.flush() - if xa_tag and aln.has_tag('XA'): - current_row = hits[aln.qname] # we may access this hundreds of times, better do it directly - subtags = aln.get_tag('XA').split(';')[:-1] - hit_counter += len(subtags) - for subtag in subtags: - allele, pos = subtag.split(',')[:2] # subtag is like HLA02552,691,792,-,1 (id, start, end, orient, nm) - current_row[allele_id_to_index[allele]] = int(pos) # 1-based positions - - # this runs for primary and secondary alignments as well: - hits[aln.qname][aln.reference_id] = aln.reference_start + 1 # pysam reports 0-based positions - hit_counter += 1 - # num_mismatches = aln.get_tag('NM') # if we ever need suboptimal alignments... doubtful. - - if VERBOSE: - print('\n', now(), len(hits), 'reads loaded. Creating dataframe...') - pos_df = pd.DataFrame.from_dict(hits, orient='index') - pos_df.columns = sam.references[:] - details_df = pd.DataFrame.from_dict(read_details, orient='index') - details_df.columns = ['mismatches', 'read_length'] - if VERBOSE: - print(now(), 'Dataframes created. Shape: %d x %d, hits: %d (%d), sparsity: 1 in %.2f' % ( - pos_df.shape[0], pos_df.shape[1], np.sign(pos_df).sum().sum(), hit_counter, pos_df.shape[0]*pos_df.shape[1]/float(hit_counter) - )) # TODO: maybe return the binary here right away if we're using it to calculate density anyway. - return pos_df, details_df - - -def get_compact_model(hit_df, weak_hit_df=None, weight=None): -# turn a binary hit matrix dataframe into a smaller matrix DF that removes duplicate rows and -# creates the "occurence" vector with the number of rows the representative read represents. -# Note: one can pass "weak" hits (e.g., unpaired reads) and use them with a lower weight. 
- - hit_df = hit_df.loc[hit_df.any(axis=1)] # remove all-zero rows - occurence = {r[0]: len(r) for r in hit_df.groupby(hit_df.columns.tolist()).groups.values()} - - if weak_hit_df is not None: - weak_hit_df = weak_hit_df.loc[weak_hit_df.any(axis=1)] - assert 0 < weight <= 1, 'weak hit weight must be in (0, 1]' - weak_occ = {r[0]: len(r)*weight for r in weak_hit_df.groupby(weak_hit_df.columns.tolist()).groups.values()} - occurence.update(weak_occ) - unique_mtx = pd.concat([hit_df.drop_duplicates(), weak_hit_df.drop_duplicates()]) - else: - unique_mtx = hit_df.drop_duplicates() - - return unique_mtx, occurence - - -def mtx_to_sparse_dict(hit_df): - # takes a hit matrix and creates a dictionary of (read, allele):1 tuples corresponding to hits - # (needed by OptiType) - all_hits = {} - for read_id, alleles in hit_df.iterrows(): - hit_alleles = alleles[alleles!=0].index - for hit_allele in hit_alleles: - all_hits[(read_id, hit_allele)] = 1 - return all_hits - #return {(read, allele): 1 for read in hit_df.index for allele in hit_df.columns if hit_df[allele][read]>0} # awfully inefficient - - -def create_allele_dataframes(imgt_dat, fasta_gen, fasta_nuc): - from Bio import SeqIO - if VERBOSE: - print(now(), 'Loading IMGT allele dat file...') - - alleles = OrderedDict() - - with open(imgt_dat, 'rU') as handle: - for i, record in enumerate(SeqIO.parse(handle, "imgt")): - # TODO: IMGT has changed the ID system. Now it's HLA00001.1 or HLA12345.2 showing versions. I'll get rid of this now though - record.id = record.id.split('.')[0] - alleles[record.id] = record - - if VERBOSE: - print(now(), 'Initializing allele DataFrame...') - - ''' - id HLA000001 - type A*01:01:01:01 - locus A - class I --- I, II, TAP, MIC, other - flags 0 --- problems with allele stored here, see below - len_gen 3503 - len_nuc 1098 - full_gen 1 - full_nuc 1 - - flagging: sum of the below codes - +1 if HLA type ends in a letter (N, Q, etc.) 
- +2 if CDS annotation != exon annotation - +4 if features don't add up to gen sequence - +8 if exons don't add up to nuc sequence - ''' - - allele_info = 'id type 4digit locus flags len_dat len_gen len_nuc full_gen full_nuc' - - table = pd.DataFrame(index=list(alleles.keys()), columns=allele_info.split()) - sequences = [] - - if VERBOSE: - print(now(), 'Filling DataFrame with allele data...') - - all_features = [] # contains tuples: (HLA id, feature type, feature number, feature start, feature end) - - for allele in alleles.values(): - - allele_type = allele.description.replace('HLA-', '').split(',')[0] - - table.loc[allele.id]['id'] = allele.id - table.loc[allele.id]['type'] = allele_type - table.loc[allele.id]['4digit'] = ':'.join(allele_type.split(':')[:2]) - table.loc[allele.id]['locus'] = allele_type.split('*')[0] - table.loc[allele.id]['flags'] = 0 if allele_type[-1].isdigit() else 1 - table.loc[allele.id]['len_dat'] = len(str(allele.seq)) - table.loc[allele.id]['len_gen'] = 0 # TODO: IT STILL DOESNT SOLVE PERFORMANCEWARNING! 
- table.loc[allele.id]['len_nuc'] = 0 # we initialize these nulls so that we don't get a - table.loc[allele.id]['full_gen'] = 0 # PerformanceWarning + pickling when storing HDF - table.loc[allele.id]['full_nuc'] = 0 # because of NaNs (they don't map to ctypes) - sequences.append((allele.id, 'dat', str(allele.seq))) - - - # number of features in 2013-04-30 hla.dat: - # 9296 source (total # of alleles) - # 9291 CDS, 5 gene (HLA-P pseudogenes) - # 24493 exons, 3697 introns, 1027 UTRs - # CDS matches exons in 99+% of cases, some have a single base-pair extension at the end for some - # weird single-base exons or such (all on unimportant pseudogene loci) - # so we extract exons, introns and UTRs - features = [f for f in allele.features if f.type in ('exon', 'intron', 'UTR')] - - for feature in features: - if feature.type in ('exon', 'intron'): - feature_num = int(feature.qualifiers['number'][0]) - else: # UTR - # UTR either starts at the beginning or ends at the end - assert feature.location.start == 0 or feature.location.end == len(allele.seq) - feature_num = 1 if feature.location.start == 0 else 2 # 1 if 5' UTR, 2 if 3' UTR - - all_features.append( - (allele.id, feature.type, feature_num, - int(feature.location.start), int(feature.location.end), len(feature), feature_order((feature.type, feature_num))) - ) - - # small sanity check, can be commented out - cds = [f for f in allele.features if f.type == 'CDS'] - if cds: - if sum(map(len, [f for f in features if f.type == 'exon'])) != len(cds[0]): - if VERBOSE: - print("\tCDS length doesn't match sum of exons for", allele.id, allele_type) - table.loc[allele.id]['flags'] += 2 - else: - if VERBOSE: - print("\tNo CDS found for", allele.id, allele_type) - table.loc[allele.id]['flags'] += 2 - - if VERBOSE: - print(now(), 'Loading gen and nuc files...') - - with open(fasta_gen, 'r') as fasta_gen: - for record in SeqIO.parse(fasta_gen, 'fasta'): - allele_id = record.id.replace('HLA:', '') - table.loc[allele_id]['len_gen'] = 
len(record.seq) - sequences.append((allele_id, 'gen', str(record.seq))) - - with open(fasta_nuc, 'r') as fasta_nuc: - for record in SeqIO.parse(fasta_nuc, 'fasta'): - allele_id = record.id.replace('HLA:', '') - table.loc[allele_id]['len_nuc'] = len(record.seq) - sequences.append((allele_id, 'nuc', str(record.seq))) - - # convert list of tuples into DataFrame for features and sequences - all_features = pd.DataFrame(all_features, columns=['id', 'feature', 'number', 'start', 'end', 'length', 'order']) - sequences = pd.DataFrame(sequences, columns=['id', 'source', 'sequence']) - - - joined = pd.merge(table, all_features, how='inner', on='id') - - exons_for_locus = {} - - for i_locus, i_group in joined.groupby('locus'): - exons_for_locus[i_locus] = i_group[i_group['feature']=='exon']['number'].max() - - # for i_id, i_group in joined.groupby('id'): - # max_exons = exons_for_locus[table.loc[i_id]['locus']] - # if len(i_group) >= 2*max_exons-1: - # print i_id, 'is fully annotated on all exons and introns and UTRs' - - if VERBOSE: - print(now(), 'Checking dat features vs gen/nuc sequences...') - - for allele, features in joined.groupby('id'): - row = features.irow(0) # first row of the features subtable. Contains all allele information because of the join - sum_features_length = features['length'].sum() - sum_exons_length = features.loc[features['feature']=='exon']['length'].sum() - if row['len_gen']>0 and row['len_gen'] != sum_features_length: - if VERBOSE: - print("\tFeature lengths don't add up to gen sequence length", allele, row['len_gen'], sum_features_length, row['type']) - table.loc[allele]['flags'] += 4 - if row['len_nuc']>0 and row['len_nuc'] != sum_exons_length: - if VERBOSE: - print("\tExon lengths don't add up to nuc sequence length", allele, row['len_nuc'], sum_exons_length, row['type']) - table.loc[allele]['flags'] += 8 - - if VERBOSE: - print(now(), 'Sanity check finished. 
Computing feature sequences...') - - ft_seq_lookup = OrderedDict() - ft_seq_lookup['---DUMMY---'] = 0 # it will be useful later on if 0 isn't used. lookup_id*boolean operation, etc. - ft_counter = 1 - all_ft_counter = 0 - all_features['seq_id'] = 0 - for i_id, i_features in all_features.groupby('id'): - seq = sequences.loc[(sequences['id']==i_id) & (sequences['source']=='dat')].irow(0)['sequence'] - for ft_idx, feature in i_features.iterrows(): - ft_seq = seq[feature['start']:feature['end']] - all_ft_counter += 1 - if ft_seq not in ft_seq_lookup: - ft_seq_lookup[ft_seq] = ft_counter - all_features.loc[ft_idx, 'seq_id'] = ft_counter - ft_counter += 1 - else: - all_features.loc[ft_idx, 'seq_id'] = ft_seq_lookup[ft_seq] - - feature_sequences = pd.DataFrame([seq for seq in list(ft_seq_lookup.keys())], columns=['sequence']) # , index=ft_seq_lookup.values() but it's 0,1,2,... anyway, as the default - - return table, all_features, sequences, feature_sequences - - -def prune_identical_alleles(binary_mtx, report_groups=False): - # return binary_mtx.transpose().drop_duplicates().transpose() - # # faster: - hash_columns = binary_mtx.transpose().dot(np.random.rand(binary_mtx.shape[0])) # dtype np.uint16 okay here because result will be float64 - if report_groups: - grouper = hash_columns.groupby(hash_columns) - groups = {g[1].index[0]: g[1].index.tolist() for g in grouper} - alleles_to_keep = hash_columns.drop_duplicates().index # relying on keeping the first representative - # TODO: maybe return sets of alleles that were collapsed into the representative (identical patterned sets) - return binary_mtx[alleles_to_keep] if not report_groups else (binary_mtx[alleles_to_keep], groups) - - -def prune_identical_reads(binary_mtx): - # almost the same as ht.get_compact_model() except it doesn't return an occurence vector. - # It should only be used to compactify a matrix before passing it to the function finding - # overshadowed alleles. 
Final compactifying should be later done on the original binary matrix. - # - # return binary_mtx.drop_duplicates() - # # faster: - reads_to_keep = binary_mtx.dot(np.random.rand(binary_mtx.shape[1])).drop_duplicates().index # dtype np.uint16 okay because result will be float64 - return binary_mtx.loc[reads_to_keep] - - -def prune_overshadowed_alleles(binary_mtx): - # Calculates B_T*B of the (pruned) binary matrix to determine if certain alleles "overshadow" others, i.e. - # have the same hits as other alleles plus more. In this case, these "other alleles" can be thrown out early, as - # they would be never chosen over the one overshadowing them. - # - # For a 1000reads x 3600alleles matrix it takes 15 seconds. - # So you should always prune identical columns and rows before to give it a matrix as small as possible. - # So ideal usage: - # prune_overshadowed_alleles(prune_identical_alleles(prune_identical_reads(binary_mtx))) - - # np.dot() is prone to overflow and doesn't expand to int64 like np.sum(). Our binary matrices are np.uint16. - # If we have less than 65K rows in the binary mtx, we're good with uint16. We're also good if there's no - # column with 65K+ hits. We check for both with lazy 'or' to avoid calculating the sum if not needed anyway. - # If we would overflow, we change to uint32. - if (binary_mtx.shape[0] < np.iinfo(np.uint16).max) or (binary_mtx.sum(axis=0).max() < np.iinfo(np.uint16).max): - # In case we would reduce the binary mtx to np.uint8 we should to ensure it's at least uint16. 
- bb = binary_mtx if all(binary_mtx.dtypes == np.uint16) else binary_mtx.astype(np.uint16) - else: - bb = binary_mtx.astype(np.uint32) - - covariance = bb.transpose().dot(bb) - diagonal = pd.Series([covariance[ii][ii] for ii in covariance.columns], index=covariance.columns) - new_covariance = covariance[covariance.columns] - for ii in new_covariance.columns: - new_covariance[ii][ii] = 0 - overshadowed = [] - for ii in new_covariance.columns: - potential_superiors = new_covariance[ii][new_covariance[ii]==diagonal[ii]].index - if any(diagonal[potential_superiors] > diagonal[ii]): - overshadowed.append(ii) - non_overshadowed = covariance.columns.difference(overshadowed) - return non_overshadowed - -def create_paired_matrix(binary_1, binary_2, id_cleaning=None): - # id_cleaning is a function object that turns a read ID that contains pair member information (like XYZ_1234:5678/1) into an - # ID that identifies the pair itself (like XYZ_1234:5678) so we can match the two DF's IDs. In the above case a suitable - # id_cleaning function would be lambda x: x[:-2] (gets rid of /1 and /2 from the end). Sometimes it's trickier as pair member - # information can be somewhere in the middle, and sometimes it's not even required at all as both pair members have the same ID - # just in two different files (1000 genomes). 
- - if id_cleaning is not None: - binary_1.index = list(map(id_cleaning, binary_1.index)) - binary_2.index = list(map(id_cleaning, binary_2.index)) - - common_read_ids = binary_1.index.intersection(binary_2.index) - only_1 = binary_1.index.difference(binary_2.index) - only_2 = binary_2.index.difference(binary_1.index) - - b_1 = binary_1.loc[common_read_ids] - b_2 = binary_2.loc[common_read_ids] - b_12 = b_1 * b_2 # elementwise AND - b_ispaired = b_12.any(axis=1) # reads with at least one allele w/ paired hits - b_paired = b_12.loc[b_ispaired] - b_mispaired = b_1.loc[~b_ispaired] + b_2.loc[~b_ispaired] # elementwise AND where two ends only hit different alleles - b_unpaired = pd.concat([binary_1.loc[only_1], binary_2.loc[only_2]]) # concatenation for reads w/ just one end mapping anywhere - - if VERBOSE: - print(now(), ('Alignment pairing completed. %d paired, %d unpaired, %d discordant ' % - (b_paired.shape[0], b_unpaired.shape[0], b_mispaired.shape[0]))) - - return b_paired, b_mispaired, b_unpaired - - -def get_features(allele_id, features, feature_list): - # allele_id can be like HLA12345_HLA67890: then we take features from HLA12345 first if possible, then from HLA67890 - # for sequence reconstruction (in that case HLA67890 should be a nearest neighbor of HLA12345) - if '_' in allele_id: - partial_allele, complete_allele = allele_id.split('_') - else: - complete_allele = allele_id - - feats_complete = {(of['feature'], of['number']): of for _, of in features.loc[features['id']==complete_allele].iterrows()} - feats_partial = {(of['feature'], of['number']): of for _, of in features.loc[features['id']==partial_allele].iterrows()} if '_' in allele_id else feats_complete - - feats_to_include = [] - - for feat in sorted(feature_list, key=feature_order): - if feat in feats_partial: - feats_to_include.append(feats_partial[feat]) - elif feat in feats_complete: - feats_to_include.append(feats_complete[feat]) - else: - warnings.warn('Feature %s not found for allele %s' % 
(feat, allele_id)) - - return pd.DataFrame(feats_to_include) - - -def calculate_coverage(alignment, features, alleles_to_plot, features_used): - assert len(alignment) in (2, 4, 5), ("Alignment tuple either has to have 2, 4 or 5 elements. First four: pos, read_details " - "once or twice depending on single or paired end, and an optional binary DF at the end for PROPER paired-end plotting") - has_pairing_info = (len(alignment) == 5) - - if len(alignment) == 2: - matrix_pos, read_details = alignment[:2] - pos = matrix_pos[alleles_to_plot] - pairing = np.sign(pos) * 2 # see explanation on the else branch. These mean unpaired hits. - hit_counts = np.sign(pos).sum(axis=1) # how many of the selected alleles does a particular read hit? (unambiguous hit or not) - # if only a single allele is fed to this function that has no hits at all, we still want to create a null-matrix - # for it. Its initialization depends on max_ambiguity (it's the size of one of its dimensions) so we set this to 1 at least. - max_ambiguity = max(1, hit_counts.max()) - to_process = [(pos, read_details, hit_counts, pairing)] - elif len(alignment) == 4: # TODO: deprecated. We don't use this, it should probably be taken out. 
- matrix_pos1, read_details1, matrix_pos2, read_details2 = alignment - pos1 = matrix_pos1[alleles_to_plot] - pos2 = matrix_pos2[alleles_to_plot] - pairing1 = np.sign(pos1) * 2 # every read is considered unpaired - pairing2 = np.sign(pos2) * 2 - hit_counts1 = np.sign(pos1).sum(axis=1) - hit_counts2 = np.sign(pos2).sum(axis=1) - max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max()) - to_process = [(pos1, read_details1, hit_counts1, pairing1), (pos2, read_details2, hit_counts2, pairing2)] - else: - matrix_pos1, read_details1, matrix_pos2, read_details2, pairing_binaries = alignment - bin_p, bin_u, bin_m = pairing_binaries - pos1 = matrix_pos1[alleles_to_plot] - pos2 = matrix_pos2[alleles_to_plot] - # pairing's values - 0: no hit, 1: paired hit, 2: unpaired hit, 3: discordantly paired hit - pairing = pd.concat([bin_p[alleles_to_plot], bin_u[alleles_to_plot]*2, bin_m[alleles_to_plot]*3]) - pairing1 = pairing.loc[pos1.index] # pairing information for first ends' hit matrix - pairing2 = pairing.loc[pos2.index] # pairing information for second ends' hit matrix - hit_counts1 = np.sign(pairing1).sum(axis=1) - hit_counts2 = np.sign(pairing2).sum(axis=1) - max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max()) - to_process = [(pos1, read_details1, hit_counts1, pairing1), (pos2, read_details2, hit_counts2, pairing2)] - - coverage_matrices = [] - - for allele in alleles_to_plot: - allele_features = get_features(allele, features, features_used) - allele_length = allele_features['length'].sum() - - # Dimensions: - # 1st - 0: perfect hit, 1: hit w/ mismatch(es) - # 2nd - 0: paired, 1: unpaired, 2: discordantly paired # maybe introduce unpaired despite being paired on other alleles - # 3rd - 0: unique hit, 1: ambiguous hit between two alleles, ... n: ambiguous bw/ n+1 alleles - # 4th - 0 ... 
[sequence length] - coverage = np.zeros((2, 3, max_ambiguity, allele_length), dtype=int) - - # to_process is a list containing tuples with mapping/position/hit property dataframes to superimpose. - # For single-end plotting to_process has just one element. For paired end, to_process has two elements that - # were pre-processed to only plot reads where both pairs map, etc. but the point is that superimposing the two - # will provide the correct result. - - for pos, read_details, hit_counts, pairing_info in to_process: - reads = pos[pos[allele]!=0].index # reads hitting allele: coverage plot will be built using these - for i_pos, i_read_length, i_mismatches, i_hitcount, i_pairing in zip( - pos.loc[reads][allele], - read_details.loc[reads]['read_length'], - read_details.loc[reads]['mismatches'], - hit_counts[reads], - pairing_info.loc[reads][allele]): - if not i_pairing: - continue # or i_pairing = 4. Happens if one end maps to the allele but a different allele has a full paired hit. - coverage[int(bool(i_mismatches))][i_pairing-1][i_hitcount-1][i_pos-1:i_pos-1+i_read_length] += 1 - - coverage_matrices.append((allele, coverage)) - return coverage_matrices - - -def plot_coverage(outfile, coverage_matrices, allele_data, features, features_used, columns=2): - - def start_end_zeros(cov_array): - # the areaplot polygon gets screwed up if the series don't end/start with zero - # this adds a zero to the sides to circumvent that - return np.append(np.append([0], cov_array), [0]) - - def allele_sorter(allele_cov_mtx_tuple): - allele, _ = allele_cov_mtx_tuple - return allele_data.loc[allele.split('_')[0]]['type'] - - def get_allele_locus(allele): - return allele_data.loc[allele.split('_')[0]]['locus'] - - number_of_loci = len(set((get_allele_locus(allele) for allele, _ in coverage_matrices))) - - dpi = 50 - box_size = (7, 1) - - # subplot_rows = len(coverage_matrices)/columns + bool(len(coverage_matrices) % columns) # instead of float division + rounding up - # subplot_rows = 3 
# TODO - subplot_rows = 3 * number_of_loci + 1 # rowspan=3 for each plot, legend at the bottom with third height and colspan=2 - - area_colors = [ # stacked plot colors - (0.26, 0.76, 0.26), # perfect, paired, unique - (0.40, 0.84, 0.40), # perfect, paired, shared - - (0.99, 0.75, 0.20), # perfect, unpaired, unique - (0.99, 0.75, 0.20), # perfect, mispaired, unique - (0.99, 0.85, 0.35), # perfect, unpaired, shared - (0.99, 0.85, 0.35), # perfect, mispaired, shared - - (0.99, 0.23, 0.23), # mismatch, paired, unique - (0.99, 0.49, 0.49), # mismatch, paired, shared - - (0.14, 0.55, 0.72), # mismatch, unpaired, unique - (0.14, 0.55, 0.72), # mismatch, mispaired, unique - (0.33, 0.70, 0.88), # mismatch, unpaired, shared - (0.33, 0.70, 0.88)] # mismatch, mispaired, shared - - figure = pylab.figure(figsize=(box_size[0]*columns, box_size[1]*subplot_rows), dpi=dpi) # TODO: dpi doesn't seem to do shit. Is it stuck in 100? - - coverage_matrices = sorted(coverage_matrices, key=allele_sorter) # so that the A alleles come first, then B, and so on. - prev_locus = '' - i_locus = -1 - - for allele, coverage in coverage_matrices: - - if '_' in allele: - partial, complete = allele.split('_') - plot_title = '%s (introns from %s)' % (allele_data.loc[partial]['type'], allele_data.loc[complete]['type']) # , allele for debugging (original ID) - else: - plot_title = allele_data.loc[allele]['type'] # + allele for debugging original ID - - if prev_locus != get_allele_locus(allele): # new locus, start new row - i_locus += 1 - i_allele_in_locus = 0 - else: - i_allele_in_locus = 1 - - prev_locus = get_allele_locus(allele) - - plot = pylab.subplot2grid((subplot_rows, columns), (3*i_locus, i_allele_in_locus), rowspan=3, adjustable='box') - - _, _, max_ambig, seq_length = coverage.shape # first two dimensions known (mismatched[2], pairing[3]) - - shared_weighting = np.reciprocal(np.arange(max_ambig)+1.0) # --> 1, 1/2, 1/3... 
- shared_weighting[0] = 0 # --> 0, 1/2, 1/3, so the unique part doesn't get mixed in - - perfect_paired_unique = start_end_zeros(coverage[0][0][0]) - mismatch_paired_unique = start_end_zeros(coverage[1][0][0]) - perfect_unpaired_unique = start_end_zeros(coverage[0][1][0]) - mismatch_unpaired_unique = start_end_zeros(coverage[1][1][0]) - perfect_mispaired_unique = start_end_zeros(coverage[0][2][0]) - mismatch_mispaired_unique = start_end_zeros(coverage[1][2][0]) - - perfect_paired_shared = start_end_zeros(shared_weighting.dot(coverage[0][0])) - mismatch_paired_shared = start_end_zeros(shared_weighting.dot(coverage[1][0])) - perfect_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][1])) - mismatch_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][1])) - perfect_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][2])) - mismatch_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][2])) - - # Exon annotation - i_start = 1 # position of last feature's end. 
It's one because we padded with zeros above - for _, ft in get_features(allele, features, features_used).iterrows(): - if ft['feature'] == 'exon': - plot.axvspan(i_start, i_start + ft['length'], facecolor='black', alpha=0.1, linewidth=0, zorder=1) - i_start += ft['length'] - - - areas = plot.stackplot(np.arange(seq_length+2), # seq_length+2 because of the zero padding at either end - perfect_paired_unique + 0.001, # so that 0 isn't -inf on the logplot, but still below cutoff - perfect_paired_shared, - - perfect_unpaired_unique, - perfect_mispaired_unique, - perfect_unpaired_shared, - perfect_mispaired_shared, - - mismatch_paired_unique, - mismatch_paired_shared, - - mismatch_unpaired_unique, - mismatch_mispaired_unique, - mismatch_unpaired_shared, - mismatch_mispaired_shared, - - linewidth=0, colors=area_colors, zorder=5) - - for aa in areas: - # if you output to pdf it strangely doesn't respect linewidth=0 perfectly, you'll have - # to set line colors identical to area color to avoid having a black line between them - aa.set_edgecolor(aa.get_facecolor()) - - plot.tick_params(axis='both', labelsize=10, direction='out', which='both', top=False) - - plot.text(.015, 0.97, plot_title, horizontalalignment='left', verticalalignment='top', transform=plot.transAxes, fontsize=10, zorder=6) - _, _, _, y2 = plot.axis() - plot.axis((0, seq_length, 1, y2)) # enforce y axis minimum at 10^0. 
This corresponds to zero coverage because of the +1 above - plot.set_yscale('log') - plot.set_ylim(bottom=0.5) - - legend = pylab.subplot2grid((subplot_rows, columns), (subplot_rows-1, 0), colspan=2, adjustable='box') - ppp = pylab.matplotlib.patches - legend.add_patch(ppp.Rectangle((0, 2), 2, 2, color=area_colors[0])) - legend.add_patch(ppp.Rectangle((0, 0), 2, 2, color=area_colors[1])) - legend.add_patch(ppp.Rectangle((25, 2), 2, 2, color=area_colors[2])) - legend.add_patch(ppp.Rectangle((25, 0), 2, 2, color=area_colors[4])) - legend.add_patch(ppp.Rectangle((50, 2), 2, 2, color=area_colors[6])) - legend.add_patch(ppp.Rectangle((50, 0), 2, 2, color=area_colors[7])) - legend.add_patch(ppp.Rectangle((75, 2), 2, 2, color=area_colors[8])) - legend.add_patch(ppp.Rectangle((75, 0), 2, 2, color=area_colors[10])) - legend.text( 2.5, 3, 'paired, no mismatches, unique', va='center', size='smaller') - legend.text( 2.5, 1, 'paired, no mismatches, ambiguous', va='center', size='smaller') - legend.text(27.5, 3, 'unpaired, no mismatches, unique', va='center', size='smaller') - legend.text(27.5, 1, 'unpaired, no mismatches, ambiguous', va='center', size='smaller') - legend.text(52.5, 3, 'paired, mismatched, unique', va='center', size='smaller') - legend.text(52.5, 1, 'paired, mismatched, ambiguous', va='center', size='smaller') - legend.text(77.5, 3, 'unpaired, mismatched, unique', va='center', size='smaller') - legend.text(77.5, 1, 'unpaired, mismatched, ambiguous', va='center', size='smaller') - legend.set_xlim(0, 100) - legend.set_ylim(0, 4) - legend.axison = False - - figure.tight_layout() - figure.savefig(outfile) diff --git a/model.py b/model.py deleted file mode 100644 index f6ce188..0000000 --- a/model.py +++ /dev/null @@ -1,496 +0,0 @@ -""" -Created on Jul 19, 2013 - - -This class represents the OptiType model for HLA typing based on NGS data - -It is dependent on Coopr and uses an external ILP solver such as GLPK or CPLEX - - - -@author: Benjamin Schubert -""" - -from 
__future__ import division -from __future__ import print_function -from builtins import str -from builtins import range -from builtins import object -from pyomo.environ import ConcreteModel, Set, Param, Var, Binary, Objective, Constraint, ConstraintList, maximize -from pyomo.opt import SolverFactory, TerminationCondition -from collections import defaultdict -import pandas as pd -import itertools - - -class OptiType(object): - """ - classdocs - - """ - - def __init__(self, cov, occ, groups_4digit, allele_table, beta, t_max_allele=2, solver="glpk", threads=1, - verbosity=0): - """ - Constructor - """ - - self.__allele_table = allele_table - self.__beta = float(beta) - self.__t_max_allele = t_max_allele - self.__solver = SolverFactory(solver) - self.__threads = threads - self.__opts = {"threads": threads} if threads > 1 else {} - self.__verbosity = verbosity - self.__changed = True # model needs to know if it changed from last run or not - self.__ks = 1 - self.__groups_4digit = groups_4digit - - loci_alleles = defaultdict(list) - for type_4digit, group_alleles in groups_4digit.items(): - # print type_4digit, group_alleles - loci_alleles[type_4digit.split('*')[0]].extend(group_alleles) - - loci = loci_alleles - - self.__allele_to_4digit = {allele: type_4digit for type_4digit, group in groups_4digit.items() for allele in - group} - - ''' - generates the basic ILP model - ''' - - model = ConcreteModel() - - # init Sets - model.LociNames = Set(initialize=list(loci.keys())) - model.Loci = Set(model.LociNames, initialize=lambda m, l: loci[l]) - - L = list(itertools.chain(*list(loci.values()))) - reconst = {allele_id: 0.01 for allele_id in L if '_' in allele_id} - R = set([r for (r, _) in list(cov.keys())]) - model.L = Set(initialize=L) - model.R = Set(initialize=R) - - # init Params - model.cov = Param(model.R, model.L, initialize=lambda model, r, a: cov.get((r, a), 0)) - model.reconst = Param(model.L, initialize=lambda model, a: reconst.get(a, 0)) - - model.occ = 
Param(model.R, initialize=occ) - model.t_allele = Param(initialize=self.__t_max_allele, mutable=True) - - model.beta = Param(initialize=self.__beta, - validate=lambda val, model: 0.0 <= float(self.__beta) <= 0.999, - mutable=True) - model.nof_loci = Param(initialize=len(loci)) - - # init variables - model.x = Var(model.L, domain=Binary) - model.y = Var(model.R, domain=Binary) - - model.re = Var(model.R, bounds=(0.0, None)) - model.hetero = Var(bounds=(0.0, model.nof_loci)) - - # init objective - model.read_cov = Objective( - rule=lambda model: sum(model.occ[r] * (model.y[r] - model.beta * (model.re[r])) for r in model.R) - sum( - model.reconst[a] * model.x[a] for a in model.L), sense=maximize) - - # init Constraints - model.max_allel_selection = Constraint(model.LociNames, rule=lambda model, l: sum( - model.x[a] for a in model.Loci[l]) <= model.t_allele) - model.min_allel_selection = Constraint(model.LociNames, - rule=lambda model, l: sum(model.x[a] for a in model.Loci[l]) >= 1) - model.is_read_cov = Constraint(model.R, - rule=lambda model, r: sum(model.cov[r, a] * model.x[a] for a in model.L) >= - model.y[r]) - model.heterozygot_count = Constraint( - rule=lambda model: model.hetero >= sum(model.x[a] for a in model.L) - model.nof_loci) - - # regularization constraints - model.reg1 = Constraint(model.R, rule=lambda model, r: model.re[r] <= model.nof_loci * model.y[r]) - model.reg2 = Constraint(model.R, rule=lambda model, r: model.re[r] <= model.hetero) - model.reg3 = Constraint(model.R, - rule=lambda model, r: model.re[r] >= model.hetero - model.nof_loci * (1 - model.y[r])) - - # generate constraint list for solution enumeration - model.c = ConstraintList() - # Generate instance. Used to be .create() but deprecated since, - # as ConcreteModels are instances on their own now. 
- self.__instance = model - - def set_beta(self, beta): - """ - Sets the parameter beta - """ - self.__changed = True - getattr(self.__instance, str(self.__instance.beta)).set_value(float(beta)) - - def set_t_max_allele(self, t_max_allele): - """ - Sets the upper bound of alleles selected per loci - """ - self.__changed = True - getattr(self.__instance, str(self.__instance.t_allele)).set_value(t_max_allele) - - def solve(self, ks): - """ - solves the problem k times and discards the found solutions in the next run. - """ - d = defaultdict(list) # in there we store the typing +objective and generate afterwards a DatarFrame with it - - if self.__changed or self.__ks != ks: - self.__ks = ks - for k in range(ks): - expr = 0 - - self.__instance.preprocess() - try: - res = self.__solver.solve(self.__instance, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. Falling back to single-threading.") - res = self.__solver.solve(self.__instance, options={}, tee=self.__verbosity) - self.__instance.solutions.load_from(res) # solution loading changed recently. - - # if self.__verbosity > 0: - # res.write(num=1) - - if res.solver.termination_condition != TerminationCondition.optimal: - print("Optimal solution hasn't been obtained. 
This is a terminal problem.") # TODO message, exit - break - - selected = [] - indices = [] - encountered_4digit = [] - for j in self.__instance.x: - if self.__allele_to_4digit[j][0] in 'HJG': - if 0.99 <= self.__instance.x[j].value <= 1.01: - selected.append(j) - indices.append(j) - continue - if 0.99 <= self.__instance.x[j].value <= 1.01: - selected.append(j) - exp_i = 0 - exp_i += self.__instance.x[j] - if self.__allele_to_4digit[j] in encountered_4digit: - continue - encountered_4digit.append(self.__allele_to_4digit[j]) - for i_allele in self.__groups_4digit[self.__allele_to_4digit[j]]: - if self.__instance.x[i_allele].value <= 0: - exp_i += self.__instance.x[i_allele] - indices.append(i_allele) - expr += (1.0 - exp_i) - zero_indices = set([j for j in self.__instance.x]).difference(set(indices)) - for j in zero_indices: - expr += self.__instance.x[j] - - self.__instance.c.add(expr >= 1) - - # if self.__verbosity > 0: - # print selected - # self.__instance.c.pprint() - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(selected[i]) - d[aas[i] + "2"].append(selected[i]) - else: - d[aas[i] + str(c[aas[i]])].append(selected[i]) - c[aas[i]] += 1 - - nof_reads = sum((self.__instance.occ[j] * self.__instance.y[j].value for j in self.__instance.y)) - # if self.__verbosity > 0: - # print "Obj", res.Solution.Objective.__default_objective__.Value - d['obj'].append(self.__instance.read_cov()) - d['nof_reads'].append(nof_reads) - - self.__instance.c.clear() - self.__changed = False - self.__enumeration = pd.DataFrame(d) - - # self.__rank() - return self.__enumeration - else: - return self.__enumeration - - def solve_for_k_alleles(self, k, ks=1): - """ - EXPERIMENTAL! 
- - generates a solution without the regularization term and only k selected alleles - """ - if k < int(self.__instance.nof_loci.value) or k > int(self.__instance.nof_loci.value * self.__t_max_allele): - raise Warning("k " + str(k) + " is out of range [" + str(self.__instance.nof_loci.value) + "," + str( - self.__instance.nof_loci * self.__t_max_allele) + "]") - - - # copy the instance - inst = self.__instance.clone() - # set beta = 0 # because we do homozygosity calling manually - getattr(inst, str(inst.beta)).set_value(float(0.0)) - - inst.del_component("heterozygot_count") - inst.del_component("reg1") - inst.del_component("reg2") - inst.del_component("reg3") - # generate constraint which allows only k alleles to be selected - expr1 = 0 - for j in inst.x: - expr1 += inst.x[j] - - inst.c.add(expr1 == k) - d = defaultdict(list) - - for _ in range(ks): - inst.preprocess() - try: - res = self.__solver.solve(inst, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. Falling back to single-threading.") - res = self.__solver.solve(inst, options={}, tee=self.__verbosity) - inst.solutions.load_from(res) - - if self.__verbosity > 0: - res.write(num=1) - - if res.solver.termination_condition != TerminationCondition.optimal: - print("Optimal solution hasn't been obtained. 
This is a terminal problem.") # TODO message, exit - break - - selected = [] - expr = 0 - - indices = [] - encountered_4digit = [] - for j in inst.x: - if 0.99 <= inst.x[j].value <= 1.01: - exp_i = 0 - selected.append(j) - exp_i += inst.x[j] - if self.__allele_to_4digit[j] in encountered_4digit: - continue - - encountered_4digit.append(self.__allele_to_4digit[j]) - for i_allele in self.__groups_4digit[self.__allele_to_4digit[j]]: - if inst.x[i_allele].value <= 0: - exp_i += inst.x[i_allele] - indices.append(i_allele) - expr += (1 - exp_i) - zero_indices = set([j for j in inst.x]).difference(set(indices)) - for j in zero_indices: - expr += inst.x[j] - - inst.c.add(expr >= 1) - - if self.__verbosity > 0: - print(selected) - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(selected[i]) - d[aas[i] + "2"].append(selected[i]) - else: - d[aas[i] + str(c[aas[i]])].append(selected[i]) - c[aas[i]] += 1 - - # print "Obj", res.Solution.Objective.__default_objective__.Value - nof_reads = sum((inst.occ[j] * inst.y[j].value for j in inst.y)) - d['obj'].append(inst.read_cov()) - d['nof_reads'].append(nof_reads) - - return pd.DataFrame(d) - - def solve_fixed_typing(self, fixed_alleles): - """ - EXPERIMENTAL! 
- - forces the allele to pic a 4-digit of the provided alleles - """ - k = len(set(fixed_alleles)) - if k < int(self.__instance.nof_loci.value) or k > int(self.__instance.nof_loci.value * self.__t_max_allele): - raise Warning("k " + str(k) + " is out of range [" + str(self.__instance.nof_loci.value) + "," + str( - self.__instance.nof_loci * self.__t_max_allele) + "]") - - - # copy the instance - inst = self.__instance.clone() - # set beta = 0 because we do homozygocity calling manually - getattr(inst, str(inst.beta)).set_value(float(0.0)) - - inst.del_component("heterozygot_count") - inst.del_component("reg1") - inst.del_component("reg2") - inst.del_component("reg3") - # generate constraint which allows only k alleles to be selected - expr1 = 0 - for j in inst.x: - expr1 += inst.x[j] - inst.c.add(expr1 == k) - - # generate for each of the provided alleles the fixation constraint: - for a in set(fixed_alleles): - expr_f = 0 - print(self.__groups_4digit) - for ids in self.__groups_4digit[a]: - print(ids) - expr_f += inst.x[ids] - inst.c.add(expr_f == 1) - - d = defaultdict(list) - - inst.preprocess() - try: - res = self.__solver.solve(inst, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. 
Falling back to single-threading.") - res = self.__solver.solve(inst, options={}, tee=self.__verbosity) - inst.solutions.load_from(res) - - opt_ids = [j for j in inst.x if 0.99 <= inst.x[j].value <= 1.01] - - aas = [self.__allele_to_4digit[x].split('*')[0] for x in opt_ids] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(opt_ids[i]) - d[aas[i] + "2"].append(opt_ids[i]) - else: - d[aas[i] + str(c[aas[i]])].append(opt_ids[i]) - c[aas[i]] += 1 - nof_reads = sum((inst.occ[j] * inst.y[j].value for j in inst.y)) - d['obj'].append(self.inst.read_cov()) - d['nof_reads'].append(nof_reads) - - return pd.DataFrame(d) - - def enumerate_allele_wise(self): - """ - EXPERIMENTAL! - - fixes all but one allele and solves it again to investigate the influence of this - particular allele on the objective value. - """ - d = defaultdict(list) - - self.__instance.preprocess() - try: - res = self.__solver.solve(self.__instance, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. 
Falling back to single-threading.") - res = self.__solver.solve(self.__instance, options={}, tee=self.__verbosity) - self.__instance.solutions.load_from(res) - - opt_ids = [j for j in self.__instance.x if 0.99 <= self.__instance.x[j].value <= 1.01] - - aas = [self.__allele_to_4digit[x].split('*')[0] for x in opt_ids] - c = dict.fromkeys(aas, 1) - for i in range(len(aas)): - if aas.count(aas[i]) < 2: - d[aas[i] + "1"].append(opt_ids[i]) - d[aas[i] + "2"].append(opt_ids[i]) - else: - d[aas[i] + str(c[aas[i]])].append(opt_ids[i]) - c[aas[i]] += 1 - nof_reads = sum((self.__instance.occ[j] * self.__instance.y[j].value for j in self.__instance.y)) - d['obj'].append(self.__instance.read_cov()) - d['nof_reads'].append(nof_reads) - d['discarded'].append(0) - - for j in opt_ids: - if self.__verbosity > 0: - self.__instance.c.pprint() - self.__instance.c.clear() - # fix all but j'th variable - fix = 0 - for i in opt_ids: - if i != j: - fix += (1 - self.__instance.x[i]) - self.__instance.c.add(fix == 0.0) - - # discard j'th allele and all its 4digit equivalent alleles form the next solution - discard = 0 - for k in self.__groups_4digit[self.__allele_to_4digit[j]]: - discard += self.__instance.x[k] - self.__instance.c.add(discard == 0.0) - - # solve with new constraints - self.__instance.preprocess() - try: - res = self.__solver.solve(self.__instance, tee=self.__verbosity) # ,tee=True) verbose solvinf - self.__instance.solutions.load_from(res) - except: - print(Warning("There is no replacement for allele " + self.__allele_to_4digit[j])) - continue - - selected = [al for al in self.__instance.x if 0.99 <= self.__instance.x[al].value <= 1.01] - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for q in range(len(aas)): - if aas.count(aas[q]) < 2: - d[aas[q] + "1"].append(selected[q]) - d[aas[q] + "2"].append(selected[q]) - else: - d[aas[q] + str(c[aas[q]])].append(selected[q]) - c[aas[q]] += 1 - nof_reads = 
sum((self.__instance.occ[h] * self.__instance.y[h].value for h in self.__instance.y)) - d['obj'].append(self.__instance.read_cov()) - d['nof_reads'].append(nof_reads) - d['discarded'].append(j) - return pd.DataFrame(d) - - def solve_enforced_zygosity(self, gosity_dict): - """ - EXPERIMENTAL! - - solves the ilp without regularization but enforced homo/heterozygosity for each locus - @param gosity_dict: a dictionary with all loci as keys and value = number of alleles per locus (default is 2) - """ - - inst = self.__instance.clone() - # set beta = 0 because we do homozygocity calling manually - getattr(inst, str(inst.beta)).set_value(float(0.0)) - - inst.del_component("heterozygot_count") - inst.del_component("reg1") - inst.del_component("reg2") - inst.del_component("reg3") - - # now delete max_allele_constraint and reconstruct it again - inst.del_component("max_allel_selection") - for locus in inst.LociNames: - cons = 0 - for a in inst.Loci[locus]: - cons += inst.x[a] - inst.c.add(cons <= gosity_dict.get(locus, 2)) - - d = defaultdict(list) - - inst.preprocess() - try: - res = self.__solver.solve(inst, options=self.__opts, tee=self.__verbosity) - except: - print ("WARNING: Solver does not support multi-threading. Please change the config" - " file accordingly. 
Falling back to single-threading.") - res = self.__solver.solve(inst, options={}, tee=self.__verbosity) - inst.solutions.load_from(res) - - selected = [al for al in inst.x if 0.99 <= inst.x[al].value <= 1.01] - aas = [self.__allele_to_4digit[x].split('*')[0] for x in selected] - c = dict.fromkeys(aas, 1) - for q in range(len(aas)): - if aas.count(aas[q]) < 2: - d[aas[q] + "1"].append(selected[q]) - d[aas[q] + "2"].append(selected[q]) - else: - d[aas[q] + str(c[aas[q]])].append(selected[q]) - c[aas[q]] += 1 - nof_reads = sum((inst.occ[h] * inst.y[h].value for h in inst.y)) - d['obj'].append(inst.read_cov()) - d['nof_reads'].append(nof_reads) - return pd.DataFrame(d) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5dff81f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,115 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "optitype" +version = "1.5.0" +description = "Precision HLA typing from next-generation sequencing data" +readme = "README.md" +license = {text = "BSD-3-Clause"} +requires-python = ">=3.10" +authors = [ + {name = "Andras Szolek"}, + {name = "Benjamin Schubert"}, + {name = "Christopher Mohr"}, + {name = "Jonas Scheid"}, +] +maintainers = [ + {name = "OptiType Contributors"} +] +keywords = [ + "HLA", + "typing", + "bioinformatics", + "NGS", + "next-generation-sequencing", + "immunogenetics", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "click>=8.0", + "numpy>=1.23", + "pandas>=2.0", + "pyarrow>=12.0", + "pyomo>=6.6", + "pysam>=0.21", + 
"matplotlib>=3.5", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "pytest-cov>=4.0", + "ruff>=0.1.0", + "mypy>=1.0", + "build>=1.0", + "twine>=4.0", +] + +[project.scripts] +optitype = "optitype.cli:main" + +[project.urls] +Homepage = "https://github.com/FRED-2/OptiType" +Documentation = "https://github.com/FRED-2/OptiType#readme" +Repository = "https://github.com/FRED-2/OptiType" +Issues = "https://github.com/FRED-2/OptiType/issues" + +[tool.hatch.build.targets.wheel] +packages = ["src/optitype"] + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/data", + "/tests", + "/README.md", + "/LICENSE", +] + +[tool.hatch.build.targets.wheel.shared-data] +"data/alleles" = "share/optitype/data/alleles" +"data/hla_reference_dna.fasta" = "share/optitype/data/hla_reference_dna.fasta" +"data/hla_reference_rna.fasta" = "share/optitype/data/hla_reference_rna.fasta" + +[tool.ruff] +target-version = "py310" +line-length = 100 +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "UP", # pyupgrade +] +ignore = [ + "E501", # line too long - handled by formatter +] + +[tool.ruff.isort] +known-first-party = ["optitype"] + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = "-v --tb=short" diff --git a/src/optitype/__init__.py b/src/optitype/__init__.py new file mode 100644 index 0000000..3be77c7 --- /dev/null +++ b/src/optitype/__init__.py @@ -0,0 +1,19 @@ +""" +OptiType: Precision HLA typing from next-generation sequencing data. + +OptiType is a novel HLA genotyping algorithm based on integer linear +programming, capable of producing accurate 4-digit HLA genotyping predictions +from NGS data by simultaneously selecting all minor and major HLA-I alleles. 
+""" + +__version__ = "1.5.0" +__author__ = "Andras Szolek, Benjamin Schubert, Christopher Mohr, Jonas Scheid" + +from optitype.api import HLATypingConfig, HLATypingResult, run_hla_typing + +__all__ = [ + "__version__", + "run_hla_typing", + "HLATypingResult", + "HLATypingConfig", +] diff --git a/src/optitype/__main__.py b/src/optitype/__main__.py new file mode 100644 index 0000000..e05ae51 --- /dev/null +++ b/src/optitype/__main__.py @@ -0,0 +1,6 @@ +"""Allow running optitype as a module: python -m optitype""" + +from optitype.cli import main + +if __name__ == "__main__": + main() diff --git a/src/optitype/_logging.py b/src/optitype/_logging.py new file mode 100644 index 0000000..6112e5c --- /dev/null +++ b/src/optitype/_logging.py @@ -0,0 +1,28 @@ +"""Logging configuration for OptiType.""" + +import logging +import time + +# Package-level logger +logger = logging.getLogger("optitype") + +# Record start time at import for elapsed-time display +_start_time = time.monotonic() + + +def elapsed() -> str: + """Return elapsed time since package load as H:MM:SS.ss.""" + delta = time.monotonic() - _start_time + minutes, seconds = divmod(delta, 60) + hours, minutes = divmod(int(minutes), 60) + return f"{hours}:{minutes:02d}:{seconds:05.2f}" + + +def configure_logging(verbose: bool) -> None: + """Configure the optitype logger for CLI or API use.""" + level = logging.DEBUG if verbose else logging.WARNING + logger.setLevel(level) + if not logger.handlers: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(handler) diff --git a/src/optitype/api.py b/src/optitype/api.py new file mode 100644 index 0000000..de005a7 --- /dev/null +++ b/src/optitype/api.py @@ -0,0 +1,191 @@ +""" +Programmatic API for OptiType. + +This module provides a clean Python API for running HLA typing analyses. 
+""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import pandas as pd + +from optitype.pipeline import PipelineConfig, load_config, run_pipeline + + +@dataclass +class HLATypingConfig: + """ + Configuration for HLA typing analysis. + + Attributes: + solver: ILP solver to use ('glpk', 'cbc', 'cplex'). + threads: Number of threads for ILP solving. + mapping_threads: Number of threads for read mapping. + razers3_path: Path to RazerS3 binary. + beta: Homozygosity detection parameter (0.0 to 0.1). + enumerate_count: Number of solutions to enumerate. + delete_bam: Whether to delete intermediate BAM files. + unpaired_weight: Weight for unpaired reads in paired-end mode. + use_discordant: Whether to use discordant read pairs. + """ + + solver: str = "glpk" + threads: int = 1 + mapping_threads: int = 4 + razers3_path: str = "razers3" + beta: float = 0.009 + enumerate_count: int = 1 + delete_bam: bool = True + unpaired_weight: float = 0.0 + use_discordant: bool = False + + def to_pipeline_config(self) -> PipelineConfig: + """Convert to internal PipelineConfig.""" + return PipelineConfig( + razers3_path=self.razers3_path, + mapping_threads=self.mapping_threads, + solver=self.solver, + ilp_threads=self.threads, + delete_bam=self.delete_bam, + unpaired_weight=self.unpaired_weight, + use_discordant=self.use_discordant, + ) + + +@dataclass +class HLATypingResult: + """ + Results from HLA typing analysis. + + Attributes: + best_result: Dictionary mapping locus positions to allele types. + all_results: DataFrame with all enumerated results. + reads: Number of reads explained by the best solution. + objective: Objective function value of the best solution. + output_csv: Path to the output CSV file. + output_plot: Path to the coverage plot PDF. 
+ """ + + best_result: dict[str, str | None] + all_results: pd.DataFrame + reads: float + objective: float + output_csv: str | None = None + output_plot: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert result to a dictionary.""" + return { + "typing": self.best_result, + "reads": self.reads, + "objective": self.objective, + "output_csv": self.output_csv, + "output_plot": self.output_plot, + } + + def __repr__(self) -> str: + types = [f"{k}: {v}" for k, v in self.best_result.items() if v is not None] + return f"HLATypingResult({', '.join(types)}, reads={self.reads:.0f})" + + +def run_hla_typing( + fastq_files: str | list[str], + seq_type: str, + output_dir: str | Path | None = None, + config: HLATypingConfig | None = None, + config_file: str | Path | None = None, + verbose: bool = False, +) -> HLATypingResult: + """ + Run HLA typing analysis on FASTQ or BAM files. + + This is the main entry point for programmatic use of OptiType. + + Args: + fastq_files: Path to input file(s). Single file for single-end, + list of two files for paired-end. + seq_type: Sequencing type ('dna' or 'rna'). + output_dir: Output directory. If None, uses current directory. + config: HLATypingConfig with analysis settings. If None, uses defaults. + config_file: Path to INI config file for additional settings. + verbose: Enable verbose output. + + Returns: + HLATypingResult with typing results. + + Raises: + ValueError: If inputs are invalid. + FileNotFoundError: If input files or reference data not found. + + Example: + >>> from optitype import run_hla_typing, HLATypingConfig + >>> result = run_hla_typing( + ... fastq_files=["sample_1.fastq", "sample_2.fastq"], + ... seq_type="dna", + ... config=HLATypingConfig(solver="cbc", threads=4) + ... 
) + >>> print(result.best_result) + {'A1': 'A*02:01', 'A2': 'A*03:01', 'B1': 'B*07:02', ...} + """ + # Normalize input files to list + if isinstance(fastq_files, str): + input_files = [fastq_files] + else: + input_files = list(fastq_files) + + # Validate inputs + for f in input_files: + if not Path(f).exists(): + raise FileNotFoundError(f"Input file not found: {f}") + + # Set up output directory + if output_dir is None: + output_dir = Path.cwd() / "optitype_output" + else: + output_dir = Path(output_dir) + + # Set up configuration + if config is None: + config = HLATypingConfig() + + # Load config file if provided + pipeline_config = config.to_pipeline_config() + if config_file: + file_config = load_config(config_file) + # Merge file config with API config (API takes precedence) + if config.razers3_path == "razers3": + pipeline_config.razers3_path = file_config.razers3_path + if config.solver == "glpk": + pipeline_config.solver = file_config.solver + + # Run pipeline + result = run_pipeline( + input_files=input_files, + seq_type=seq_type, + output_dir=str(output_dir), + beta=config.beta, + enumerate_count=config.enumerate_count, + config=pipeline_config, + verbose=verbose, + ) + + # Extract best result + best_row = result.result_4digit.iloc[0] + best_result = { + "A1": best_row.get("A1"), + "A2": best_row.get("A2"), + "B1": best_row.get("B1"), + "B2": best_row.get("B2"), + "C1": best_row.get("C1"), + "C2": best_row.get("C2"), + } + + return HLATypingResult( + best_result=best_result, + all_results=result.result_4digit, + reads=best_row.get("nof_reads", 0), + objective=best_row.get("obj", 0), + output_csv=result.output_csv, + output_plot=result.output_plot, + ) diff --git a/src/optitype/cli.py b/src/optitype/cli.py new file mode 100644 index 0000000..f603ab1 --- /dev/null +++ b/src/optitype/cli.py @@ -0,0 +1,428 @@ +""" +Click-based command-line interface for OptiType. 

Usage:
    optitype run -i sample_1.fq -i sample_2.fq --dna -o output/
    optitype check-deps
    optitype init-config
"""

import shutil
import sys
from pathlib import Path

import click

from optitype import __version__
from optitype.io.data import get_data_path, get_reference_fasta


@click.group()
@click.version_option(version=__version__, prog_name="optitype")
def main():
    """OptiType: Precision HLA typing from next-generation sequencing data.

    OptiType is a novel HLA genotyping algorithm based on integer linear
    programming, capable of producing accurate 4-digit HLA genotyping
    predictions from NGS data.

    \b
    External dependencies:
    - RazerS3: For read mapping (conda install -c bioconda razers3)
    - ILP solver: GLPK, CBC, or CPLEX

    \b
    Example usage:
    optitype run -i reads_1.fq -i reads_2.fq --dna -o results/
    optitype run -i sample.bam --rna -o results/
    """
    pass


@main.command()
@click.option(
    "-i", "--input",
    "input_files",
    multiple=True,
    required=True,
    type=click.Path(exists=True),
    help="Input FASTQ or BAM files. One file for single-end, two for paired-end.",
)
@click.option(
    "-r", "--rna",
    "seq_type",
    flag_value="rna",
    help="Input data is RNA sequencing.",
)
@click.option(
    "-d", "--dna",
    "seq_type",
    flag_value="dna",
    default=True,
    help="Input data is DNA sequencing (default).",
)
@click.option(
    "-o", "--outdir",
    required=True,
    type=click.Path(),
    help="Output directory for results.",
)
@click.option(
    "-p", "--prefix",
    default=None,
    help="Output filename prefix. Default: timestamp.",
)
@click.option(
    "-b", "--beta",
    type=float,
    default=0.009,
    help="Homozygosity detection parameter (0.0-0.1). Default: 0.009.",
)
@click.option(
    "-e", "--enumerate",
    "enumerate_count",
    type=int,
    default=1,
    help="Number of solutions to enumerate. Default: 1.",
)
@click.option(
    "--solver",
    type=click.Choice(["glpk", "cbc", "cplex"]),
    default="glpk",
    envvar="OPTITYPE_SOLVER",
    help="ILP solver to use. Default: glpk.",
)
@click.option(
    "--razers3",
    type=click.Path(),
    default=None,
    envvar="OPTITYPE_RAZERS3",
    help="Path to RazerS3 binary. Default: search PATH.",
)
@click.option(
    "--threads",
    type=int,
    default=4,
    help="Number of threads for mapping. Default: 4.",
)
@click.option(
    "--ilp-threads",
    type=int,
    default=1,
    help="Number of threads for ILP solver. Default: 1.",
)
@click.option(
    "--keep-bam/--delete-bam",
    default=False,
    help="Keep or delete intermediate BAM files. Default: delete.",
)
@click.option(
    "-c", "--config",
    type=click.Path(exists=True),
    default=None,
    help="Path to config.ini file for additional settings.",
)
@click.option(
    "-v", "--verbose",
    is_flag=True,
    help="Enable verbose output.",
)
def run(
    input_files,
    seq_type,
    outdir,
    prefix,
    beta,
    enumerate_count,
    solver,
    razers3,
    threads,
    ilp_threads,
    keep_bam,
    config,
    verbose,
):
    """Run HLA typing analysis.

    \b
    Examples:
    # Paired-end DNA analysis
    optitype run -i reads_1.fq -i reads_2.fq --dna -o results/

    # Single-end RNA analysis
    optitype run -i sample.fastq --rna -o results/

    # Re-analyze from BAM file
    optitype run -i mapped.bam --dna -o results/

    # With custom settings
    optitype run -i reads.fq --dna -o results/ --solver cbc --threads 8
    """
    # Deferred import: the pipeline machinery is only needed by this subcommand.
    from optitype.pipeline import PipelineConfig, run_pipeline

    # Validate inputs
    if len(input_files) not in (1, 2):
        raise click.BadParameter(
            "Number of input files must be 1 (single-end) or 2 (paired-end)",
            param_hint="'-i/--input'",
        )

    # Beta range is half-open: 0.0 is allowed, 0.1 is not.
    if not 0.0 <= beta < 0.1:
        raise click.BadParameter(
            "Beta must be between 0.0 and 0.1",
            param_hint="'-b/--beta'",
        )

    if enumerate_count < 1:
        raise click.BadParameter(
            "Enumerate count must be at least 1",
            param_hint="'-e/--enumerate'",
        )

    # Determine razers3 path (only required for FASTQ input, not BAM/SAM)
    # NOTE(review): naive extension check via split('.'); Path(...).suffix
    # would also handle paths whose directories contain dots — confirm and
    # consider switching.
    bam_input = input_files[0].split(".")[-1].lower() in ("sam", "bam")
    if razers3 is None:
        razers3 = shutil.which("razers3") or "razers3"
    if not bam_input and not shutil.which(razers3):
        raise click.ClickException(
            "RazerS3 not found in PATH. Install with: conda install -c bioconda razers3\n"
            "Or specify path with --razers3 option or OPTITYPE_RAZERS3 environment variable."
        )

    # Create config
    pipeline_config = PipelineConfig(
        razers3_path=razers3,
        mapping_threads=threads,
        solver=solver,
        ilp_threads=ilp_threads,
        delete_bam=not keep_bam,
        unpaired_weight=0.0,
        use_discordant=False,
    )

    # Load additional settings from config file if provided
    if config:
        from optitype.pipeline import load_config
        file_config = load_config(config)
        if file_config.unpaired_weight > 0:
            pipeline_config.unpaired_weight = file_config.unpaired_weight
        if file_config.use_discordant:
            pipeline_config.use_discordant = file_config.use_discordant

    # Run the pipeline
    if verbose:
        click.echo(f"OptiType {__version__}")
        click.echo(f"Input files: {list(input_files)}")
        click.echo(f"Sequence type: {seq_type}")
        click.echo(f"Output directory: {outdir}")
        click.echo(f"Solver: {solver}")
        click.echo()

    try:
        result = run_pipeline(
            input_files=list(input_files),
            seq_type=seq_type,
            output_dir=outdir,
            prefix=prefix,
            beta=beta,
            enumerate_count=enumerate_count,
            config=pipeline_config,
            verbose=verbose,
        )

        # Print results summary
        click.echo()
        click.echo("=" * 60)
        click.echo("HLA Typing Results")
        click.echo("=" * 60)

        best = result.result_4digit.iloc[0]
        for locus in ["A", "B", "C"]:
            a1 = best.get(f"{locus}1", "-")
            a2 = best.get(f"{locus}2", "-")
            if a1 == a2:
                click.echo(f" HLA-{locus}: {a1} (homozygous)")
            else:
                click.echo(f" HLA-{locus}: {a1}, {a2}")

        click.echo()
        click.echo(f"Reads: {best['nof_reads']:.0f}")
        click.echo(f"Objective: {best['obj']:.2f}")
        click.echo()
        click.echo(f"Results written to: {result.output_csv}")
        click.echo(f"Coverage plot: {result.output_plot}")

    except Exception as e:
        # NOTE(review): str(e) discards the traceback; `raise ... from e`
        # would preserve the chain for debugging — confirm before changing.
        raise click.ClickException(str(e))


@main.command("check-deps")
def check_deps():
    """Check that all external dependencies are available.

    Verifies that RazerS3 and an ILP solver are installed and accessible.
    """
    click.echo("Checking OptiType dependencies...")
    click.echo()

    all_ok = True

    # Check RazerS3 — a hard requirement (missing => failure exit below).
    razers3 = shutil.which("razers3")
    if razers3:
        click.echo(click.style(" [OK]", fg="green") + f" RazerS3: {razers3}")
    else:
        click.echo(click.style(" [MISSING]", fg="red") + " RazerS3")
        click.echo(" Install with: conda install -c bioconda razers3")
        all_ok = False

    # Check GLPK — solvers are individually optional; at least one of
    # GLPK/CBC must be present (checked at the end).
    glpsol = shutil.which("glpsol")
    if glpsol:
        click.echo(click.style(" [OK]", fg="green") + f" GLPK: {glpsol}")
    else:
        click.echo(click.style(" [MISSING]", fg="yellow") + " GLPK")
        click.echo(" Install with: apt install glpk-utils (or conda install -c conda-forge glpk)")

    # Check CBC
    cbc = shutil.which("cbc")
    if cbc:
        click.echo(click.style(" [OK]", fg="green") + f" CBC: {cbc}")
    else:
        click.echo(click.style(" [MISSING]", fg="yellow") + " CBC")
        click.echo(" Install with: apt install coinor-cbc (or conda install -c conda-forge coincbc)")

    # Check CPLEX (optional)
    cplex = shutil.which("cplex")
    if cplex:
        click.echo(click.style(" [OK]", fg="green") + f" CPLEX: {cplex}")
    else:
        click.echo(click.style(" [N/A]", fg="cyan") + " CPLEX (optional, commercial solver)")

    # Check reference data — also a hard requirement.
    click.echo()
    try:
        data_path = get_data_path()
        click.echo(click.style(" [OK]", fg="green") + f" Reference data: {data_path}")
    except FileNotFoundError:
        click.echo(click.style(" [MISSING]", fg="red") + " Reference data")
        click.echo(" Set OPTITYPE_DATA environment variable to data directory")
        all_ok = False

    click.echo()
    if all_ok and (glpsol or cbc):
        click.echo(click.style("All required dependencies are available!", fg="green"))
    else:
        click.echo(click.style("Some dependencies are missing.", fg="red"))
        if not (glpsol or cbc):
            click.echo("At least one ILP solver (GLPK or CBC) is required.")
        raise SystemExit(1)


@main.command("init-config")
@click.option(
    "-o", "--output",
    type=click.Path(),
    default="config.ini",
    help="Output path for config file. Default: config.ini",
)
@click.option(
    "--force",
    is_flag=True,
    help="Overwrite existing config file.",
)
def init_config(output, force):
    """Generate a default configuration file.

    Creates a config.ini file with default settings that can be customized.
    """
    # Template written verbatim; this is the same INI format consumed by the
    # -c/--config option of `optitype run` (via load_config).
    config_content = """[mapping]

# Path to RazerS3 binary. If not specified, searched in PATH.
# razers3=/path/to/razers3

# Number of threads for read mapping
threads=4

[ilp]

# ILP solver to use: glpk, cbc, or cplex
solver=glpk

# Number of threads for ILP solver
threads=1

[behavior]

# Delete intermediate BAM files after processing
deletebam=true

# Weight for unpaired reads in paired-end mode (0-1)
# 0 means unpaired reads are ignored
# 1 means unpaired reads are weighted equally to paired reads
unpaired_weight=0

# Use discordant read pairs (where ends map to different alleles)
use_discordant=false
"""

    output_path = Path(output)
    if output_path.exists() and not force:
        raise click.ClickException(
            f"Config file already exists: {output}\n"
            "Use --force to overwrite."
        )

    output_path.write_text(config_content)
    click.echo(f"Config file created: {output}")
    click.echo("Edit the file to customize settings.")


@main.command("info")
def info():
    """Show information about the OptiType installation."""
    click.echo(f"OptiType version: {__version__}")
    click.echo()

    # Python info
    click.echo(f"Python: {sys.version}")
    click.echo()

    # Check data path
    try:
        data_path = get_data_path()
        click.echo(f"Data directory: {data_path}")

        # Check for reference files
        for ref_type in ["dna", "rna"]:
            try:
                ref_path = get_reference_fasta(ref_type)
                click.echo(f" {ref_type.upper()} reference: {ref_path}")
            except FileNotFoundError:
                click.echo(f" {ref_type.upper()} reference: NOT FOUND")

        # Check for allele data
        alleles_path = data_path / "alleles"
        if alleles_path.exists():
            parquet_files = list(alleles_path.glob("*.parquet"))
            click.echo(f" Allele data: {len(parquet_files)} parquet files")
        else:
            click.echo(" Allele data: NOT FOUND")
    except FileNotFoundError as e:
        click.echo(f"Data directory: NOT FOUND ({e})")

    click.echo()

    # Package dependencies — plain __import__ works since all names are
    # top-level (non-dotted) module names.
    click.echo("Dependencies:")
    for pkg in ["click", "numpy", "pandas", "pyarrow", "pyomo", "pysam", "matplotlib"]:
        try:
            mod = __import__(pkg)
            version = getattr(mod, "__version__", "unknown")
            click.echo(f" {pkg}: {version}")
        except ImportError:
            click.echo(f" {pkg}: NOT INSTALLED")


if __name__ == "__main__":
    main()
diff --git a/src/optitype/hlatyper.py b/src/optitype/hlatyper.py
new file mode 100644
index 0000000..9e62540
--- /dev/null
+++ b/src/optitype/hlatyper.py
@@ -0,0 +1,538 @@
"""
Core HLA typing logic for OptiType.

This module handles coverage matrix construction, statistical calculations,
allele pruning, and visualization.
+""" + +import logging +import warnings +from collections.abc import Callable + +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from optitype._logging import elapsed + +logger = logging.getLogger(__name__) + + +def feature_order(feature: tuple[str, int]) -> int: + """ + Return sort order for feature tuples. + + Returns: + 0 for ('UTR', 1) + 2 for exon, 1 + 3 for intron, 1 + 4 for exon, 2 + ... + 999 for UTR, 2 + """ + feat_type, feat_number = feature + if feat_type not in ("intron", "exon", "UTR"): + raise ValueError(f"Unknown feature type: {feat_type!r}") + if not isinstance(feat_number, int): + raise TypeError(f"Feature number must be int, got {type(feat_number).__name__}") + if feature == ("UTR", 1): + return 0 + elif feature == ("UTR", 2): + return 999 + else: + return feat_number * 2 + (feat_type == "intron") + + +def get_compact_model( + hit_df: pd.DataFrame, + weak_hit_df: pd.DataFrame | None = None, + weight: float | None = None, +) -> tuple[pd.DataFrame, dict]: + """ + Turn a binary hit matrix into a compact model. + + Removes duplicate rows and creates an occurrence vector with the number + of rows each representative read represents. + + Args: + hit_df: Binary hit matrix DataFrame. + weak_hit_df: Optional DataFrame of "weak" hits (e.g., unpaired reads). + weight: Weight for weak hits (0 < weight <= 1). + + Returns: + Tuple of (unique_matrix, occurrence_dict). 
+ """ + hit_df = hit_df.loc[hit_df.any(axis=1)] # Remove all-zero rows + occurrence = { + r[0]: len(r) for r in hit_df.groupby(hit_df.columns.tolist()).groups.values() + } + + if weak_hit_df is not None: + weak_hit_df = weak_hit_df.loc[weak_hit_df.any(axis=1)] + if not (0 < weight <= 1): + raise ValueError(f"weak hit weight must be in (0, 1], got {weight}") + weak_occ = { + r[0]: len(r) * weight + for r in weak_hit_df.groupby(weak_hit_df.columns.tolist()).groups.values() + } + occurrence.update(weak_occ) + unique_mtx = pd.concat([hit_df.drop_duplicates(), weak_hit_df.drop_duplicates()]) + else: + unique_mtx = hit_df.drop_duplicates() + + return unique_mtx, occurrence + + +def mtx_to_sparse_dict(hit_df: pd.DataFrame) -> dict: + """ + Convert a hit matrix to a sparse dictionary. + + Creates dictionary of (read, allele): 1 tuples for hits. + + Args: + hit_df: Hit matrix DataFrame. + + Returns: + Dictionary mapping (read_id, allele_id) to 1 for each hit. + """ + all_hits = {} + for read_id, alleles in hit_df.iterrows(): + hit_alleles = alleles[alleles != 0].index + for hit_allele in hit_alleles: + all_hits[(read_id, hit_allele)] = 1 + return all_hits + + +def prune_identical_alleles( + binary_mtx: pd.DataFrame, report_groups: bool = False +) -> pd.DataFrame | tuple[pd.DataFrame, dict]: + """ + Remove identical allele columns from the binary matrix. + + Args: + binary_mtx: Binary hit matrix. + report_groups: If True, also return a dict mapping representatives to their groups. + + Returns: + Pruned matrix, or tuple of (pruned_matrix, groups_dict) if report_groups=True. 
+ """ + hash_columns = binary_mtx.transpose().dot(np.random.rand(binary_mtx.shape[0])) + + if report_groups: + grouper = hash_columns.groupby(hash_columns) + groups = {g[1].index[0]: g[1].index.tolist() for g in grouper} + + alleles_to_keep = hash_columns.drop_duplicates().index + + if report_groups: + return binary_mtx[alleles_to_keep], groups + return binary_mtx[alleles_to_keep] + + +def prune_identical_reads(binary_mtx: pd.DataFrame) -> pd.DataFrame: + """ + Remove identical read rows from the binary matrix. + + This should only be used to compactify a matrix before passing it to + the function finding overshadowed alleles. Final compactifying should + be done on the original binary matrix. + + Args: + binary_mtx: Binary hit matrix. + + Returns: + Pruned matrix with unique rows. + """ + reads_to_keep = ( + binary_mtx.dot(np.random.rand(binary_mtx.shape[1])).drop_duplicates().index + ) + return binary_mtx.loc[reads_to_keep] + + +def prune_overshadowed_alleles(binary_mtx: pd.DataFrame) -> pd.Index: + """ + Find and remove alleles that are overshadowed by others. + + An allele is overshadowed if another allele has all of its hits plus more. + + For optimal performance, first prune identical columns and rows: + prune_overshadowed_alleles(prune_identical_alleles(prune_identical_reads(binary_mtx))) + + Args: + binary_mtx: Binary hit matrix. + + Returns: + Index of non-overshadowed alleles. 
+ """ + # Handle potential overflow in dot product + if (binary_mtx.shape[0] < np.iinfo(np.uint16).max) or ( + binary_mtx.sum(axis=0).max() < np.iinfo(np.uint16).max + ): + bb = ( + binary_mtx + if all(binary_mtx.dtypes == np.uint16) + else binary_mtx.astype(np.uint16) + ) + else: + bb = binary_mtx.astype(np.uint32) + + covariance = bb.transpose().dot(bb) + diagonal = pd.Series( + [covariance[ii][ii] for ii in covariance.columns], index=covariance.columns + ) + + new_covariance = covariance[covariance.columns].copy() + for ii in new_covariance.columns: + new_covariance.loc[ii, ii] = 0 + + overshadowed = [] + for ii in new_covariance.columns: + potential_superiors = new_covariance[ii][new_covariance[ii] == diagonal[ii]].index + if any(diagonal[potential_superiors] > diagonal[ii]): + overshadowed.append(ii) + + non_overshadowed = covariance.columns.difference(overshadowed) + return non_overshadowed + + +def create_paired_matrix( + binary_1: pd.DataFrame, + binary_2: pd.DataFrame, + id_cleaning: Callable | None = None, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Create paired, mispaired, and unpaired matrices from two binary hit matrices. + + Args: + binary_1: Binary hit matrix for first read end. + binary_2: Binary hit matrix for second read end. + id_cleaning: Optional function to clean read IDs for matching. + + Returns: + Tuple of (paired, mispaired, unpaired) DataFrames. 
    """
    # NOTE(review): id_cleaning rewrites the indices of the caller's
    # DataFrames in place — confirm no caller reuses the originals afterwards.
    if id_cleaning is not None:
        binary_1.index = list(map(id_cleaning, binary_1.index))
        binary_2.index = list(map(id_cleaning, binary_2.index))

    common_read_ids = binary_1.index.intersection(binary_2.index)
    only_1 = binary_1.index.difference(binary_2.index)
    only_2 = binary_2.index.difference(binary_1.index)

    b_1 = binary_1.loc[common_read_ids]
    b_2 = binary_2.loc[common_read_ids]
    b_12 = b_1 * b_2  # Elementwise AND

    # A pair is "paired" when both ends hit at least one common allele;
    # otherwise the two ends' hits are summed as a "mispaired" row.
    b_ispaired = b_12.any(axis=1)
    b_paired = b_12.loc[b_ispaired]
    b_mispaired = b_1.loc[~b_ispaired] + b_2.loc[~b_ispaired]
    b_unpaired = pd.concat([binary_1.loc[only_1], binary_2.loc[only_2]])

    logger.debug(
        "%s Alignment pairing completed. %d paired, %d unpaired, %d discordant",
        elapsed(), b_paired.shape[0], b_unpaired.shape[0], b_mispaired.shape[0],
    )

    return b_paired, b_mispaired, b_unpaired


def get_features(
    allele_id: str, features: pd.DataFrame, feature_list: list[tuple[str, int]]
) -> pd.DataFrame:
    """
    Get features for an allele from the features DataFrame.

    For reconstructed alleles (partial_complete IDs), features present in the
    partial allele take precedence over the complete allele's features.

    Args:
        allele_id: Allele ID (can be HLA12345 or HLA12345_HLA67890 for reconstructed).
        features: Features DataFrame.
        feature_list: List of (feature_type, feature_number) tuples to include.

    Returns:
        DataFrame of features to include.
    """
    if "_" in allele_id:
        partial_allele, complete_allele = allele_id.split("_")
    else:
        complete_allele = allele_id
        partial_allele = None

    feats_complete = {
        (of["feature"], of["number"]): of
        for _, of in features.loc[features["id"] == complete_allele].iterrows()
    }

    if partial_allele:
        feats_partial = {
            (of["feature"], of["number"]): of
            for _, of in features.loc[features["id"] == partial_allele].iterrows()
        }
    else:
        feats_partial = feats_complete

    feats_to_include = []
    # Iterate in genomic order as defined by feature_order.
    for feat in sorted(feature_list, key=feature_order):
        if feat in feats_partial:
            feats_to_include.append(feats_partial[feat])
        elif feat in feats_complete:
            feats_to_include.append(feats_complete[feat])
        else:
            warnings.warn(f"Feature {feat} not found for allele {allele_id}")

    return pd.DataFrame(feats_to_include)


def calculate_coverage(
    alignment: tuple,
    features: pd.DataFrame,
    alleles_to_plot: pd.Series,
    features_used: list[tuple[str, int]],
) -> list[tuple[str, np.ndarray]]:
    """
    Calculate coverage matrices for alleles.

    Args:
        alignment: Tuple of alignment data (pos, read_details, ...).
            2 elements: single-end; 4: paired without pairing info;
            5: paired with (paired, unpaired, mispaired) binary matrices.
        features: Features DataFrame.
        alleles_to_plot: Series of allele IDs to plot.
        features_used: List of features to use.

    Returns:
        List of (allele_id, coverage_matrix) tuples.
    """
    if len(alignment) not in (2, 4, 5):
        raise ValueError(f"Alignment tuple must have 2, 4 or 5 elements, got {len(alignment)}")
    # NOTE(review): has_pairing_info is computed but never read below —
    # confirm before removing.
    has_pairing_info = len(alignment) == 5

    if len(alignment) == 2:
        # Single-end: every hit is treated as "unpaired" (pairing code 2).
        matrix_pos, read_details = alignment[:2]
        pos = matrix_pos[alleles_to_plot]
        pairing = np.sign(pos) * 2
        hit_counts = np.sign(pos).sum(axis=1)
        max_ambiguity = max(1, hit_counts.max())
        to_process = [(pos, read_details, hit_counts, pairing)]
    elif len(alignment) == 4:
        # Paired ends but no pairing classification available: both ends get
        # pairing code 2 ("unpaired").
        matrix_pos1, read_details1, matrix_pos2, read_details2 = alignment
        pos1 = matrix_pos1[alleles_to_plot]
        pos2 = matrix_pos2[alleles_to_plot]
        pairing1 = np.sign(pos1) * 2
        pairing2 = np.sign(pos2) * 2
        hit_counts1 = np.sign(pos1).sum(axis=1)
        hit_counts2 = np.sign(pos2).sum(axis=1)
        max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max())
        to_process = [
            (pos1, read_details1, hit_counts1, pairing1),
            (pos2, read_details2, hit_counts2, pairing2),
        ]
    else:
        # Full pairing info: codes 1 = paired, 2 = unpaired, 3 = mispaired.
        matrix_pos1, read_details1, matrix_pos2, read_details2, pairing_binaries = alignment
        bin_p, bin_u, bin_m = pairing_binaries
        pos1 = matrix_pos1[alleles_to_plot]
        pos2 = matrix_pos2[alleles_to_plot]
        pairing = pd.concat([
            bin_p[alleles_to_plot],
            bin_u[alleles_to_plot] * 2,
            bin_m[alleles_to_plot] * 3,
        ])
        pairing1 = pairing.loc[pos1.index]
        pairing2 = pairing.loc[pos2.index]
        hit_counts1 = np.sign(pairing1).sum(axis=1)
        hit_counts2 = np.sign(pairing2).sum(axis=1)
        max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max())
        to_process = [
            (pos1, read_details1, hit_counts1, pairing1),
            (pos2, read_details2, hit_counts2, pairing2),
        ]

    coverage_matrices = []

    for allele in alleles_to_plot:
        allele_features = get_features(allele, features, features_used)
        allele_length = allele_features["length"].sum()

        # coverage axes (from the indexing below):
        # [mismatch 0/1][pairing category - 1][ambiguity - 1][position].
        coverage = np.zeros((2, 3, max_ambiguity, allele_length), dtype=int)

        for pos, read_details, hit_counts, pairing_info in to_process:
            reads = pos[pos[allele] != 0].index
            for i_pos, i_read_length, i_mismatches, i_hitcount, i_pairing in zip(
                pos.loc[reads][allele],
                read_details.loc[reads]["read_length"],
                read_details.loc[reads]["mismatches"],
                hit_counts[reads],
                pairing_info.loc[reads][allele],
            ):
                if not i_pairing:
                    continue
                # Increment coverage over the read's span (1-based positions).
                coverage[int(bool(i_mismatches))][i_pairing - 1][i_hitcount - 1][
                    i_pos - 1 : i_pos - 1 + i_read_length
                ] += 1

        coverage_matrices.append((allele, coverage))

    return coverage_matrices


def plot_coverage(
    outfile: str,
    coverage_matrices: list[tuple[str, np.ndarray]],
    allele_data: pd.DataFrame,
    features: pd.DataFrame,
    features_used: list[tuple[str, int]],
    columns: int = 2,
) -> None:
    """
    Generate coverage plot PDF.

    Args:
        outfile: Output PDF path.
        coverage_matrices: List of (allele_id, coverage_matrix) tuples.
        allele_data: Allele table DataFrame.
        features: Features DataFrame.
        features_used: List of features used.
        columns: Number of columns in the plot.
    """
    def start_end_zeros(cov_array):
        # Pad both ends with zeros so stackplot areas close at the x-limits.
        return np.append(np.append([0], cov_array), [0])

    def allele_sorter(allele_cov_mtx_tuple):
        allele, _ = allele_cov_mtx_tuple
        return allele_data.loc[allele.split("_")[0]]["type"]

    def get_allele_locus(allele):
        return allele_data.loc[allele.split("_")[0]]["locus"]

    number_of_loci = len(set(get_allele_locus(allele) for allele, _ in coverage_matrices))

    dpi = 50
    box_size = (7, 1)
    # Three grid rows per locus plus one row for the legend.
    subplot_rows = 3 * number_of_loci + 1

    # Order must match the stackplot argument order below.
    area_colors = [
        (0.26, 0.76, 0.26),  # perfect, paired, unique
        (0.40, 0.84, 0.40),  # perfect, paired, shared
        (0.99, 0.75, 0.20),  # perfect, unpaired, unique
        (0.99, 0.75, 0.20),  # perfect, mispaired, unique
        (0.99, 0.85, 0.35),  # perfect, unpaired, shared
        (0.99, 0.85, 0.35),  # perfect, mispaired, shared
        (0.99, 0.23, 0.23),  # mismatch, paired, unique
        (0.99, 0.49, 0.49),  # mismatch, paired, shared
        (0.14, 0.55, 0.72),  # mismatch, unpaired, unique
        (0.14, 0.55, 0.72),  # mismatch, mispaired, unique
        (0.33, 0.70, 0.88),  # mismatch, unpaired, shared
        (0.33, 0.70, 0.88),  # mismatch, mispaired, shared
    ]

    figure = plt.figure(figsize=(box_size[0] * columns, box_size[1] * subplot_rows), dpi=dpi)

    coverage_matrices = sorted(coverage_matrices, key=allele_sorter)
    prev_locus = ""
    i_locus = -1

    for allele, coverage in coverage_matrices:
        if "_" in allele:
            partial, complete = allele.split("_")
            plot_title = f"{allele_data.loc[partial]['type']} (introns from {allele_data.loc[complete]['type']})"
        else:
            plot_title = allele_data.loc[allele]["type"]

        # Two alleles per locus: first goes in column 0, second in column 1.
        if prev_locus != get_allele_locus(allele):
            i_locus += 1
            i_allele_in_locus = 0
        else:
            i_allele_in_locus = 1

        prev_locus = get_allele_locus(allele)

        plot = plt.subplot2grid(
            (subplot_rows, columns),
            (3 * i_locus, i_allele_in_locus),
            rowspan=3,
            adjustable="box",
        )

        _, _, max_ambig, seq_length = coverage.shape
        # Weight ambiguous hits by 1/multiplicity; index 0 (unique hits) is
        # zeroed because unique coverage is drawn separately.
        shared_weighting = np.reciprocal(np.arange(max_ambig) + 1.0)
        shared_weighting[0] = 0

        perfect_paired_unique = start_end_zeros(coverage[0][0][0])
        mismatch_paired_unique = start_end_zeros(coverage[1][0][0])
        perfect_unpaired_unique = start_end_zeros(coverage[0][1][0])
        mismatch_unpaired_unique = start_end_zeros(coverage[1][1][0])
        perfect_mispaired_unique = start_end_zeros(coverage[0][2][0])
        mismatch_mispaired_unique = start_end_zeros(coverage[1][2][0])

        perfect_paired_shared = start_end_zeros(shared_weighting.dot(coverage[0][0]))
        mismatch_paired_shared = start_end_zeros(shared_weighting.dot(coverage[1][0]))
        perfect_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][1]))
        mismatch_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][1]))
        perfect_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][2]))
        mismatch_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][2]))

        # Exon annotation
        i_start = 1
        for _, ft in get_features(allele, features, features_used).iterrows():
            if ft["feature"] == "exon":
                plot.axvspan(i_start, i_start + ft["length"], facecolor="black", alpha=0.1, linewidth=0, zorder=1)
            i_start += ft["length"]

        # Argument order corresponds 1:1 to area_colors above.
        areas = plot.stackplot(
            np.arange(seq_length + 2),
            perfect_paired_unique + 0.001,
            perfect_paired_shared,
            perfect_unpaired_unique,
            perfect_mispaired_unique,
            perfect_unpaired_shared,
            perfect_mispaired_shared,
            mismatch_paired_unique,
            mismatch_paired_shared,
            mismatch_unpaired_unique,
            mismatch_mispaired_unique,
            mismatch_unpaired_shared,
            mismatch_mispaired_shared,
            linewidth=0,
            colors=area_colors,
            zorder=5,
        )

        for aa in areas:
            aa.set_edgecolor(aa.get_facecolor())

        plot.tick_params(axis="both", labelsize=10, direction="out", which="both", top=False)
        plot.text(
            0.015, 0.97, plot_title,
            horizontalalignment="left",
            verticalalignment="top",
            transform=plot.transAxes,
            fontsize=10,
            zorder=6,
        )
        _, _, _, y2 = plot.axis()
        plot.axis((0, seq_length, 1, y2))
        plot.set_yscale("log")
        plot.set_ylim(bottom=0.5)

    # Legend
    legend = plt.subplot2grid((subplot_rows, columns), (subplot_rows - 1, 0), colspan=2, adjustable="box")
    legend.add_patch(mpatches.Rectangle((0, 2), 2, 2, color=area_colors[0]))
    legend.add_patch(mpatches.Rectangle((0, 0), 2, 2, color=area_colors[1]))
    legend.add_patch(mpatches.Rectangle((25, 2), 2, 2, color=area_colors[2]))
    legend.add_patch(mpatches.Rectangle((25, 0), 2, 2, color=area_colors[4]))
    legend.add_patch(mpatches.Rectangle((50, 2), 2, 2, color=area_colors[6]))
    legend.add_patch(mpatches.Rectangle((50, 0), 2, 2, color=area_colors[7]))
    legend.add_patch(mpatches.Rectangle((75, 2), 2, 2, color=area_colors[8]))
    legend.add_patch(mpatches.Rectangle((75, 0), 2, 2, color=area_colors[10]))
    legend.text(2.5, 3, "paired, no mismatches, unique", va="center", size="smaller")
    legend.text(2.5, 1, "paired, no mismatches, ambiguous", va="center", size="smaller")
    legend.text(27.5, 3, "unpaired, no mismatches, unique", va="center", size="smaller")
    legend.text(27.5, 1, "unpaired, no mismatches, ambiguous", va="center", size="smaller")
    legend.text(52.5, 3, "paired, mismatched, unique", va="center", size="smaller")
    legend.text(52.5, 1, "paired, mismatched, ambiguous", va="center", size="smaller")
    legend.text(77.5, 3, "unpaired, mismatched, unique", va="center", size="smaller")
    legend.text(77.5, 1, "unpaired, mismatched, ambiguous", va="center", size="smaller")
    legend.set_xlim(0, 100)
    legend.set_ylim(0, 4)
    legend.axison = False

    figure.tight_layout()
    figure.savefig(outfile)
    plt.close(figure)
diff --git a/src/optitype/io/__init__.py b/src/optitype/io/__init__.py
new file mode 100644
index 0000000..1d63698
--- /dev/null
+++ b/src/optitype/io/__init__.py
@@ -0,0 +1,12 @@
"""I/O utilities for OptiType."""

from optitype.io.data import get_data_path, load_reference_data
from optitype.io.readers import PYSAM_AVAILABLE, pysam_to_dataframe, sam_to_dataframe

__all__ = [
    "PYSAM_AVAILABLE",
    "get_data_path",
    "load_reference_data",
    "pysam_to_dataframe",
    "sam_to_dataframe",
]
diff --git a/src/optitype/io/data.py b/src/optitype/io/data.py
new file mode 100644
index 0000000..9456e49
--- /dev/null
+++ b/src/optitype/io/data.py
@@ -0,0 +1,114 @@
"""Reference data loading utilities."""

import os
import sys
from pathlib import Path

import pandas as pd


def get_data_path() -> Path:
    """
    Get the path to OptiType reference data.

    The data can be located in (checked in this order):
    1. Custom path via OPTITYPE_DATA environment variable
    2. Package installation directory (share/optitype/data)
    3. Development directory (data/)

    Returns:
        Path to the data directory.

    Raises:
        FileNotFoundError: If no valid data directory is found.
    """
    # Check environment variable first
    # NOTE(review): an OPTITYPE_DATA pointing at a nonexistent directory is
    # silently ignored and the search falls through — confirm this is wanted.
    if env_path := os.environ.get("OPTITYPE_DATA"):
        data_path = Path(env_path)
        if data_path.exists():
            return data_path

    # Check for installed package data
    # When installed via pip, data goes to share/optitype/data
    if sys.prefix != sys.base_prefix:  # In a virtual environment
        share_path = Path(sys.prefix) / "share" / "optitype" / "data"
        if share_path.exists():
            return share_path

    # Also check site-packages share location
    # (assumes the conventional <prefix>/lib/.../site-packages layout, so the
    # parent of each sys.path entry may contain a share/ directory)
    for site_packages in sys.path:
        share_path = Path(site_packages).parent / "share" / "optitype" / "data"
        if share_path.exists():
            return share_path

    # Development mode: check relative to package
    # (src/optitype/io/data.py -> repo root -> data/)
    dev_data_path = Path(__file__).parent.parent.parent.parent / "data"
    if dev_data_path.exists():
        return dev_data_path

    raise FileNotFoundError(
        "Could not find OptiType reference data. "
        "Set OPTITYPE_DATA environment variable to the data directory path, "
        "or ensure the package is properly installed."
    )


def load_reference_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load the allele reference data from Parquet files.

    Returns:
        Tuple of (table, features) DataFrames containing allele information.

    Raises:
        FileNotFoundError: If the data files are not found.
    """
    data_path = get_data_path()
    alleles_path = data_path / "alleles"

    if not alleles_path.exists():
        raise FileNotFoundError(
            f"Allele data directory not found at {alleles_path}. "
            "Ensure the data has been converted to Parquet format."
        )

    table_path = alleles_path / "table.parquet"
    features_path = alleles_path / "features.parquet"

    if not table_path.exists() or not features_path.exists():
        raise FileNotFoundError(
            f"Parquet files not found in {alleles_path}. "
            "Expected table.parquet and features.parquet."
        )

    table = pd.read_parquet(table_path)
    features = pd.read_parquet(features_path)

    return table, features


def get_reference_fasta(seq_type: str) -> Path:
    """
    Get the path to the HLA reference FASTA file.

    Args:
        seq_type: Either 'dna' or 'rna'.

    Returns:
        Path to the reference FASTA file.

    Raises:
        ValueError: If seq_type is not 'dna' or 'rna'.
        FileNotFoundError: If the reference file is not found.
    """
    if seq_type not in ("dna", "rna"):
        raise ValueError(f"seq_type must be 'dna' or 'rna', got {seq_type!r}")

    data_path = get_data_path()
    fasta_name = f"hla_reference_{seq_type}.fasta"
    fasta_path = data_path / fasta_name

    if not fasta_path.exists():
        raise FileNotFoundError(f"Reference FASTA not found: {fasta_path}")

    return fasta_path
diff --git a/src/optitype/io/readers.py b/src/optitype/io/readers.py
new file mode 100644
index 0000000..0a98e03
--- /dev/null
+++ b/src/optitype/io/readers.py
@@ -0,0 +1,198 @@
"""SAM/BAM file parsing utilities."""

import logging
import re
import sys
from collections import OrderedDict
from functools import lru_cache

import numpy as np
import pandas as pd

from optitype._logging import elapsed

# pysam is optional; sam_to_dataframe below is the pure-Python fallback.
try:
    import pysam
    PYSAM_AVAILABLE = True
except ImportError:
    PYSAM_AVAILABLE = False

logger = logging.getLogger(__name__)

# CIGAR pattern for calculating read span on reference
# (only M and D operations consume reference bases here)
CIGAR_SLICER = re.compile(r"[0-9]+[MD]")


@lru_cache(maxsize=None)
def _length_on_reference(cigar_string: str) -> int:
    """Calculate the length a read spans on the reference from its CIGAR string."""
    # Cached: identical CIGAR strings recur for reads of the same length.
    return sum(int(p[:-1]) for p in CIGAR_SLICER.findall(cigar_string))


def sam_to_dataframe(samfile: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Parse a SAM file into position and read detail DataFrames.

    This is a fallback parser when pysam is not available.

    Args:
        samfile: Path to the SAM file.

    Returns:
        Tuple of (matrix_pos, read_details) DataFrames.
+ """ + logger.debug("%s Loading alleles and read IDs from %s...", elapsed(), samfile) + + read_ids, allele_ids = [], [] + first_hit_row = True + total_hits = 0 + nm_index = None + + with open(samfile, "rb") as f: + last_read_id = None + for line in f: + line = line.decode("utf-8") + if line.startswith("@"): + if line.startswith("@SQ"): + allele_ids.append(line.split("\t")[1][3:]) # SN:HLA:HLA00001 + continue + + total_hits += 1 + read_id = line.split("\t")[0] + if last_read_id != read_id: + read_ids.append(read_id) + last_read_id = read_id + + if first_hit_row: + first_hit_row = False + columns = line.split() + try: + nm_index = [x.startswith("NM:") for x in columns].index(True) + except ValueError: + logger.warning("No NM-tag found in SAM file!") + nm_index = None + + logger.debug("%s %d alleles and %d reads found.", elapsed(), len(allele_ids), len(read_ids)) + logger.debug("%s Initializing mapping matrix...", elapsed()) + + matrix_pos = pd.DataFrame( + np.zeros((len(read_ids), len(allele_ids)), dtype=np.uint16), + columns=allele_ids, + index=read_ids, + ) + + read_details = OrderedDict() + + logger.debug( + "%s %dx%d mapping matrix initialized. Populating %d hits from SAM file...", + elapsed(), len(read_ids), len(allele_ids), total_hits, + ) + + milestones = [x * total_hits // 10 for x in range(1, 11)] + + with open(samfile, "rb") as f: + counter = 0 + percent = 0 + for line in f: + line = line.decode("utf-8") + if line.startswith("@"): + continue + + fields = line.strip().split("\t") + read_id, allele_id, position, cigar = fields[0], fields[2], fields[3], fields[5] + nm = fields[nm_index] if nm_index is not None else "NM:i:0" + + if read_id not in read_details: + read_details[read_id] = (int(nm[5:]), _length_on_reference(cigar)) + + matrix_pos.at[read_id, allele_id] = int(position) + + counter += 1 + if counter in milestones: + percent += 10 + logger.debug("\t%d%% completed", percent) + + logger.debug( + "%s %d elements filled. 
Matrix sparsity: 1 in %.2f", + elapsed(), counter, matrix_pos.shape[0] * matrix_pos.shape[1] / float(counter), + ) + + matrix_pos.rename(columns=lambda x: x.replace("HLA:", ""), inplace=True) + + details_df = pd.DataFrame.from_dict(read_details, orient="index") + details_df.columns = ["mismatches", "read_length"] + + return matrix_pos, details_df + + +def pysam_to_dataframe(samfile: str) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Parse a SAM/BAM file into position and read detail DataFrames using pysam. + + Args: + samfile: Path to the SAM or BAM file. + + Returns: + Tuple of (matrix_pos, read_details) DataFrames. + """ + if not PYSAM_AVAILABLE: + logger.warning("PySam not available. Falling back to primitive SAM parsing.") + return sam_to_dataframe(samfile) + + sam_or_bam = "rb" if samfile.endswith(".bam") else "r" + sam = pysam.AlignmentFile(samfile, sam_or_bam) + + # Check if yara produced the file (for XA tag handling) + is_yara = sam.header.get("PG", [{}])[0].get("ID", "").lower() == "yara" + xa_tag = is_yara and (" -os " not in sam.header.get("PG", [{}])[0].get("CL", "")) + + nref = sam.nreferences + hits = OrderedDict() + allele_id_to_index = {aa: ii for ii, aa in enumerate(sam.references)} + read_details = OrderedDict() + + logger.debug( + "%s Loading %s started. 
Number of HLA reads loaded (updated every thousand):", + elapsed(), samfile, + ) + + read_counter = 0 + hit_counter = 0 + + for aln in sam: + if aln.qname not in hits: + hits[aln.qname] = np.zeros(nref, dtype=np.uint16) + read_details[aln.qname] = (aln.get_tag("NM"), aln.query_length) + read_counter += 1 + + if logger.isEnabledFor(logging.DEBUG) and not (read_counter % 1000): + sys.stdout.write(f"{len(hits) // 1000}K...") + sys.stdout.flush() + + if xa_tag and aln.has_tag("XA"): + current_row = hits[aln.qname] + subtags = aln.get_tag("XA").split(";")[:-1] + hit_counter += len(subtags) + for subtag in subtags: + allele, pos = subtag.split(",")[:2] + current_row[allele_id_to_index[allele]] = int(pos) + + hits[aln.qname][aln.reference_id] = aln.reference_start + 1 + hit_counter += 1 + + logger.debug("\n %s %d reads loaded. Creating dataframe...", elapsed(), len(hits)) + + pos_df = pd.DataFrame.from_dict(hits, orient="index") + pos_df.columns = sam.references[:] + + details_df = pd.DataFrame.from_dict(read_details, orient="index") + details_df.columns = ["mismatches", "read_length"] + + logger.debug( + "%s Dataframes created. Shape: %d x %d, hits: %s (%d), sparsity: 1 in %.2f", + elapsed(), pos_df.shape[0], pos_df.shape[1], + np.sign(pos_df).sum().sum(), hit_counter, + pos_df.shape[0] * pos_df.shape[1] / float(hit_counter), + ) + + return pos_df, details_df diff --git a/src/optitype/model.py b/src/optitype/model.py new file mode 100644 index 0000000..4c52c21 --- /dev/null +++ b/src/optitype/model.py @@ -0,0 +1,387 @@ +""" +OptiType ILP model for HLA typing. + +This module implements the integer linear programming model for optimal +HLA allele selection using Pyomo. 
+""" + +import itertools +import logging +from collections import defaultdict + +import pandas as pd +from pyomo.environ import ( + Binary, + ConcreteModel, + Constraint, + ConstraintList, + Objective, + Param, + Set, + Var, + maximize, +) +from pyomo.common.errors import ApplicationError +from pyomo.opt import SolverFactory, TerminationCondition + +logger = logging.getLogger(__name__) + + +class OptiType: + """ + OptiType ILP model for HLA genotyping. + + Uses integer linear programming to select the optimal combination + of HLA alleles that best explain the observed read mappings. + """ + + def __init__( + self, + cov: dict, + occ: dict, + groups_4digit: dict, + allele_table: pd.DataFrame, + beta: float, + t_max_allele: int = 2, + solver: str = "glpk", + threads: int = 1, + verbosity: bool = False, + ): + """ + Initialize the OptiType model. + + Args: + cov: Coverage dictionary mapping (read, allele) to 1 for hits. + occ: Occurrence dictionary mapping read patterns to counts. + groups_4digit: Dictionary mapping 4-digit types to allele lists. + allele_table: DataFrame with allele information. + beta: Homozygosity detection parameter (0.0 to 0.1). + t_max_allele: Maximum alleles per locus (default 2). + solver: ILP solver to use ('glpk', 'cplex', 'cbc'). + threads: Number of solver threads. + verbosity: Whether to print verbose output. 
+ """ + self._allele_table = allele_table + self._beta = float(beta) + self._t_max_allele = t_max_allele + self._solver = SolverFactory(solver) + self._threads = threads + self._opts = {"threads": threads} if threads > 1 else {} + self._verbosity = verbosity + self._changed = True + self._ks = 1 + self._groups_4digit = groups_4digit + + # Group alleles by locus + loci_alleles = defaultdict(list) + for type_4digit, group_alleles in groups_4digit.items(): + loci_alleles[type_4digit.split("*")[0]].extend(group_alleles) + + self._allele_to_4digit = { + allele: type_4digit + for type_4digit, group in groups_4digit.items() + for allele in group + } + + # Build the ILP model + model = ConcreteModel() + + # Initialize Sets + model.LociNames = Set(initialize=list(loci_alleles.keys())) + model.Loci = Set(model.LociNames, initialize=lambda m, l: loci_alleles[l]) + + L = list(itertools.chain(*list(loci_alleles.values()))) + reconst = {allele_id: 0.01 for allele_id in L if "_" in allele_id} + R = set(r for (r, _) in cov.keys()) + model.L = Set(initialize=L) + model.R = Set(initialize=R) + + # Initialize Parameters + model.cov = Param(model.R, model.L, initialize=lambda model, r, a: cov.get((r, a), 0)) + model.reconst = Param(model.L, initialize=lambda model, a: reconst.get(a, 0)) + model.occ = Param(model.R, initialize=occ) + model.t_allele = Param(initialize=self._t_max_allele, mutable=True) + model.beta = Param( + initialize=self._beta, + validate=lambda val, model: 0.0 <= float(self._beta) <= 0.999, + mutable=True, + ) + model.nof_loci = Param(initialize=len(loci_alleles)) + + # Initialize Variables + model.x = Var(model.L, domain=Binary) + model.y = Var(model.R, domain=Binary) + model.re = Var(model.R, bounds=(0.0, None)) + model.hetero = Var(bounds=(0.0, model.nof_loci)) + + # Objective function + model.read_cov = Objective( + rule=lambda model: sum( + model.occ[r] * (model.y[r] - model.beta * model.re[r]) for r in model.R + ) + - sum(model.reconst[a] * model.x[a] for a in 
model.L), + sense=maximize, + ) + + # Constraints + model.max_allel_selection = Constraint( + model.LociNames, + rule=lambda model, l: sum(model.x[a] for a in model.Loci[l]) <= model.t_allele, + ) + model.min_allel_selection = Constraint( + model.LociNames, + rule=lambda model, l: sum(model.x[a] for a in model.Loci[l]) >= 1, + ) + model.is_read_cov = Constraint( + model.R, + rule=lambda model, r: sum(model.cov[r, a] * model.x[a] for a in model.L) + >= model.y[r], + ) + model.heterozygot_count = Constraint( + rule=lambda model: model.hetero >= sum(model.x[a] for a in model.L) - model.nof_loci + ) + + # Regularization constraints + model.reg1 = Constraint( + model.R, rule=lambda model, r: model.re[r] <= model.nof_loci * model.y[r] + ) + model.reg2 = Constraint(model.R, rule=lambda model, r: model.re[r] <= model.hetero) + model.reg3 = Constraint( + model.R, + rule=lambda model, r: model.re[r] + >= model.hetero - model.nof_loci * (1 - model.y[r]), + ) + + # Constraint list for solution enumeration + model.c = ConstraintList() + + self._instance = model + + def _append_result(self, selected: list, result_dict: dict, instance) -> None: + """Append a solution's allele assignments and scores to the result dict.""" + loci = [self._allele_to_4digit[x].split("*")[0] for x in selected] + locus_count = dict.fromkeys(loci, 1) + + for i, locus in enumerate(loci): + if loci.count(locus) < 2: + result_dict[locus + "1"].append(selected[i]) + result_dict[locus + "2"].append(selected[i]) + else: + result_dict[locus + str(locus_count[locus])].append(selected[i]) + locus_count[locus] += 1 + + nof_reads = sum(instance.occ[j] * instance.y[j].value for j in instance.y) + result_dict["obj"].append(instance.read_cov()) + result_dict["nof_reads"].append(nof_reads) + + def set_beta(self, beta: float) -> None: + """Set the beta parameter for homozygosity detection.""" + self._changed = True + self._instance.beta.set_value(float(beta)) + + def set_t_max_allele(self, t_max_allele: int) -> None: + 
"""Set the upper bound of alleles selected per locus.""" + self._changed = True + self._instance.t_allele.set_value(t_max_allele) + + def solve(self, ks: int) -> pd.DataFrame: + """ + Solve the ILP model and enumerate top k solutions. + + Args: + ks: Number of solutions to enumerate. + + Returns: + DataFrame with typing results. + """ + d = defaultdict(list) + + if self._changed or self._ks != ks: + self._ks = ks + + for k in range(ks): + expr = 0 + + try: + res = self._solver.solve( + self._instance, options=self._opts, tee=self._verbosity + ) + except (RuntimeError, ApplicationError): + logger.warning( + "Solver does not support multi-threading. " + "Falling back to single-threading." + ) + res = self._solver.solve(self._instance, options={}, tee=self._verbosity) + + self._instance.solutions.load_from(res) + + if res.solver.termination_condition != TerminationCondition.optimal: + logger.error("Optimal solution hasn't been obtained. This is a terminal problem.") + break + + selected = [] + indices = [] + encountered_4digit = [] + + for j in self._instance.x: + if self._allele_to_4digit[j][0] in "HJG": + if 0.99 <= self._instance.x[j].value <= 1.01: + selected.append(j) + indices.append(j) + continue + + if 0.99 <= self._instance.x[j].value <= 1.01: + selected.append(j) + exp_i = 0 + exp_i += self._instance.x[j] + + if self._allele_to_4digit[j] in encountered_4digit: + continue + + encountered_4digit.append(self._allele_to_4digit[j]) + for i_allele in self._groups_4digit[self._allele_to_4digit[j]]: + if self._instance.x[i_allele].value <= 0: + exp_i += self._instance.x[i_allele] + indices.append(i_allele) + expr += 1.0 - exp_i + + zero_indices = set(j for j in self._instance.x).difference(set(indices)) + for j in zero_indices: + expr += self._instance.x[j] + + self._instance.c.add(expr >= 1) + + self._append_result(selected, d, self._instance) + + self._instance.c.clear() + self._changed = False + self._enumeration = pd.DataFrame(d) + + return self._enumeration + 
else: + return self._enumeration + + def solve_for_k_alleles(self, k: int, ks: int = 1) -> pd.DataFrame: + """ + Solve with exactly k alleles (experimental). + + Args: + k: Exact number of alleles to select. + ks: Number of solutions to enumerate. + + Returns: + DataFrame with typing results. + """ + if k < int(self._instance.nof_loci.value) or k > int( + self._instance.nof_loci.value * self._t_max_allele + ): + raise ValueError( + f"k {k} is out of range [{self._instance.nof_loci.value}, " + f"{self._instance.nof_loci.value * self._t_max_allele}]" + ) + + inst = self._instance.clone() + inst.beta.set_value(0.0) + + inst.del_component("heterozygot_count") + inst.del_component("reg1") + inst.del_component("reg2") + inst.del_component("reg3") + + expr1 = sum(inst.x[j] for j in inst.x) + inst.c.add(expr1 == k) + + d = defaultdict(list) + + for _ in range(ks): + try: + res = self._solver.solve(inst, options=self._opts, tee=self._verbosity) + except (RuntimeError, ApplicationError): + logger.warning( + "Solver does not support multi-threading. " + "Falling back to single-threading." 
+ ) + res = self._solver.solve(inst, options={}, tee=self._verbosity) + + inst.solutions.load_from(res) + + if self._verbosity: + res.write(num=1) + + if res.solver.termination_condition != TerminationCondition.optimal: + logger.error("Optimal solution hasn't been obtained.") + break + + selected = [] + expr = 0 + indices = [] + encountered_4digit = [] + + for j in inst.x: + if 0.99 <= inst.x[j].value <= 1.01: + exp_i = 0 + selected.append(j) + exp_i += inst.x[j] + + if self._allele_to_4digit[j] in encountered_4digit: + continue + + encountered_4digit.append(self._allele_to_4digit[j]) + for i_allele in self._groups_4digit[self._allele_to_4digit[j]]: + if inst.x[i_allele].value <= 0: + exp_i += inst.x[i_allele] + indices.append(i_allele) + expr += 1 - exp_i + + zero_indices = set(j for j in inst.x).difference(set(indices)) + for j in zero_indices: + expr += inst.x[j] + + inst.c.add(expr >= 1) + + logger.debug("Selected alleles: %s", selected) + + self._append_result(selected, d, inst) + + return pd.DataFrame(d) + + def solve_enforced_zygosity(self, gosity_dict: dict) -> pd.DataFrame: + """ + Solve with enforced homo/heterozygosity per locus (experimental). + + Args: + gosity_dict: Dictionary mapping locus to number of alleles. + + Returns: + DataFrame with typing results. + """ + inst = self._instance.clone() + inst.beta.set_value(0.0) + + inst.del_component("heterozygot_count") + inst.del_component("reg1") + inst.del_component("reg2") + inst.del_component("reg3") + inst.del_component("max_allel_selection") + + for locus in inst.LociNames: + cons = sum(inst.x[a] for a in inst.Loci[locus]) + inst.c.add(cons <= gosity_dict.get(locus, 2)) + + d = defaultdict(list) + + try: + res = self._solver.solve(inst, options=self._opts, tee=self._verbosity) + except (RuntimeError, ApplicationError): + logger.warning( + "Solver does not support multi-threading. " + "Falling back to single-threading." 
+ ) + res = self._solver.solve(inst, options={}, tee=self._verbosity) + + inst.solutions.load_from(res) + + selected = [al for al in inst.x if 0.99 <= inst.x[al].value <= 1.01] + self._append_result(selected, d, inst) + + return pd.DataFrame(d) diff --git a/src/optitype/pipeline.py b/src/optitype/pipeline.py new file mode 100644 index 0000000..ad6d418 --- /dev/null +++ b/src/optitype/pipeline.py @@ -0,0 +1,381 @@ +""" +Core pipeline orchestration for OptiType. + +This module contains the main HLA typing pipeline logic, coordinating +read mapping, matrix construction, and ILP solving. +""" + +import datetime +import logging +import multiprocessing +import os +import shutil +import subprocess +import time +from collections import defaultdict +from configparser import ConfigParser +from dataclasses import dataclass +from pathlib import Path + +import numpy as np +import pandas as pd + +from optitype import hlatyper as ht +from optitype._logging import configure_logging, elapsed +from optitype.io.data import get_reference_fasta, load_reference_data +from optitype.io.readers import PYSAM_AVAILABLE, pysam_to_dataframe +from optitype.model import OptiType + +logger = logging.getLogger(__name__) + + +# Frequent alleles list for filtering +FREQ_ALLELES = """A*01:01 A*01:02 A*01:03 A*01:06 A*01:09 A*01:23 A*01:38 A*01:44 A*02:01 A*02:02 A*02:03 A*02:04 A*02:05 A*02:06 A*02:07 A*02:08 A*02:09 A*02:10 A*02:11 A*02:12 A*02:13 A*02:133 A*02:14 A*02:141 A*02:146 A*02:15N A*02:16 A*02:17 A*02:18 A*02:19 A*02:20 A*02:21 A*02:22 A*02:226N A*02:24 A*02:25 A*02:26 A*02:27 A*02:28 A*02:29 A*02:30 A*02:33 A*02:34 A*02:35 A*02:36 A*02:37 A*02:38 A*02:40 A*02:42 A*02:44 A*02:45 A*02:46 A*02:48 A*02:49 A*02:51 A*02:53N A*02:54 A*02:55 A*02:57 A*02:58 A*02:60 A*02:64 A*02:67 A*02:74 A*02:85 A*02:90 A*02:93 A*03:01 A*03:02 A*03:05 A*03:07 A*03:08 A*03:10 A*03:12 A*03:22 A*03:25 A*03:65 A*03:69N A*03:97 A*11:01 A*11:02 A*11:03 A*11:04 A*11:05 A*11:06 A*11:08 A*11:10 A*11:12 A*11:13 A*11:18 
A*11:19 A*11:20 A*11:29 A*11:40 A*23:01 A*23:02 A*23:03 A*23:04 A*23:05 A*23:09 A*24:02 A*24:03 A*24:04 A*24:05 A*24:06 A*24:07 A*24:08 A*24:09N A*24:10 A*24:13 A*24:14 A*24:15 A*24:17 A*24:18 A*24:20 A*24:21 A*24:22 A*24:23 A*24:24 A*24:25 A*24:26 A*24:27 A*24:28 A*24:29 A*24:31 A*24:35 A*24:46 A*24:51 A*24:56 A*24:63 A*24:93 A*25:01 A*25:02 A*25:04 A*26:01 A*26:02 A*26:03 A*26:04 A*26:05 A*26:06 A*26:07 A*26:08 A*26:09 A*26:10 A*26:11N A*26:12 A*26:14 A*26:15 A*26:16 A*26:17 A*26:18 A*26:20 A*26:49 A*29:01 A*29:02 A*29:03 A*29:04 A*29:10 A*29:12 A*30:01 A*30:02 A*30:03 A*30:04 A*30:06 A*30:08 A*30:09 A*30:10 A*30:11 A*30:12 A*30:16 A*31:01 A*31:02 A*31:03 A*31:04 A*31:05 A*31:06 A*31:08 A*31:09 A*31:12 A*32:01 A*32:02 A*32:03 A*32:04 A*32:05 A*32:06 A*32:08 A*32:13 A*32:20 A*32:22 A*33:01 A*33:03 A*33:04 A*33:05 A*33:10 A*33:26 A*34:01 A*34:02 A*34:03 A*34:05 A*36:01 A*36:03 A*43:01 A*66:01 A*66:02 A*66:03 A*68:01 A*68:02 A*68:03 A*68:04 A*68:05 A*68:06 A*68:07 A*68:08 A*68:12 A*68:13 A*68:15 A*68:16 A*68:17 A*68:18N A*68:23 A*68:24 A*68:38 A*69:01 A*74:01 A*74:02 A*74:03 A*74:04 A*74:06 A*74:09 A*74:11 A*80:01 A*80:02 B*07:02 B*07:03 B*07:04 B*07:05 B*07:06 B*07:07 B*07:08 B*07:09 B*07:10 B*07:12 B*07:13 B*07:14 B*07:15 B*07:17 B*07:20 B*07:22 B*07:26 B*07:33 B*07:36 B*07:47 B*07:53 B*07:85 B*08:01 B*08:02 B*08:03 B*08:04 B*08:05 B*08:09 B*08:12 B*08:18 B*08:23 B*13:01 B*13:02 B*13:03 B*13:04 B*13:07N B*13:09 B*13:11 B*13:13 B*14:01 B*14:02 B*14:03 B*14:04 B*14:05 B*14:06 B*15:01 B*15:02 B*15:03 B*15:04 B*15:05 B*15:06 B*15:07 B*15:08 B*15:09 B*15:10 B*15:108 B*15:11 B*15:12 B*15:123 B*15:125 B*15:13 B*15:135 B*15:15 B*15:153 B*15:16 B*15:17 B*15:18 B*15:20 B*15:21 B*15:23 B*15:24 B*15:25 B*15:27 B*15:28 B*15:29 B*15:30 B*15:31 B*15:32 B*15:33 B*15:34 B*15:35 B*15:36 B*15:37 B*15:38 B*15:39 B*15:40 B*15:42 B*15:45 B*15:46 B*15:47 B*15:48 B*15:50 B*15:52 B*15:53 B*15:54 B*15:55 B*15:56 B*15:58 B*15:61 B*15:63 B*15:67 B*15:68 B*15:70 B*15:71 B*15:73 B*15:82 
B*15:86 B*18:01 B*18:02 B*18:03 B*18:04 B*18:05 B*18:06 B*18:07 B*18:08 B*18:09 B*18:11 B*18:13 B*18:14 B*18:18 B*18:19 B*18:20 B*18:28 B*18:33 B*27:01 B*27:02 B*27:03 B*27:04 B*27:05 B*27:06 B*27:07 B*27:08 B*27:09 B*27:10 B*27:11 B*27:12 B*27:13 B*27:14 B*27:19 B*27:20 B*27:21 B*27:30 B*27:39 B*35:01 B*35:02 B*35:03 B*35:04 B*35:05 B*35:06 B*35:08 B*35:09 B*35:10 B*35:11 B*35:12 B*35:13 B*35:14 B*35:15 B*35:16 B*35:17 B*35:18 B*35:19 B*35:20 B*35:21 B*35:22 B*35:23 B*35:24 B*35:25 B*35:27 B*35:28 B*35:29 B*35:30 B*35:31 B*35:32 B*35:33 B*35:34 B*35:36 B*35:43 B*35:46 B*35:51 B*35:77 B*35:89 B*37:01 B*37:02 B*37:04 B*37:05 B*38:01 B*38:02 B*38:04 B*38:05 B*38:06 B*38:15 B*39:01 B*39:02 B*39:03 B*39:04 B*39:05 B*39:06 B*39:07 B*39:08 B*39:09 B*39:10 B*39:11 B*39:12 B*39:13 B*39:14 B*39:15 B*39:23 B*39:24 B*39:31 B*39:34 B*40:01 B*40:02 B*40:03 B*40:04 B*40:05 B*40:06 B*40:07 B*40:08 B*40:09 B*40:10 B*40:11 B*40:12 B*40:14 B*40:15 B*40:16 B*40:18 B*40:19 B*40:20 B*40:21 B*40:23 B*40:27 B*40:31 B*40:35 B*40:36 B*40:37 B*40:38 B*40:39 B*40:40 B*40:42 B*40:44 B*40:49 B*40:50 B*40:52 B*40:64 B*40:80 B*41:01 B*41:02 B*41:03 B*42:01 B*42:02 B*44:02 B*44:03 B*44:04 B*44:05 B*44:06 B*44:07 B*44:08 B*44:09 B*44:10 B*44:12 B*44:13 B*44:15 B*44:18 B*44:20 B*44:21 B*44:22 B*44:26 B*44:27 B*44:29 B*44:31 B*44:59 B*45:01 B*45:02 B*45:04 B*45:06 B*46:01 B*46:02 B*46:13 B*47:01 B*47:02 B*47:03 B*48:01 B*48:02 B*48:03 B*48:04 B*48:05 B*48:06 B*48:07 B*48:08 B*49:01 B*49:02 B*49:03 B*50:01 B*50:02 B*50:04 B*50:05 B*51:01 B*51:02 B*51:03 B*51:04 B*51:05 B*51:06 B*51:07 B*51:08 B*51:09 B*51:10 B*51:12 B*51:13 B*51:14 B*51:15 B*51:18 B*51:21 B*51:22 B*51:27N B*51:29 B*51:31 B*51:32 B*51:33 B*51:34 B*51:36 B*51:37 B*51:63 B*51:65 B*52:01 B*52:02 B*52:06 B*53:01 B*53:02 B*53:03 B*53:04 B*53:05 B*53:07 B*53:08 B*54:01 B*54:02 B*55:01 B*55:02 B*55:03 B*55:04 B*55:07 B*55:08 B*55:10 B*55:12 B*55:16 B*55:46 B*56:01 B*56:02 B*56:03 B*56:04 B*56:05 B*56:06 B*56:07 B*56:09 B*56:11 B*57:01 
B*57:02 B*57:03 B*57:04 B*57:05 B*57:06 B*57:10 B*58:01 B*58:02 B*58:06 B*59:01 B*67:01 B*67:02 B*73:01 B*78:01 B*78:02 B*78:03 B*78:05 B*81:01 B*81:02 B*82:01 B*82:02 B*83:01 C*01:02 C*01:03 C*01:04 C*01:05 C*01:06 C*01:08 C*01:14 C*01:17 C*01:30 C*01:32 C*02:02 C*02:03 C*02:04 C*02:06 C*02:08 C*02:09 C*02:10 C*02:19 C*02:20 C*02:27 C*03:02 C*03:03 C*03:04 C*03:05 C*03:06 C*03:07 C*03:08 C*03:09 C*03:10 C*03:13 C*03:14 C*03:15 C*03:16 C*03:17 C*03:19 C*03:21 C*03:32 C*03:42 C*03:43 C*03:56 C*03:67 C*03:81 C*04:01 C*04:03 C*04:04 C*04:05 C*04:06 C*04:07 C*04:10 C*04:11 C*04:14 C*04:15 C*04:24 C*04:29 C*04:33 C*04:37 C*05:01 C*05:03 C*05:04 C*05:07N C*05:09 C*06:02 C*06:03 C*06:04 C*06:06 C*06:08 C*06:09 C*06:17 C*06:24 C*06:53 C*07:01 C*07:02 C*07:03 C*07:04 C*07:05 C*07:06 C*07:07 C*07:08 C*07:09 C*07:10 C*07:109 C*07:123 C*07:13 C*07:14 C*07:17 C*07:18 C*07:19 C*07:21 C*07:22 C*07:27 C*07:29 C*07:32N C*07:37 C*07:43 C*07:46 C*07:49 C*07:56 C*07:66 C*07:67 C*07:68 C*07:80 C*07:95 C*08:01 C*08:02 C*08:03 C*08:04 C*08:05 C*08:06 C*08:13 C*08:15 C*08:20 C*08:21 C*08:27 C*12:02 C*12:03 C*12:04 C*12:05 C*12:07 C*12:12 C*12:15 C*12:16 C*14:02 C*14:03 C*14:04 C*15:02 C*15:03 C*15:04 C*15:05 C*15:06 C*15:07 C*15:08 C*15:09 C*15:11 C*15:12 C*15:13 C*15:17 C*16:01 C*16:02 C*16:04 C*16:08 C*16:09 C*17:01 C*17:02 C*17:03 C*17:04 C*18:01 C*18:02 A*30:07 B*15:64 B*18:12""".split() + + +@dataclass +class PipelineConfig: + """Configuration for the OptiType pipeline.""" + + razers3_path: str = "razers3" + mapping_threads: int = 4 + solver: str = "glpk" + ilp_threads: int = 1 + delete_bam: bool = True + unpaired_weight: float = 0.0 + use_discordant: bool = False + + +@dataclass +class PipelineResult: + """Results from the OptiType pipeline.""" + + result_df: pd.DataFrame + result_4digit: pd.DataFrame + output_csv: str + output_plot: str + + +def get_num_threads(configured_threads: int) -> int: + """Get the number of threads to use, capped by available CPUs.""" + try: + cpu_count = 
multiprocessing.cpu_count() + except (ImportError, NotImplementedError): + return 2 + + return min(cpu_count, configured_threads) + + +def _is_frequent(allele_id: str, table: pd.DataFrame) -> bool: + """Check if an allele is in the frequent alleles list.""" + allele_id = allele_id.split("_")[0] + return ( + table.loc[allele_id]["4digit"] in FREQ_ALLELES + and table.loc[allele_id]["flags"] == 0 + ) or (table.loc[allele_id]["locus"] in "HGJ") + + +def _get_4digit(allele_id: str, table: pd.DataFrame) -> str: + """Get the 4-digit type for an allele.""" + allele_id = allele_id.split("_")[0] + return table.loc[allele_id]["4digit"] + + +def _get_types(allele_id: str | float, table: pd.DataFrame) -> str | float: + """Get the 4-digit type, handling non-string inputs.""" + if not isinstance(allele_id, str): + return allele_id + aa = allele_id.split("_") + return table.loc[aa[0]]["4digit"] + + +def load_config(config_path: str | Path | None = None) -> PipelineConfig: + """ + Load configuration from an INI file. + + Args: + config_path: Path to config file. If None, uses environment or defaults. + + Returns: + PipelineConfig with loaded settings. 
+ """ + config = ConfigParser(os.environ) + + if config_path and Path(config_path).exists(): + config.read(config_path) + + return PipelineConfig( + razers3_path=config.get("mapping", "razers3", fallback="razers3"), + mapping_threads=config.getint("mapping", "threads", fallback=4), + solver=config.get("ilp", "solver", fallback="glpk"), + ilp_threads=config.getint("ilp", "threads", fallback=1), + delete_bam=config.getboolean("behavior", "deletebam", fallback=True), + unpaired_weight=config.getfloat("behavior", "unpaired_weight", fallback=0.0), + use_discordant=config.getboolean("behavior", "use_discordant", fallback=False), + ) + + +def run_pipeline( + input_files: list[str], + seq_type: str, + output_dir: str, + prefix: str | None = None, + beta: float = 0.009, + enumerate_count: int = 1, + config: PipelineConfig | None = None, + verbose: bool = False, +) -> PipelineResult: + """ + Run the OptiType HLA typing pipeline. + + Args: + input_files: List of input FASTQ or BAM files (1 or 2 files). + seq_type: Sequencing type ('dna' or 'rna'). + output_dir: Output directory path. + prefix: Output file prefix (defaults to timestamp). + beta: Homozygosity detection parameter (0.0 to 0.1). + enumerate_count: Number of solutions to enumerate. + config: Pipeline configuration. + verbose: Enable verbose output. + + Returns: + PipelineResult with typing results. 
+ """ + if config is None: + config = PipelineConfig() + + # Set verbosity + configure_logging(verbose) + + # Validate inputs + if len(input_files) not in (1, 2): + raise ValueError("Number of input files must be 1 (single-end) or 2 (paired-end)") + + if seq_type not in ("dna", "rna"): + raise ValueError("seq_type must be 'dna' or 'rna'") + + if not 0.0 <= beta < 0.1: + raise ValueError("beta must be between 0.0 and 0.1") + + if enumerate_count <= 0: + raise ValueError("enumerate_count must be > 0") + + # Check input file extensions + input_extension = input_files[0].split(".")[-1].lower() + bam_input = input_extension in ("sam", "bam") + + # Create output directory + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Set up paths + if prefix is None: + prefix = datetime.datetime.fromtimestamp(time.time()).strftime("%Y_%m_%d_%H_%M_%S") + out_dir = os.path.join(output_dir, prefix) + else: + out_dir = output_dir + + Path(out_dir).mkdir(parents=True, exist_ok=True) + + extension = "bam" if PYSAM_AVAILABLE else "sam" + bam_paths = ( + input_files + if bam_input + else [os.path.join(out_dir, f"{prefix}_{i + 1}.{extension}") for i in range(len(input_files))] + ) + + ref_type = "nuc" if seq_type == "rna" else "gen" + is_paired = len(input_files) > 1 + + out_csv = os.path.join(out_dir, f"{prefix}_result.tsv") + out_plot = os.path.join(out_dir, f"{prefix}_coverage_plot.pdf") + + # Run mapping if needed + if not bam_input: + if not shutil.which(config.razers3_path): + raise FileNotFoundError( + f"RazerS3 not found: {config.razers3_path!r}. 
" + "Install with: conda install -c bioconda razers3" + ) + + threads = get_num_threads(config.mapping_threads) + logger.debug("Mapping with %d threads...", threads) + + mapping_ref = get_reference_fasta(seq_type) + for sample, outbam in zip(input_files, bam_paths): + logger.debug("%s Mapping %s to %s reference...", elapsed(), os.path.basename(sample), ref_type.upper()) + + cmd = [ + config.razers3_path, + "-i", "97", + "-m", "99999", + "--distance-range", "0", + "-pa", + "-tc", str(threads), + "-o", str(outbam), + str(mapping_ref), + str(sample), + ] + subprocess.run(cmd, check=True) + + # Load reference data + table, features = load_reference_data() + + logger.debug("%s Generating binary hit matrix.", elapsed()) + + # Process alignments + if is_paired: + pos, read_details = pysam_to_dataframe(bam_paths[0]) + binary1 = np.sign(pos) + + pos2, read_details2 = pysam_to_dataframe(bam_paths[1]) + binary2 = np.sign(pos2) + + if not bam_input and config.delete_bam: + os.remove(bam_paths[0]) + os.remove(bam_paths[1]) + + id1 = set(binary1.index) + id2 = set(binary2.index) + + # Handle read ID suffixes + if len(set(r[-1] for r in id1)) == 1 and len(set(r[-1] for r in id2)) == 1: + cut_last_char = lambda x: x[:-1] + binary1.index = list(map(cut_last_char, binary1.index)) + binary2.index = list(map(cut_last_char, binary2.index)) + pos.index = list(map(cut_last_char, pos.index)) + pos2.index = list(map(cut_last_char, pos2.index)) + read_details.index = list(map(cut_last_char, read_details.index)) + read_details2.index = list(map(cut_last_char, read_details2.index)) + + binary_p, binary_mis, binary_un = ht.create_paired_matrix(binary1, binary2) + + if binary_p.shape[0] < len(id1) * 0.1: + logger.warning( + "Less than 10%% of reads could be paired. 
Consider an appropriate " + "unpaired_weight setting in your config file (currently %.3f), " + "because you may need to resort to using unpaired reads.", + config.unpaired_weight, + ) + + if config.unpaired_weight > 0: + if config.use_discordant: + binary = pd.concat([binary_p, binary_un, binary_mis]) + else: + binary = pd.concat([binary_p, binary_un]) + else: + binary = binary_p + else: + pos, read_details = pysam_to_dataframe(bam_paths[0]) + + if not bam_input and config.delete_bam: + os.remove(bam_paths[0]) + + binary = np.sign(pos) + + # Filter to frequent alleles + alleles_to_keep = [col for col in binary.columns if _is_frequent(col, table)] + binary = binary[alleles_to_keep] + + logger.debug("%s Temporary pruning of identical rows and columns", elapsed()) + + unique_col, representing = ht.prune_identical_alleles(binary, report_groups=True) + representing_df = pd.DataFrame( + [[a1, a2] for a1, a_l in representing.items() for a2 in a_l], + columns=["representative", "represented"], + ) + + temp_pruned = ht.prune_identical_reads(unique_col) + + logger.debug("%s Size of mtx with unique rows and columns: %s", elapsed(), temp_pruned.shape) + logger.debug("%s Determining minimal set of non-overshadowed alleles", elapsed()) + + minimal_alleles = ht.prune_overshadowed_alleles(temp_pruned) + + logger.debug("%s Keeping only the minimal number of required alleles %s", elapsed(), minimal_alleles.shape) + + binary = binary[minimal_alleles] + + logger.debug("%s Creating compact model...", elapsed()) + + if is_paired and config.unpaired_weight > 0: + if config.use_discordant: + compact_mtx, compact_occ = ht.get_compact_model( + binary_p[minimal_alleles], + pd.concat([binary_un, binary_mis])[minimal_alleles], + weight=config.unpaired_weight, + ) + else: + compact_mtx, compact_occ = ht.get_compact_model( + binary_p[minimal_alleles], + binary_un[minimal_alleles], + weight=config.unpaired_weight, + ) + else: + compact_mtx, compact_occ = ht.get_compact_model(binary) + + 
allele_ids = binary.columns + + groups_4digit = defaultdict(list) + for allele in allele_ids: + type_4digit = _get_4digit(allele, table) + groups_4digit[type_4digit].append(allele) + + sparse_dict = ht.mtx_to_sparse_dict(compact_mtx) + threads = get_num_threads(config.ilp_threads) + + logger.debug("Starting ILP solver with %d threads...", threads) + logger.debug("%s Initializing OptiType model...", elapsed()) + + op = OptiType( + sparse_dict, + compact_occ, + groups_4digit, + table, + beta, + 2, + config.solver, + threads, + verbosity=verbose, + ) + result = op.solve(enumerate_count) + + logger.debug("%s Result dataframe has been constructed...", elapsed()) + + result_4digit = result.map(lambda x: _get_types(x, table)) + for col in ["A1", "A2", "B1", "B2", "C1", "C2"]: + if col not in result_4digit: + result_4digit[col] = None + + r = result_4digit[["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"]] + + # Write results + r.to_csv( + out_csv, + sep="\t", + columns=["A1", "A2", "B1", "B2", "C1", "C2", "nof_reads", "obj"], + header=["A1", "A2", "B1", "B2", "C1", "C2", "Reads", "Objective"], + ) + + # Generate coverage plot + hlatype = result.iloc[0].reindex(["A1", "A2", "B1", "B2", "C1", "C2"]).drop_duplicates().dropna() + features_used = ( + [("intron", 1), ("exon", 2), ("intron", 2), ("exon", 3), ("intron", 3)] + if seq_type == "dna" + else [("exon", 2), ("exon", 3)] + ) + + plot_variables = ( + [pos, read_details, pos2, read_details2, (binary_p, binary_un, binary_mis)] + if is_paired + else [pos, read_details] + ) + + coverage_mat = ht.calculate_coverage(plot_variables, features, hlatype, features_used) + ht.plot_coverage(out_plot, coverage_mat, table, features, features_used) + + return PipelineResult( + result_df=result, + result_4digit=r, + output_csv=out_csv, + output_plot=out_plot, + ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..f9849c8 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""OptiType test 
suite."""
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..158df44
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,48 @@
+"""Tests for CLI interface."""
+
+from click.testing import CliRunner
+
+from optitype.cli import main
+
+
+def test_cli_help():
+    """Test that help command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["--help"])
+    assert result.exit_code == 0
+    assert "OptiType" in result.output
+
+
+def test_cli_version():
+    """Test that version command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["--version"])
+    assert result.exit_code == 0
+    assert "1.5.0" in result.output
+
+
+def test_cli_run_help():
+    """Test that run subcommand help works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["run", "--help"])
+    assert result.exit_code == 0
+    assert "--input" in result.output
+    assert "--dna" in result.output
+    assert "--rna" in result.output
+
+
+def test_cli_check_deps():
+    """Test that check-deps command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["check-deps"])
+    # Exit code may vary based on installed dependencies
+    assert "RazerS3" in result.output
+    assert "GLPK" in result.output
+
+
+def test_cli_info():
+    """Test that info command works."""
+    runner = CliRunner()
+    result = runner.invoke(main, ["info"])
+    assert result.exit_code == 0
+    assert "OptiType version" in result.output
diff --git a/tests/test_io.py b/tests/test_io.py
new file mode 100644
index 0000000..d1b1a77
--- /dev/null
+++ b/tests/test_io.py
@@ -0,0 +1,50 @@
+"""Tests for I/O utilities."""
+
+import pytest
+import pandas as pd
+
+from optitype.io.data import get_data_path, load_reference_data, get_reference_fasta
+
+
+def test_get_data_path():
+    """Test that data path can be found."""
+    data_path = get_data_path()
+    assert data_path.exists()
+    assert data_path.is_dir()
+
+
+def test_load_reference_data():
+    """Test loading reference data from Parquet files."""
+    table, features = load_reference_data()
+
+    assert isinstance(table, pd.DataFrame)
+    assert isinstance(features, pd.DataFrame)
+
+    # Check table structure
+    assert "id" in table.columns
+    assert "4digit" in table.columns
+    assert "locus" in table.columns
+    assert len(table) > 0
+
+    # Check features structure
+    assert "id" in features.columns
+    assert "feature" in features.columns
+    assert "number" in features.columns
+    assert len(features) > 0
+
+
+def test_get_reference_fasta():
+    """Test getting reference FASTA paths."""
+    dna_ref = get_reference_fasta("dna")
+    assert dna_ref.exists()
+    assert "dna" in dna_ref.name
+
+    rna_ref = get_reference_fasta("rna")
+    assert rna_ref.exists()
+    assert "rna" in rna_ref.name
+
+
+def test_get_reference_fasta_invalid():
+    """Test that invalid seq_type raises ValueError."""
+    with pytest.raises(ValueError):
+        get_reference_fasta("invalid")
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
new file mode 100644
index 0000000..dc7bf1a
--- /dev/null
+++ b/tests/test_pipeline.py
@@ -0,0 +1,175 @@
+"""Smoke tests for the pipeline and API."""
+
+import shutil
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from optitype import hlatyper as ht
+from optitype.io.data import load_reference_data
+from optitype.pipeline import PipelineConfig, run_pipeline, load_config, _is_frequent, _get_4digit, _get_types
+
+# Check for external dependencies
+HAS_RAZERS3 = shutil.which("razers3") is not None
+HAS_SOLVER = shutil.which("glpsol") is not None or shutil.which("cbc") is not None
+
+# Test data paths
+TEST_RNA_1 = Path(__file__).parent.parent / "test" / "rna" / "CRC_81_N_1_fished.fastq"
+TEST_RNA_2 = Path(__file__).parent.parent / "test" / "rna" / "CRC_81_N_2_fished.fastq"
+
+
+class TestPipelineConfig:
+    """Tests for PipelineConfig and load_config."""
+
+    def test_default_config(self):
+        config = PipelineConfig()
+        assert config.razers3_path == "razers3"
+        assert config.solver == "glpk"
+        assert config.mapping_threads == 4
+        assert config.ilp_threads == 1
+        assert config.delete_bam is True
+
+    def test_load_config_no_file(self):
+        config = load_config(None)
+        assert config.razers3_path == "razers3"
+        assert config.solver == "glpk"
+
+    def test_load_config_missing_file(self):
+        config = load_config("/nonexistent/config.ini")
+        assert config.razers3_path == "razers3"
+
+
+class TestHelperFunctions:
+    """Tests for pipeline helper functions."""
+
+    def test_is_frequent(self):
+        table, _ = load_reference_data()
+        # Collect matches explicitly so the test fails loudly instead of passing vacuously
+        matches = [i for i in table.index
+                   if table.loc[i]["4digit"] == "A*02:01" and table.loc[i]["flags"] == 0]
+        assert matches, "frequent allele A*02:01 (flags == 0) missing from reference table"
+        assert _is_frequent(matches[0], table)
+
+    def test_get_4digit(self):
+        table, _ = load_reference_data()
+        # Pick any allele and check it returns a string with '*'
+        idx = table.index[0]
+        result = _get_4digit(idx, table)
+        assert isinstance(result, str)
+        assert "*" in result
+
+    def test_get_types_with_string(self):
+        table, _ = load_reference_data()
+        idx = table.index[0]
+        result = _get_types(idx, table)
+        assert isinstance(result, str)
+
+    def test_get_types_with_non_string(self):
+        table, _ = load_reference_data()
+        assert _get_types(3.14, table) == 3.14
+        assert np.isnan(_get_types(float("nan"), table))
+
+
+class TestHlatyper:
+    """Tests for hlatyper matrix operations."""
+
+    def test_get_compact_model(self):
+        df = pd.DataFrame(
+            [[1, 0, 1], [1, 0, 1], [0, 1, 0], [0, 0, 0]],
+            columns=["A", "B", "C"],
+            index=["r1", "r2", "r3", "r4"],
+        )
+        unique_mtx, occ = ht.get_compact_model(df)
+        # r4 is all-zero, removed. r1 and r2 are duplicates.
+        assert unique_mtx.shape[0] == 2
+        assert sum(occ.values()) == 3  # 2 for r1/r2 + 1 for r3
+
+    def test_mtx_to_sparse_dict(self):
+        df = pd.DataFrame(
+            [[1, 0], [0, 1]],
+            columns=["A", "B"],
+            index=["r1", "r2"],
+        )
+        sparse = ht.mtx_to_sparse_dict(df)
+        assert ("r1", "A") in sparse
+        assert ("r2", "B") in sparse
+        assert ("r1", "B") not in sparse
+
+    def test_prune_identical_reads(self):
+        df = pd.DataFrame(
+            [[1, 0], [1, 0], [0, 1]],
+            columns=["A", "B"],
+            index=["r1", "r2", "r3"],
+        )
+        pruned = ht.prune_identical_reads(df)
+        assert pruned.shape[0] == 2
+
+    def test_create_paired_matrix(self):
+        b1 = pd.DataFrame(
+            [[1, 0], [1, 1]],
+            columns=["A", "B"],
+            index=["r1", "r2"],
+        )
+        b2 = pd.DataFrame(
+            [[1, 0], [0, 1]],
+            columns=["A", "B"],
+            index=["r1", "r3"],
+        )
+        paired, mispaired, unpaired = ht.create_paired_matrix(b1, b2)
+        assert "r1" in paired.index  # r1 is in both
+        assert "r2" in unpaired.index or "r3" in unpaired.index
+
+
+class TestPipelineValidation:
+    """Tests for pipeline input validation."""
+
+    def test_invalid_file_count(self, tmp_path):
+        f1 = tmp_path / "a.fq"
+        f2 = tmp_path / "b.fq"
+        f3 = tmp_path / "c.fq"
+        for f in (f1, f2, f3):
+            f.write_text("")
+        with pytest.raises(ValueError, match="1.*or 2"):
+            run_pipeline([str(f1), str(f2), str(f3)], "dna", str(tmp_path))
+
+    def test_invalid_seq_type(self, tmp_path):
+        f = tmp_path / "a.fq"
+        f.write_text("")
+        with pytest.raises(ValueError, match="dna.*rna"):
+            run_pipeline([str(f)], "invalid", str(tmp_path))
+
+    def test_invalid_beta(self, tmp_path):
+        f = tmp_path / "a.fq"
+        f.write_text("")
+        with pytest.raises(ValueError, match="beta"):
+            run_pipeline([str(f)], "dna", str(tmp_path), beta=0.5)
+
+    def test_invalid_enumerate(self, tmp_path):
+        f = tmp_path / "a.fq"
+        f.write_text("")
+        with pytest.raises(ValueError, match="enumerate"):
+            run_pipeline([str(f)], "dna", str(tmp_path), enumerate_count=0)
+
+
+@pytest.mark.skipif(
+    not (HAS_RAZERS3 and HAS_SOLVER and TEST_RNA_1.exists()),
+    reason="Requires razers3, ILP solver, and test data",
+)
+class TestPipelineIntegration:
+    """Integration tests that run the full pipeline (require external tools)."""
+
+    def test_rna_paired_end(self, tmp_path):
+        result = run_pipeline(
+            input_files=[str(TEST_RNA_1), str(TEST_RNA_2)],
+            seq_type="rna",
+            output_dir=str(tmp_path),
+            verbose=False,
+        )
+        assert result.result_df is not None
+        assert Path(result.output_csv).exists()
+        assert Path(result.output_plot).exists()
+        # Check result has expected columns
+        for col in ["A1", "A2", "B1", "B2", "C1", "C2"]:
+            assert col in result.result_4digit.columns