diff --git a/.gitignore b/.gitignore index e741616..84dd975 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ gvcf/* results/* logs/* +logs/** example_data/* .snakemake/ slurm-*.out diff --git a/README.md b/README.md index 0a360ec..f54a080 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ The pipeline can be run one of two ways, both from the repo root. **It is recomm A default SLURM profile is provided under `profiles/slurm/`. Edit `profiles/slurm/config.yaml` to customize sbatch options if needed. Defaults for account/partition and baseline resources are set in `config.yaml` (`slurm_account`, `slurm_partition`, `default_*`). SLURM stdout/stderr logs are written to `logs/slurm/` by default. +The provided profile sets `jobs: 200` (Snakemake's max concurrent workflow jobs, equivalent to `-j 200`), which you can override on the command line if needed. ```bash snakemake --profile profiles/slurm @@ -55,7 +56,7 @@ snakemake -j 8 Common options: -- `-j `: number of parallel jobs +- `-j `: number of parallel Snakemake jobs (not per-job threads/CPUs) - `--rerun-incomplete`: clean up partial outputs - `--printshellcmds`: show executed commands - `--notemp`: keep temporary intermediates (see Notes) @@ -71,21 +72,48 @@ By default the workflow uses these locations (override in `config.yaml`): - `results/combined/combined..gvcf.gz` : merged gVCF per contig - `results/split/combined..inv` : invariant sites - `results/split/combined..filtered` : filtered sites -- `results/split/combined..clean` : clean sites +- `results/split/combined..clean.vcf.gz` : clean sites (bgzip-compressed by default) - `results/split/combined..missing.bed` : missing positions -- `results/split/combined..filtered.bed` : merged mask bed +- `results/split/combined..clean.mask.bed` : merged mask bed - `results/split/combined..coverage.txt` : split coverage validation summary - `results/split/combined..accessible.npz` : boolean accessibility array (union of clean + invariant sites), for scikit-allel statistics - `results/summary.html` : HTML summary of jobs run, outputs created, and warnings + - Contig remaps (e.g., configured `chr1` remapped to reference contig `1`) are reported under the remap warning and are not repeated in the generic MAF-vs-reference mismatch warnings. + +## Testing + +Run from the repo root: + +```bash +pytest -q +``` + +- Runs unit/fast tests by default. +- Some integration tests are gated and skipped unless enabled. + +To run integration tests too: + +```bash +RUN_INTEGRATION=1 pytest -q +``` + +- Requires the `argprep` conda environment and external tools used by the workflow (e.g., `snakemake`, `bcftools`, `tabix`, `bgzip`, `gatk`, `java`; some tests also need `samtools`). +- These tests take longer and may run parts of the Snakemake workflow. + +For a quick SLURM-profile syntax/config sanity check without submitting jobs, use a dry run: + +```bash +snakemake --profile profiles/slurm -n +``` ## Notes -- If `bgzip_output: true`, the `.inv`, `.filtered`, `.clean`, and `.missing.bed` files will have a `.gz` suffix. +- `bgzip_output` defaults to `true`; by default the `.inv`, `.filtered`, `.clean.vcf`, and `.missing.bed` files are bgzip-compressed (`.gz` suffix). - All gzipped outputs in this pipeline use bgzip (required for `tabix`). - `scripts/dropSV.py` removes indels larger than `drop_cutoff` (if set in `config.yaml`). - `scripts/split.py` supports `--filter-multiallelic` and `--bgzip-output` (toggle via `config.yaml`). - `scripts/filt_to_bed.py` merges `.filtered`, `.missing.bed`, and `dropped_indels.bed` into a final mask bed. -- `make_accessibility` builds a per-contig accessibility array from the union of `combined..clean` and `combined..inv` using the reference `.fai` to size the array. The output is a compressed NumPy archive containing a boolean array named `mask`, intended for scikit-allel statistics. +- `make_accessibility` builds a per-contig accessibility array from the union of `combined..clean.vcf.gz` and `combined..inv.gz` (default names) using the reference `.fai` to size the array. The output is a compressed NumPy archive containing a boolean array named `mask`, intended for scikit-allel statistics. - Ploidy is inferred from MAF block structure by default (max non-reference `s` lines per block, typically `1` for pairwise MAFs). You can override with `ploidy` in `config.yaml`. - Optional: enable `vt_normalize: true` in `config.yaml` to normalize merged gVCFs with `vt normalize` after `SelectVariants`. - If GenomicsDBImport fails with a buffer-size error, increase `genomicsdb_vcf_buffer_size` and `genomicsdb_segment_size` in `config.yaml` (set them above your longest gVCF line length). @@ -104,7 +132,7 @@ By default the workflow uses these locations (override in `config.yaml`): - New/expanded validation and tests: split coverage checks, filtered‑bed tests, integration tests gated by `RUN_INTEGRATION=1`. - Example data regenerated via msprime with indels and missing data and AnchorWave‑style MAF formatting. - `check_split_coverage.py` now reports overlap intervals with file names to aid debugging. -- `filt_to_bed.py` filters masks to the target contig, preventing cross‑contig lines in `combined..filtered.bed`. +- `filt_to_bed.py` filters masks to the target contig, preventing cross‑contig lines in `combined..clean.mask.bed`. - SLURM default resources now read `default_*` from `config.yaml` instead of hardcoded profile values. ## Changes since v0.2 @@ -123,11 +151,11 @@ By default the workflow uses these locations (override in `config.yaml`): ### ARG estimation -Use Nate Pope's SINGER Snakemake [pipeline](https://github.com/nspope/singer-snakemake) with `combined..clean` and `combined..filtered.bed` as inputs. +Use Nate Pope's SINGER Snakemake [pipeline](https://github.com/nspope/singer-snakemake) with `combined..clean.vcf.gz` and `combined..clean.mask.bed` as inputs. ### Population genetic statistics -If you use scikit-allel, you can use the `combined..clean` VCF and load the accessibility mask like this: +If you use scikit-allel, you can use the `combined..clean.vcf.gz` VCF and load the accessibility mask like this: ```python import numpy as np diff --git a/Snakefile b/Snakefile index bdaef9a..5dc0174 100644 --- a/Snakefile +++ b/Snakefile @@ -24,7 +24,7 @@ FILL_GAPS = str(config.get("fill_gaps", "false")).lower() OUTPUT_JUST_GT = bool(config.get("outputJustGT", False)) DROP_CUTOFF = config.get("drop_cutoff", "") FILTER_MULTIALLELIC = bool(config.get("filter_multiallelic", False)) -BGZIP_OUTPUT = bool(config.get("bgzip_output", False)) +BGZIP_OUTPUT = bool(config.get("bgzip_output", True)) GENOMICSDB_VCF_BUFFER_SIZE = int(config.get("genomicsdb_vcf_buffer_size", 1048576)) GENOMICSDB_SEGMENT_SIZE = int(config.get("genomicsdb_segment_size", 1048576)) MAF_TO_GVCF_THREADS = int(config.get("maf_to_gvcf_threads", 2)) @@ -39,7 +39,6 @@ MERGE_CONTIG_TIME = str(config.get("merge_contig_time", config.get("default_time DEFAULT_JAVA_MEM_MB = max(256, int(DEFAULT_MEM_MB * 0.9)) VT_NORMALIZE = bool(config.get("vt_normalize", False)) VT_PATH = str(config.get("vt_path", "vt")) -MERGED_GENOTYPER = "selectvariants" workflow.global_resources["merge_gvcf_jobs"] = int(config.get("merge_gvcf_max_jobs", 4)) @@ -68,24 +67,49 @@ def _normalize_contig(name: str) -> str: return name if name else "0" -def _read_maf_contigs() -> set[str]: +def _maf_path_for_sample(sample: str) -> Path: + maf = MAF_DIR / f"{sample}.maf" + maf_gz = MAF_DIR / f"{sample}.maf.gz" + if maf.exists(): + return maf + if maf_gz.exists(): + return maf_gz + return maf + + +def _read_maf_contigs(maf_path: Path) -> set[str]: contigs = set() - maf_files = list(MAF_DIR.glob("*.maf")) + list(MAF_DIR.glob("*.maf.gz")) - for maf in sorted(maf_files): - try: - if maf.name.endswith(".gz"): - handle = gzip.open(maf, "rt", encoding="utf-8") - else: - handle = maf.open("r", encoding="utf-8") - with handle: - for line in handle: - if not line or line.startswith("#"): - continue - parts = line.split() - if parts and parts[0] == "s" and len(parts) >= 2: - contigs.add(parts[1]) - except OSError: - continue + try: + if maf_path.name.endswith(".gz"): + handle = gzip.open(maf_path, "rt", encoding="utf-8") + else: + handle = maf_path.open("r", encoding="utf-8") + with handle: + first_src = None + for line in handle: + if not line or line.startswith("#"): + continue + stripped = line.strip() + if not stripped: + if first_src is not None: + contigs.add(first_src) + first_src = None + continue + parts = stripped.split() + if not parts: + continue + if parts[0] == "a": + if first_src is not None: + contigs.add(first_src) + first_src = None + continue + if parts[0] == "s" and len(parts) >= 2 and first_src is None: + # Use first sequence src in each block as the reference-side contig. + first_src = parts[1] + if first_src is not None: + contigs.add(first_src) + except OSError: + return set() return contigs @@ -172,6 +196,23 @@ def _discover_samples(): return sorted(samples) +def _read_maf_contig_sets(samples: list[str]) -> dict[str, set[str]]: + contigs_by_sample: dict[str, set[str]] = {} + for sample in samples: + maf_path = _maf_path_for_sample(sample) + contigs_by_sample[sample] = _read_maf_contigs(maf_path) + return contigs_by_sample + + +def _contigs_not_in_all_mafs(contigs_by_sample: dict[str, set[str]]) -> list[str]: + if not contigs_by_sample: + return [] + all_sets = list(contigs_by_sample.values()) + union = set().union(*all_sets) if all_sets else set() + intersection = set.intersection(*all_sets) if all_sets else set() + return sorted(union - intersection) + + def _infer_ploidy_from_maf(maf_path: Path, max_blocks: int = 10000) -> int: opener = gzip.open if maf_path.suffix == ".gz" else open max_non_ref = 0 @@ -282,34 +323,14 @@ def _resolve_requested_contigs( return kept, dropped, remapped -def _read_contigs(): - orig_fai = REF_FASTA.with_suffix(REF_FASTA.suffix + ".fai") - if not orig_fai.exists(): - raise ValueError( - "Reference FASTA index (.fai) not found. " - "Either run 'samtools faidx' on the reference or set 'contigs' in config.yaml." - ) - target_fai = Path(REF_FAI) - if target_fai.exists(): - ref_contigs = _read_fai_contigs(target_fai) - else: - ref_contigs = _read_fai_contigs(orig_fai) - - if "contigs" in config: - requested = [str(c) for c in config["contigs"]] - kept, dropped, remapped = _resolve_requested_contigs(requested, ref_contigs) - if not kept: - raise ValueError( - "None of the configured contigs are present in reference .fai: " - + ", ".join(requested[:10]) - ) - return kept, dropped, requested, remapped - return ref_contigs, [], list(ref_contigs), [] - - SAMPLES = _discover_samples() PLOIDY, PLOIDY_SOURCE, PLOIDY_FILE_VALUES, PLOIDY_WARNINGS = _resolve_ploidy(SAMPLES) -CONTIGS, DROPPED_CONTIGS_NOT_IN_REF, REQUESTED_CONTIGS, REMAPPED_CONTIGS = _read_contigs() +MAF_CONTIGS_BY_SAMPLE = _read_maf_contig_sets(SAMPLES) +if MAF_CONTIGS_BY_SAMPLE: + MAF_CONTIG_INTERSECTION = sorted(set.intersection(*MAF_CONTIGS_BY_SAMPLE.values())) +else: + MAF_CONTIG_INTERSECTION = [] +CONTIGS_NOT_IN_ALL_MAFS = _contigs_not_in_all_mafs(MAF_CONTIGS_BY_SAMPLE) GVCF_BASES = [f"{sample}To{REF_BASE}" for sample in SAMPLES] @@ -330,7 +351,20 @@ def _active_contig_resolution() -> tuple[list[str], list[str], list[str], list[t + ", ".join(requested[:10]) ) return kept, dropped, requested, remapped - return available, [], list(available), [] + + requested = list(MAF_CONTIG_INTERSECTION) + if not requested: + raise ValueError( + "No contigs are shared across all MAF files. " + "Set explicit 'contigs' in config.yaml to override." + ) + kept, dropped, remapped = _resolve_requested_contigs(requested, available) + if not kept: + raise ValueError( + "No shared MAF contigs are present in renamed reference .fai. " + "Set explicit 'contigs' in config.yaml to override." + ) + return kept, dropped, requested, remapped def _active_contigs() -> list[str]: @@ -341,7 +375,7 @@ def _all_targets(_wc): contigs = _active_contigs() return ( [str(_combined_out(c)) for c in contigs] - + [str(_split_prefix(c)) + ".filtered.bed" for c in contigs] + + [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in contigs] + [str(_split_prefix(c)) + ".coverage.txt" for c in contigs] + [str(_accessibility_out(c)) for c in contigs] + [str(RESULTS_DIR / "summary.html")] @@ -388,6 +422,8 @@ def _accessibility_out(contig): SPLIT_SUFFIX = ".gz" if BGZIP_OUTPUT else "" +CLEAN_VCF_SUFFIX = ".clean.vcf" + SPLIT_SUFFIX +MASK_BED_SUFFIX = ".clean.mask.bed" rule all: # Final targets: merged gVCFs plus filtered bed masks per contig. @@ -460,11 +496,17 @@ def _summary_jobs() -> list[tuple[str, list[str]]]: [ str(_split_prefix(c)) + suffix for c in contigs - for suffix in (".inv", ".filtered", ".clean", ".missing.bed") + for suffix in ( + ".inv" + SPLIT_SUFFIX, + ".filtered" + SPLIT_SUFFIX, + CLEAN_VCF_SUFFIX, + ".missing.bed" + SPLIT_SUFFIX, + ".missing_gt_snp_by_sample.tsv", + ) ], ), ("check_split_coverage", [str(_split_prefix(c)) + ".coverage.txt" for c in contigs]), - ("mask_bed", [str(_split_prefix(c)) + ".filtered.bed" for c in contigs]), + ("mask_bed", [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in contigs]), ("make_accessibility", [str(_accessibility_out(c)) for c in contigs]), ] ) @@ -491,8 +533,8 @@ def _summary_temp_paths() -> set[str]: def _summary_arg_outputs() -> list[str]: contigs = _active_contigs() return ( - [str(_split_prefix(c)) + ".clean" for c in contigs] - + [str(_split_prefix(c)) + ".filtered.bed" for c in contigs] + [str(_split_prefix(c)) + CLEAN_VCF_SUFFIX for c in contigs] + + [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in contigs] + [str(_accessibility_out(c)) for c in contigs] ) @@ -501,10 +543,11 @@ rule summary_report: # Write an HTML summary of jobs, outputs, and warnings. input: combined=lambda wc: [str(_combined_out(c)) for c in _active_contigs()], - beds=lambda wc: [str(_split_prefix(c)) + ".filtered.bed" for c in _active_contigs()], + beds=lambda wc: [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in _active_contigs()], invs=lambda wc: [str(_split_prefix(c)) + ".inv" + SPLIT_SUFFIX for c in _active_contigs()], filts=lambda wc: [str(_split_prefix(c)) + ".filtered" + SPLIT_SUFFIX for c in _active_contigs()], - cleans=lambda wc: [str(_split_prefix(c)) + ".clean" + SPLIT_SUFFIX for c in _active_contigs()], + cleans=lambda wc: [str(_split_prefix(c)) + CLEAN_VCF_SUFFIX for c in _active_contigs()], + missing_gt_stats=lambda wc: [str(_split_prefix(c)) + ".missing_gt_snp_by_sample.tsv" for c in _active_contigs()], dropped=str(GVCF_DIR / "cleangVCF" / "dropped_indels.bed"), output: report=str(RESULTS_DIR / "summary.html"), @@ -526,6 +569,7 @@ rule summary_report: ploidy_source=PLOIDY_SOURCE, ploidy_file_values=PLOIDY_FILE_VALUES, ploidy_warnings=PLOIDY_WARNINGS, + contigs_not_in_all_mafs=CONTIGS_NOT_IN_ALL_MAFS, dropped_contigs_not_in_ref=lambda wc: _active_contig_resolution()[1], requested_contigs=lambda wc: _active_contig_resolution()[2], remapped_contigs=lambda wc: _active_contig_resolution()[3], @@ -790,8 +834,9 @@ rule split_gvcf: output: inv=str(RESULTS_DIR / "split" / ("combined.{contig}.inv" + SPLIT_SUFFIX)), filt=str(RESULTS_DIR / "split" / ("combined.{contig}.filtered" + SPLIT_SUFFIX)), - clean=str(RESULTS_DIR / "split" / ("combined.{contig}.clean" + SPLIT_SUFFIX)), + clean=str(RESULTS_DIR / "split" / ("combined.{contig}.clean.vcf" + SPLIT_SUFFIX)), missing=str(RESULTS_DIR / "split" / ("combined.{contig}.missing.bed" + SPLIT_SUFFIX)), + missing_gt_stats=str(RESULTS_DIR / "split" / "combined.{contig}.missing_gt_snp_by_sample.tsv"), params: filter_multiallelic=FILTER_MULTIALLELIC, bgzip_output=BGZIP_OUTPUT, @@ -807,6 +852,7 @@ rule split_gvcf: if [ "{params.bgzip_output}" = "True" ]; then cmd+=(--bgzip-output) fi + cmd+=(--missing-gt-stats-out "{output.missing_gt_stats}") cmd+=("{input.gvcf}") "${{cmd[@]}}" """ @@ -815,9 +861,9 @@ rule split_gvcf: rule check_split_coverage: # Validate that clean + inv + filtered bed sum to contig length. input: - clean=lambda wc: str(_split_prefix(wc.contig)) + ".clean" + SPLIT_SUFFIX, + clean=lambda wc: str(_split_prefix(wc.contig)) + CLEAN_VCF_SUFFIX, inv=lambda wc: str(_split_prefix(wc.contig)) + ".inv" + SPLIT_SUFFIX, - bed=lambda wc: str(_split_prefix(wc.contig)) + ".filtered.bed", + bed=lambda wc: str(_split_prefix(wc.contig)) + MASK_BED_SUFFIX, fai=REF_FAI, output: report=str(RESULTS_DIR / "split" / "combined.{contig}.coverage.txt"), @@ -839,7 +885,7 @@ rule mask_bed: missing=lambda wc: str(_split_prefix(wc.contig)) + ".missing.bed" + SPLIT_SUFFIX, dropped=str(GVCF_DIR / "cleangVCF" / "dropped_indels.bed"), output: - bed=str(RESULTS_DIR / "split" / "combined.{contig}.filtered.bed"), + bed=str(RESULTS_DIR / "split" / "combined.{contig}.clean.mask.bed"), params: prefix=lambda wc: str(_split_prefix(wc.contig)), shell: @@ -854,7 +900,7 @@ rule mask_bed: rule make_accessibility: # Build boolean accessibility array from clean + inv VCFs per contig. input: - clean=lambda wc: str(_split_prefix(wc.contig)) + ".clean" + SPLIT_SUFFIX, + clean=lambda wc: str(_split_prefix(wc.contig)) + CLEAN_VCF_SUFFIX, inv=lambda wc: str(_split_prefix(wc.contig)) + ".inv" + SPLIT_SUFFIX, fai=REF_FAI, output: diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..8db0191 --- /dev/null +++ b/changelog.md @@ -0,0 +1,38 @@ +## Changes since v0.3 + +- Added/expanded contig handling across the workflow: reference renaming from gVCF/MAF contigs, numeric-suffix normalization, configured-contig remapping (e.g. `chr1` -> `1`), filtering contigs to the reference `.fai`, and defaulting to the shared MAF-contig intersection when `contigs:` is not set. +- Improved missing-contig behavior and reporting: split/merge steps now skip missing per-sample contigs more safely, preserve valid placeholders, and surface clearer warnings in the HTML summary. +- Expanded HTML summary reporting with stronger warning parsing, MAF-vs-reference contig mismatch warnings, ploidy reporting, and a missing-genotype exclusion histogram. +- Fixed summary warning de-duplication so contigs reported in "Configured contigs were remapped..." are not also repeated in generic MAF/reference mismatch warnings. +- Improved split/filter/mask robustness: refined split classification and filtered BED span handling, gzip/bgzip split-output support, gzipped missing-BED input support, and better empty-coverage handling in `check_split_coverage.py`. +- Added accessibility mask outputs (`combined..accessible.npz`) for scikit-allel workflows and documented downstream use. +- Added ploidy inference from MAF blocks (with summary reporting) while retaining config override support. +- Added and refined resource/concurrency controls: configurable `maf_to_gvcf_*` and `merge_contig_*` resources, `merge_gvcf_max_jobs`, `maf_to_gvcf_array_max_jobs`, GenomicsDB buffer tuning, and `default_mem_mb` fallback behavior. +- Improved SLURM profile/runtime behavior: numeric default resource values in the profile, SLURM stdout/stderr written to `logs/slurm/`, and clearer documentation of profile `jobs` (Snakemake parallelism) vs per-job CPUs. +- Added substantial test coverage: pytest scaffolding, split/filt/dropSV unit tests, coverage checks, integration tests (including contig mismatch/default-contig behavior), random-position validation, summary-report regressions, Snakefile contig-resolution unit tests, and a SLURM-profile dry-run smoke test. +- Documentation updates: `logic.md`, expanded README notes/options/outputs/testing instructions, and a standalone `changelog.md`. + +## Changes since v0.2 + +- Moved HTML summary generation into `scripts/summary_report.py` and simplified `Snakefile`. +- Corrected example MAF inputs so `example_data/*.maf.gz` are valid gzip files. +- Updated split classification so `ALT=`-only records are treated as invariant. +- Updated SINGER clean-output formatting to strip `` while preserving genotype/sample fields. +- Added `logic.md` with detailed site-routing/filtering logic and concrete examples. +- Added `results/split/combined..coverage.txt` to documented workflow outputs. +- Added split-test coverage for invariant/nonref and genotype-preserving clean formatting. +- Updated SLURM profile default resources to numeric values to avoid resource conversion/submission errors. +- Added merge_gvcf_max_jobs pipeline concurrency control + +## Changes since v0.1 + +- Added HTML summary report with embedded SVG histograms and expanded output details. +- Split logic tightened: clean sites now require all samples called; missing GTs are routed to filtered. +- Invariant/filtered/clean outputs are enforced as mutually exclusive per position; filtered BED spans now respect END/REF lengths and subtract inv/clean. +- Merged gVCFs are produced via GATK SelectVariants with genotype calling; TASSEL `outputJustGT` default set to `false` to retain likelihoods for calling. +- Added accessibility mask generation (`combined..accessible.npz`) for scikit‑allel workflows. +- New/expanded validation and tests: split coverage checks, filtered‑bed tests, integration tests gated by `RUN_INTEGRATION=1`. +- Example data regenerated via msprime with indels and missing data and AnchorWave‑style MAF formatting. +- `check_split_coverage.py` now reports overlap intervals with file names to aid debugging. +- `filt_to_bed.py` filters masks to the target contig, preventing cross‑contig lines in `combined..filtered.bed`. +- SLURM default resources now read `default_*` from `config.yaml` instead of hardcoded profile values. diff --git a/config.yaml b/config.yaml index a0798bc..35523ba 100644 --- a/config.yaml +++ b/config.yaml @@ -15,10 +15,10 @@ sample_suffix: _anchorwave fill_gaps: false outputJustGT: false -# Optional: list of contigs to process (one per line). If specified, the workflow will only run on these contigs. Contig names should match those in reference_fasta. -# If omitted (comment out "contigs" line), the workflow will run all contigs but expects reference_fasta.fai to exist. -contigs: - - 1 +# Optional: list of contigs to process (one per line). If specified, the workflow will only run on these contigs. Contig names should match those in reference_fasta. To run this, uncomment lines below, adding one line for each contig you wish to run. +# By default the workflow will run all contigs present in ALL maf files. +#contigs: +# - 1 # Optional ploidy override. If omitted, ploidy is inferred from MAF block structure # as max(non-reference "s" lines per block), with a default of 1. @@ -35,7 +35,7 @@ drop_cutoff: 4000 # Optional: split.py switches. filter_multiallelic: false -bgzip_output: false +bgzip_output: true # Optional: GenomicsDBImport tuning (buffer/segment sizes) genomicsdb_vcf_buffer_size: 10485760 diff --git a/profiles/slurm/config.yaml b/profiles/slurm/config.yaml index c4b545c..df437f7 100644 --- a/profiles/slurm/config.yaml +++ b/profiles/slurm/config.yaml @@ -4,6 +4,7 @@ executor: cluster-generic cluster-generic-submit-cmd: "sbatch --account {config[slurm_account]} --partition {config[slurm_partition]} --cpus-per-task={threads} --mem={resources.mem_mb} --time={resources.time} --output logs/slurm/slurm-%j.out --error logs/slurm/slurm-%j.err" +# Snakemake max concurrent workflow jobs (equivalent to -j), not per-job SLURM CPUs. jobs: 200 cores: 2 use-conda: true diff --git a/scripts/check_split_coverage.py b/scripts/check_split_coverage.py index 59e91a8..a011463 100644 --- a/scripts/check_split_coverage.py +++ b/scripts/check_split_coverage.py @@ -3,9 +3,9 @@ Check that split outputs cover the full contig length. This script sums: - - .clean records (1 bp each) + - .clean.vcf records (1 bp each) - .inv records (1 bp each; END spans are expanded by split.py) - - .filtered.bed intervals (0-based, half-open) + - .clean.mask.bed intervals (0-based, half-open) and compares the total to the contig length from a reference .fai. """ @@ -205,14 +205,22 @@ def main() -> None: args = ap.parse_args() prefix = Path(args.prefix) - clean = prefix.with_suffix(prefix.suffix + ".clean") + clean = prefix.with_suffix(prefix.suffix + ".clean.vcf") inv = prefix.with_suffix(prefix.suffix + ".inv") - filtered_bed = prefix.with_suffix(prefix.suffix + ".filtered.bed") + filtered_bed = prefix.with_suffix(prefix.suffix + ".clean.mask.bed") if clean.with_suffix(clean.suffix + ".gz").exists(): clean = clean.with_suffix(clean.suffix + ".gz") + elif not clean.exists(): + # Backward compatibility for legacy split output naming. + clean = prefix.with_suffix(prefix.suffix + ".clean") + if clean.with_suffix(clean.suffix + ".gz").exists(): + clean = clean.with_suffix(clean.suffix + ".gz") if inv.with_suffix(inv.suffix + ".gz").exists(): inv = inv.with_suffix(inv.suffix + ".gz") + if not filtered_bed.exists(): + # Backward compatibility for legacy mask naming. + filtered_bed = prefix.with_suffix(prefix.suffix + ".filtered.bed") if not clean.exists(): sys.exit(f"ERROR: clean file not found: {clean}") @@ -248,7 +256,7 @@ def main() -> None: "filtered_bed_bp=0\n" "total_bp=0\n" f"chrom_len={chrom_len}\n" - "warning=No records found in clean/inv/filtered.bed; likely absent in all gVCFs.\n", + "warning=No records found in clean/inv/clean.mask.bed; likely absent in all gVCFs.\n", encoding="utf-8", ) return diff --git a/scripts/filt_to_bed.py b/scripts/filt_to_bed.py index 9b71678..4d0e1eb 100755 --- a/scripts/filt_to_bed.py +++ b/scripts/filt_to_bed.py @@ -124,7 +124,7 @@ def read_bed_intervals(path: str, by_chrom: Dict[str, List[Tuple[int, int]]]) -> Load a BED file into a per-chromosome interval map. Assumes 0-based half-open intervals [start, end). """ - with open(path, "rt", encoding="utf-8") as fin: + with open_maybe_gzip(path, "rt") as fin: for raw in fin: if not raw or raw.startswith("#"): continue @@ -218,9 +218,12 @@ def main() -> None: filtered_gz = filtered_path + ".gz" if os.path.isfile(filtered_gz): filtered_path = filtered_gz - out_path = prefix + ".filtered.bed" + out_path = prefix + ".clean.mask.bed" dropped_bed = args.dropped_bed or os.path.join(os.path.dirname(prefix), "cleangVCF", "dropped_indels.bed") missing_bed = prefix + ".missing.bed" + missing_bed_gz = missing_bed + ".gz" + if os.path.isfile(missing_bed_gz): + missing_bed = missing_bed_gz inv_path = prefix + ".inv" inv_gz = inv_path + ".gz" if os.path.isfile(inv_gz): @@ -277,10 +280,16 @@ def main() -> None: # If filtered is empty, try to infer target from other inputs. if target_chrom is None and os.path.isfile(inv_path): target_chrom = first_chrom_from_vcf(inv_path) - clean_path = prefix + ".clean" + clean_path = prefix + ".clean.vcf" clean_gz = clean_path + ".gz" if os.path.isfile(clean_gz): clean_path = clean_gz + elif not os.path.isfile(clean_path): + # Backward compatibility for legacy split outputs. + clean_path = prefix + ".clean" + clean_gz = clean_path + ".gz" + if os.path.isfile(clean_gz): + clean_path = clean_gz if target_chrom is None and os.path.isfile(clean_path): target_chrom = first_chrom_from_vcf(clean_path) if target_chrom is None and os.path.isfile(missing_bed): @@ -340,7 +349,7 @@ def first_chrom_from_vcf(path: str) -> str | None: def first_chrom_from_bed(path: str) -> str | None: - with open(path, "rt", encoding="utf-8") as fin: + with open_maybe_gzip(path, "rt") as fin: for raw in fin: if not raw or raw.startswith("#"): continue diff --git a/scripts/split.py b/scripts/split.py index e546296..576dd0a 100755 --- a/scripts/split.py +++ b/scripts/split.py @@ -7,7 +7,7 @@ Split a gVCF into three mutually exclusive outputs: .inv invariant sites (INFO="." or END=... spans) .filtered sites removed for quality/format reasons - .clean usable variant sites for downstream inference + .clean.vcf usable variant sites for downstream inference The script also emits .missing.bed, a mask of positions absent from the input gVCF (gaps between covered positions). This helps track accessiblity. @@ -238,6 +238,9 @@ def main() -> None: clean_bp = 0 missing_bp = 0 singer_reformat: bool | None = None + sample_names: list[str] = [] + missing_gt_snp_total = 0 + missing_gt_snp_by_sample: dict[str, int] = {} # ---------------- Argument parsing ---------------- @@ -268,6 +271,14 @@ def main() -> None: default=None, help="Reference .fai to fill missing BED gaps at contig ends.", ) + ap.add_argument( + "--missing-gt-stats-out", + default=None, + help=( + "Optional TSV output path for per-sample counts of SNP sites excluded " + "from clean due to missing genotype calls." + ), + ) args = ap.parse_args() in_path = args.vcf @@ -288,8 +299,9 @@ def main() -> None: out_inv = prefix + ".inv" out_filt = prefix + ".filtered" - out_clean = prefix + ".clean" + out_clean = prefix + ".clean.vcf" out_missing = prefix + ".missing.bed" + out_missing_gt_stats = args.missing_gt_stats_out or (prefix + ".missing_gt_snp_by_sample.tsv") bgzip_output = args.bgzip_output out_inv_final = out_inv + (".gz" if bgzip_output else "") @@ -330,6 +342,7 @@ def main() -> None: def flush_group(records: list[dict]) -> None: nonlocal inv_bp, filtered_bp, clean_bp, singer_reformat + nonlocal missing_gt_snp_total, missing_gt_snp_by_sample if not records: return has_inv = any(r["is_inv"] for r in records) @@ -342,6 +355,18 @@ def flush_group(records: list[dict]) -> None: records.clear() return if has_filtered or has_missing_gt: + # Track SNP-like sites that were excluded only because sample GTs were missing. + if has_missing_gt and not has_filtered: + missing_gt_snp_total += 1 + missing_samples: set[str] = set() + for r in records: + for idx in r["missing_gt_sample_idxs"]: + if idx < len(sample_names): + missing_samples.add(sample_names[idx]) + else: + missing_samples.add(f"sample_{idx + 1}") + for sample in missing_samples: + missing_gt_snp_by_sample[sample] = missing_gt_snp_by_sample.get(sample, 0) + 1 for r in records: f_filt.write(r["line"] + "\n") filtered_bp += 1 @@ -386,6 +411,8 @@ def flush_group(records: list[dict]) -> None: f_clean.write(raw) # Buffer meta/header lines until we see the #CHROM header line. elif raw.startswith("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"): + header_cols = raw.rstrip("\n").split("\t") + sample_names = header_cols[9:] if len(header_cols) > 9 else [] # First: inject provenance headers (##...). for ln in prov_lines: f_inv.write(ln) @@ -554,7 +581,11 @@ def flush_group(records: list[dict]) -> None: group_chrom = chrom group_pos = int(cols[1]) gts = [c.split(":", 1)[0] for c in cols[9:]] if len(cols) > 9 else ["."] - has_missing_gt = any(gt == "." for gt in gts) + missing_gt_sample_idxs = [ + i for i, gt in enumerate(gts) + if ("." in gt) or (gt == "") + ] + has_missing_gt = len(missing_gt_sample_idxs) > 0 group.append( { "line": line, @@ -562,6 +593,7 @@ def flush_group(records: list[dict]) -> None: "is_inv": is_inv, "is_filtered": is_filtered, "has_missing_gt": has_missing_gt, + "missing_gt_sample_idxs": missing_gt_sample_idxs, } ) @@ -580,6 +612,13 @@ def flush_group(records: list[dict]) -> None: print(f" missing: {missing_bp:,}",file=sys.stderr) + with open(out_missing_gt_stats, "w", encoding="utf-8") as f_stats: + f_stats.write("sample\texcluded_snp_sites\ttotal_excluded_snp_sites\n") + names = sample_names if sample_names else sorted(missing_gt_snp_by_sample.keys()) + for sample in names: + count = missing_gt_snp_by_sample.get(sample, 0) + f_stats.write(f"{sample}\t{count}\t{missing_gt_snp_total}\n") + if bgzip_output: for path in (out_inv, out_filt, out_clean, out_missing): proc = subprocess.run(["bgzip", "-f", path], check=False) diff --git a/scripts/split_and_mask.sh b/scripts/split_and_mask.sh index 4d4294e..26fd0cc 100644 --- a/scripts/split_and_mask.sh +++ b/scripts/split_and_mask.sh @@ -8,10 +8,10 @@ set -euo pipefail -# One job: split gVCF into .inv/.filtered/.clean and build mask bed. +# One job: split gVCF into .inv/.filtered/.clean.vcf and build mask bed. usage() { echo "Usage: $0 -p -d [--filter-multiallelic] [--no-gzip] [--no-merge]" - echo " -p Prefix for .inv/.filtered/.clean outputs" + echo " -p Prefix for .inv/.filtered/.clean.vcf outputs" echo " -d Depth cutoff passed to split.py" echo " --filter-multiallelic Filter multi-allelic SNPs in split.py" echo " --no-gzip Do not gzip split.py outputs (default)" @@ -76,7 +76,7 @@ if [ "$NO_GZIP" != "true" ]; then fi SPLIT_ARGS+=("$INPUT_VCF") -# Run split.py to produce .inv/.filtered/.clean/.missing.bed. +# Run split.py to produce .inv/.filtered/.clean.vcf/.missing.bed. "${SPLIT_ARGS[@]}" # Build mask bed from filtered + missing + dropped indels. diff --git a/scripts/summary_report.py b/scripts/summary_report.py index f32ef21..82336d2 100644 --- a/scripts/summary_report.py +++ b/scripts/summary_report.py @@ -221,6 +221,86 @@ def y_scale(y_val: int) -> float: return "\n".join(parts) +def _read_missing_gt_stats(paths: list[str]) -> dict[str, int]: + counts: dict[str, int] = {} + for path in paths: + try: + with open(path, "r", encoding="utf-8", errors="ignore") as handle: + header = handle.readline() + if not header: + continue + for line in handle: + if not line.strip(): + continue + parts = line.rstrip("\n").split("\t") + if len(parts) < 2: + continue + sample = parts[0] + try: + value = int(parts[1]) + except ValueError: + continue + counts[sample] = counts.get(sample, 0) + value + except OSError: + continue + return counts + + +def _svg_horizontal_bar_chart(labels: list[str], values: list[int], width: int = 900) -> str: + if not labels or not values or len(labels) != len(values): + return "

No data available.

" + max_label_len = max(len(label) for label in labels) + label_space = min(max(120, max_label_len * 8), 360) + bar_h = 18 + gap = 8 + margin = {"left": label_space + 20, "right": 70, "top": 24, "bottom": 30} + n = len(labels) + height = margin["top"] + margin["bottom"] + n * bar_h + max(n - 1, 0) * gap + plot_w = width - margin["left"] - margin["right"] + max_val = max(values) + if max_val <= 0: + max_val = 1 + + parts = [ + f'', + '', + ] + for idx, (label, value) in enumerate(zip(labels, values)): + y = margin["top"] + idx * (bar_h + gap) + bar_w = int(round((value / max_val) * plot_w)) + parts.append( + f'{html.escape(label)}' + ) + parts.append( + f'' + ) + parts.append( + f'{value:,}' + ) + + x0 = margin["left"] + y0 = height - margin["bottom"] + 2 + parts.append( + f'' + ) + for i in range(5): + frac = i / 4 + x = x0 + frac * plot_w + tick = int(round(frac * max_val)) + parts.append( + f'' + ) + parts.append( + f'{tick:,}' + ) + parts.append("") + return "\n".join(parts) + + report_path = Path(snakemake.output.report) report_path.parent.mkdir(parents=True, exist_ok=True) @@ -233,6 +313,7 @@ def y_scale(y_val: int) -> float: dropped_contigs_not_in_ref = [str(c) for c in snakemake.params.dropped_contigs_not_in_ref] requested_contigs = [str(c) for c in snakemake.params.requested_contigs] remapped_contigs = [(str(a), str(b)) for a, b in snakemake.params.remapped_contigs] +contigs_not_in_all_mafs = [str(c) for c in getattr(snakemake.params, "contigs_not_in_all_mafs", [])] ploidy = int(getattr(snakemake.params, "ploidy", 1)) ploidy_source = str(getattr(snakemake.params, "ploidy_source", "unknown")) ploidy_file_values = { @@ -248,8 +329,21 @@ def y_scale(y_val: int) -> float: log_paths.extend(sorted(Path(".snakemake").rglob("*.log"))) for log_path in log_paths: try: + in_shell_command_block = False with log_path.open("r", encoding="utf-8", errors="ignore") as handle: for line in handle: + # Snakemake logs include literal shell command text. Ignore WARNING + # tokens inside those blocks to avoid false positives from lines + # like: echo "WARNING: ..." + if line.startswith("Shell command:"): + in_shell_command_block = True + continue + if in_shell_command_block: + if line.startswith(" ") or line.startswith("\t") or not line.strip(): + continue + in_shell_command_block = False + if 'echo "WARNING:' in line or "echo 'WARNING:" in line: + continue if "WARNING" in line or "Warning" in line: warnings.append(f"{log_path}: {line.rstrip()}") except OSError: @@ -258,8 +352,10 @@ def y_scale(y_val: int) -> float: try: maf_contigs = _read_maf_contigs(Path(snakemake.params.maf_dir)) ref_contigs = set(_read_fasta_contigs(Path(snakemake.params.orig_ref_fasta))) - missing_in_ref = sorted(set(maf_contigs) - ref_contigs) - missing_in_maf = sorted(ref_contigs - set(maf_contigs)) + remapped_sources = {src for src, _dst in remapped_contigs} + remapped_targets = {dst for _src, dst in remapped_contigs} + missing_in_ref = sorted((set(maf_contigs) - ref_contigs) - remapped_sources) + missing_in_maf = sorted((ref_contigs - set(maf_contigs)) - remapped_targets) if missing_in_ref: warnings.append( "WARNING: MAF contigs not present in reference (showing up to 5): " @@ -295,6 +391,17 @@ def y_scale(y_val: int) -> float: "WARNING: Configured contigs were remapped to renamed-reference contigs: " f"{preview_pairs}{extra}" ) +if contigs_not_in_all_mafs: + preview = ", ".join(contigs_not_in_all_mafs[:20]) + extra = ( + f" (+{len(contigs_not_in_all_mafs) - 20} more)" + if len(contigs_not_in_all_mafs) > 20 + else "" + ) + warnings.append( + "WARNING: Contigs not present in all MAF files were excluded from default " + f"processing: {preview}{extra}" + ) for message in ploidy_warnings: warnings.append(f"WARNING: Ploidy inference: {message}") @@ -358,6 +465,17 @@ def y_scale(y_val: int) -> float: ) handle.write("\n") + if contigs_not_in_all_mafs: + handle.write("

Contigs Not In All MAF Files

\n") + handle.write( + "

These contigs are not present in every MAF and are excluded " + "from default contig selection unless explicitly listed in config.

\n" + ) + handle.write("
    \n") + for contig in contigs_not_in_all_mafs: + handle.write(f"
  • {html.escape(contig)}
  • \n") + handle.write("
\n") + handle.write("

Jobs run

\n") handle.write("
    \n") for job, outputs in jobs: @@ -520,6 +638,21 @@ def y_scale(y_val: int) -> float: ) ) + handle.write("

    SNP Sites Excluded From Clean By MAF File

    \n") + excluded_by_sample = _read_missing_gt_stats([str(p) for p in snakemake.input.missing_gt_stats]) + if excluded_by_sample: + ranked = sorted(excluded_by_sample.items(), key=lambda kv: kv[1], reverse=True) + labels = [name for name, _ in ranked] + values = [value for _, value in ranked] + handle.write( + _svg_horizontal_bar_chart( + labels, + values, + ) + ) + else: + handle.write("

    No missing-genotype SNP exclusions were recorded.

    \n") + handle.write("

    Warnings

    \n") if warnings: handle.write("
      \n") diff --git a/tests/test_check_split_coverage.py b/tests/test_check_split_coverage.py index 1fa2619..5516652 100644 --- a/tests/test_check_split_coverage.py +++ b/tests/test_check_split_coverage.py @@ -12,7 +12,7 @@ def test_check_split_coverage_pass(tmp_path: Path): fai = tmp_path / "ref.fa.fai" fai.write_text("1\t10\t0\t0\t0\n", encoding="utf-8") - (tmp_path / "out.clean").write_text( + (tmp_path / "out.clean.vcf").write_text( "##fileformat=VCFv4.2\n" "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" "1\t1\t.\tA\tC\t.\t.\tDP=1\n" @@ -26,7 +26,7 @@ def test_check_split_coverage_pass(tmp_path: Path): "1\t4\t.\tA\t.\t.\t.\tDP=1\n", encoding="utf-8", ) - (tmp_path / "out.filtered.bed").write_text( + (tmp_path / "out.clean.mask.bed").write_text( "1\t4\t10\n", encoding="utf-8", ) @@ -51,7 +51,7 @@ def test_check_split_coverage_overlap_fails(tmp_path: Path): fai = tmp_path / "ref.fa.fai" fai.write_text("1\t5\t0\t0\t0\n", encoding="utf-8") - (tmp_path / "out.clean").write_text( + (tmp_path / "out.clean.vcf").write_text( "##fileformat=VCFv4.2\n" "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" "1\t1\t.\tA\tC\t.\t.\tDP=1\n", @@ -64,7 +64,7 @@ def test_check_split_coverage_overlap_fails(tmp_path: Path): encoding="utf-8", ) # Overlaps clean at pos 1. - (tmp_path / "out.filtered.bed").write_text( + (tmp_path / "out.clean.mask.bed").write_text( "1\t0\t2\n", encoding="utf-8", ) @@ -90,7 +90,7 @@ def test_check_split_coverage_empty_inputs_warns_not_fails(tmp_path: Path): fai = tmp_path / "ref.fa.fai" fai.write_text("SCAFFOLD_24\t100\t0\t0\t0\n", encoding="utf-8") - (tmp_path / "combined.SCAFFOLD_24.clean").write_text( + (tmp_path / "combined.SCAFFOLD_24.clean.vcf").write_text( "##fileformat=VCFv4.2\n" "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8", @@ -100,7 +100,7 @@ def test_check_split_coverage_empty_inputs_warns_not_fails(tmp_path: Path): "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8", ) - (tmp_path / "combined.SCAFFOLD_24.filtered.bed").write_text("", encoding="utf-8") + (tmp_path / "combined.SCAFFOLD_24.clean.mask.bed").write_text("", encoding="utf-8") _run( [ @@ -116,4 +116,4 @@ def test_check_split_coverage_empty_inputs_warns_not_fails(tmp_path: Path): report = tmp_path / "combined.SCAFFOLD_24.coverage.txt" text = report.read_text(encoding="utf-8") assert "chrom=SCAFFOLD_24" in text - assert "warning=No records found in clean/inv/filtered.bed" in text + assert "warning=No records found in clean/inv/clean.mask.bed" in text diff --git a/tests/test_drop_sv.py b/tests/test_drop_sv.py index 1739eed..97a6afc 100644 --- a/tests/test_drop_sv.py +++ b/tests/test_drop_sv.py @@ -44,6 +44,15 @@ def _write_vcf(path: Path): ) +def _write_snp_only_vcf(path: Path): + path.write_text( + "##fileformat=VCFv4.2\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" + "1\t2\t.\tA\tT\t.\t.\t.\n", + encoding="utf-8", + ) + + def _count_records(path: Path) -> int: count = 0 with subprocess.Popen(_conda_cmd("bcftools", "view", "-H", str(path)), stdout=subprocess.PIPE, text=True) as proc: @@ -85,3 +94,34 @@ def test_drop_sv_filters_large_indel(tmp_path: Path): assert dropped_bed.exists() bed_lines = [l for l in dropped_bed.read_text(encoding="utf-8").splitlines() if l.strip()] assert len(bed_lines) == 1 + + +def test_drop_sv_copies_and_indexes_when_no_large_indels(tmp_path: Path): + _require_conda_tools("bcftools", "bgzip", "tabix") + vcf = tmp_path / "sample.gvcf" + _write_snp_only_vcf(vcf) + + _run(_conda_cmd("bgzip", "-f", str(vcf))) + gvcf = tmp_path / "sample.gvcf.gz" + _run(_conda_cmd("tabix", "-p", "vcf", str(gvcf))) + + _run(_conda_cmd( + "python3", + str(Path("scripts") / "dropSV.py"), + "-d", + str(tmp_path), + "-c", + "1", + )) + + cleaned = tmp_path / "cleangVCF" / "sample.gvcf.gz" + cleaned_tbi = tmp_path / "cleangVCF" / "sample.gvcf.gz.tbi" + assert cleaned.exists() + assert cleaned_tbi.exists() + assert _count_records(cleaned) == 1 + # No filtering path should preserve the compressed payload byte-for-byte via copy. + assert cleaned.read_bytes() == gvcf.read_bytes() + + dropped_bed = tmp_path / "cleangVCF" / "dropped_indels.bed" + assert dropped_bed.exists() + assert dropped_bed.read_text(encoding="utf-8").strip() == "" diff --git a/tests/test_filt_to_bed.py b/tests/test_filt_to_bed.py index 40e89f4..a0216c3 100644 --- a/tests/test_filt_to_bed.py +++ b/tests/test_filt_to_bed.py @@ -1,5 +1,6 @@ import subprocess import sys +import gzip from pathlib import Path @@ -36,7 +37,7 @@ def test_filt_to_bed_uses_ref_len(tmp_path: Path): dropped.write_text("", encoding="utf-8") (tmp_path / "sample.inv").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8") - (tmp_path / "sample.clean").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8") + (tmp_path / "sample.clean.vcf").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8") _run( [ @@ -49,7 +50,7 @@ def test_filt_to_bed_uses_ref_len(tmp_path: Path): cwd=Path.cwd(), ) - out_bed = tmp_path / "sample.filtered.bed" + out_bed = tmp_path / "sample.clean.mask.bed" assert out_bed.exists() # REF length 3 => interval length 3, plus missing interval length 1 @@ -79,7 +80,7 @@ def test_filt_to_bed_end_and_subtraction(tmp_path: Path): "1\t11\t.\tA\t.\t.\t.\tDP=10\tGT\t0\n", encoding="utf-8", ) - (tmp_path / "sample.clean").write_text( + (tmp_path / "sample.clean.vcf").write_text( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" "1\t12\t.\tA\tG\t.\t.\tDP=10\tGT\t0/1\n", encoding="utf-8", @@ -96,8 +97,44 @@ def test_filt_to_bed_end_and_subtraction(tmp_path: Path): cwd=Path.cwd(), ) - out_bed = tmp_path / "sample.filtered.bed" + out_bed = tmp_path / "sample.clean.mask.bed" assert out_bed.exists() # Span 10-12 (3 bp), subtract 11 and 12 => only 1 bp remains. assert _sum_bed(out_bed) == 1 + + +def test_filt_to_bed_accepts_gz_missing_bed(tmp_path: Path): + prefix = tmp_path / "sample" + + filtered = tmp_path / "sample.filtered.gz" + with gzip.open(filtered, "wt", encoding="utf-8") as handle: + handle.write( + "##fileformat=VCFv4.2\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" + "1\t10\t.\tA\tT\t.\t.\t.\n" + ) + + with gzip.open(tmp_path / "sample.missing.bed.gz", "wt", encoding="utf-8") as handle: + handle.write("1\t0\t1\n") + + dropped = tmp_path / "dropped_indels.bed" + dropped.write_text("", encoding="utf-8") + + (tmp_path / "sample.inv").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8") + (tmp_path / "sample.clean.vcf").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8") + + _run( + [ + sys.executable, + str(Path("scripts") / "filt_to_bed.py"), + str(prefix), + "--dropped-bed", + str(dropped), + ], + cwd=Path.cwd(), + ) + + out_bed = tmp_path / "sample.clean.mask.bed" + assert out_bed.exists() + assert _sum_bed(out_bed) == 2 diff --git a/tests/test_integration.py b/tests/test_integration.py index b73bdc2..e6bcf21 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,6 +2,7 @@ import subprocess from pathlib import Path import shutil +import yaml import pytest @@ -55,10 +56,7 @@ def _is_gzip(path: Path) -> bool: return False -def test_snakemake_summary(tmp_path: Path): - _require_integration() - _require_conda_tools("argprep", "snakemake", "bcftools", "tabix", "bgzip", "gatk", "java") - repo = Path.cwd() +def _run_example_summary(repo: Path, tmp_path: Path) -> str: ref = _config_reference_fasta(repo / "config.yaml") if ref is None or not ref.exists(): pytest.skip(f"Missing reference FASTA from config.yaml: {ref}") @@ -100,15 +98,10 @@ def test_snakemake_summary(tmp_path: Path): ], cwd=Path.cwd(), ) + return Path(summary_target).read_text(encoding="utf-8", errors="ignore") -def test_snakemake_contig_mismatch_handling(tmp_path: Path): - _require_integration() - _require_conda_tools( - "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java" - ) - repo = Path.cwd() - +def _run_contig_mismatch_summary(repo: Path, tmp_path: Path) -> str: # Build a reference with one real contig and one contig absent from MAFs. ref_src = repo / "example_data" / "ref.fa" if not ref_src.exists(): @@ -173,8 +166,27 @@ def test_snakemake_contig_mismatch_handling(tmp_path: Path): ], cwd=repo, ) + return Path(summary_target).read_text(encoding="utf-8", errors="ignore") - summary_text = Path(summary_target).read_text(encoding="utf-8", errors="ignore") + +def test_snakemake_summary(tmp_path: Path): + _require_integration() + _require_conda_tools("argprep", "snakemake", "bcftools", "tabix", "bgzip", "gatk", "java") + repo = Path.cwd() + summary_text = _run_example_summary(repo, tmp_path) + assert "Workflow summary" in summary_text + assert "Ploidy" in summary_text + assert "Jobs run" in summary_text + assert "Warnings" in summary_text + + +def test_snakemake_contig_mismatch_handling(tmp_path: Path): + _require_integration() + _require_conda_tools( + "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java" + ) + repo = Path.cwd() + summary_text = _run_contig_mismatch_summary(repo, tmp_path) assert "Configured contigs not present in reference .fai were skipped" in summary_text assert "requested_only_contig" in summary_text assert "Configured contigs were remapped to renamed-reference contigs" in summary_text @@ -185,6 +197,109 @@ def test_snakemake_contig_mismatch_handling(tmp_path: Path): assert "fai_only_contig" in summary_text +def test_snakemake_contig_mismatch_remapped_contigs_not_double_reported(tmp_path: Path): + _require_integration() + _require_conda_tools( + "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java" + ) + summary_text = _run_contig_mismatch_summary(Path.cwd(), tmp_path) + assert "Configured contigs were remapped to renamed-reference contigs" in summary_text + assert "chr1->1" in summary_text + assert "MAF contigs not present in reference (showing up to 5): chr1" not in summary_text + assert "Reference contigs not present in MAFs (showing up to 5): 1" not in summary_text + + +def test_snakemake_slurm_profile_dry_run_smoke(tmp_path: Path): + profile_path = Path.cwd() / "profiles" / "slurm" / "config.yaml" + profile = yaml.safe_load(profile_path.read_text(encoding="utf-8")) + assert profile["executor"] == "cluster-generic" + assert int(profile["jobs"]) >= 1 + submit_cmd = str(profile["cluster-generic-submit-cmd"]) + assert "sbatch" in submit_cmd + assert "{config[slurm_account]}" in submit_cmd + assert "{config[slurm_partition]}" in submit_cmd + assert "{threads}" in submit_cmd + assert "{resources.mem_mb}" in submit_cmd + assert "{resources.time}" in submit_cmd + defaults = profile.get("default-resources") + assert isinstance(defaults, list) + assert any(str(item).startswith("mem_mb=") for item in defaults) + assert any(str(item).startswith("time=") for item in defaults) + + +def test_snakemake_default_contigs_use_maf_intersection(tmp_path: Path): + _require_integration() + _require_conda_tools( + "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java" + ) + repo = Path.cwd() + + ref = tmp_path / "ref.fa" + ref.write_text( + ">1\n" + ("ACGT" * 50) + "\n" + ">2\n" + ("TGCA" * 50) + "\n", + encoding="utf-8", + ) + _run([_conda_exe(), "run", "-n", "argprep", "samtools", "faidx", str(ref)], cwd=repo) + + maf_dir = tmp_path / "maf" + maf_dir.mkdir() + (maf_dir / "s1.maf").write_text( + "##maf version=1\n" + "a score=0\n" + "s 1 0 12 + 200 ACGTACGTACGT\n" + "s s1 0 12 + 200 ACGTACGTACGT\n" + "\n" + "a score=0\n" + "s 2 0 12 + 200 TGCATGCATGCA\n" + "s s1 0 12 + 200 TGCATGCATGCA\n", + encoding="utf-8", + ) + (maf_dir / "s2.maf").write_text( + "##maf version=1\n" + "a score=0\n" + "s 1 0 12 + 200 ACGTACGTACGT\n" + "s s2 0 12 + 200 ACGTACGTACGT\n", + encoding="utf-8", + ) + + gvcf_dir = tmp_path / "gvcf" + results_dir = tmp_path / "results" + summary_target = str(results_dir / "summary.html") + _run( + [ + _conda_exe(), + "run", + "-n", + "argprep", + "env", + "PATH=/opt/anaconda3/envs/argprep/bin:/usr/bin:/bin:/usr/sbin:/sbin", + "HOME=/tmp", + "TMPDIR=/tmp", + "XDG_CACHE_HOME=/tmp", + "SNAKEMAKE_OUTPUT_CACHE=/tmp/snakemake", + "SNAKEMAKE_SOURCE_CACHE=/tmp/snakemake", + "snakemake", + "-j", + "2", + "--config", + f"maf_dir={maf_dir}", + f"reference_fasta={ref}", + f"gvcf_dir={gvcf_dir}", + f"results_dir={results_dir}", + "samples=['s1','s2']", + "--", + summary_target, + ], + cwd=repo, + ) + + summary_text = Path(summary_target).read_text(encoding="utf-8", errors="ignore") + assert "Contigs not present in all MAF files were excluded from default processing" in summary_text + assert "2" in summary_text + assert (results_dir / "combined" / "combined.1.gvcf.gz").exists() + assert not (results_dir / "combined" / "combined.2.gvcf.gz").exists() + + def test_maf_to_gvcf_single_sample(tmp_path: Path): _require_integration() _require_conda_tools("argprep", "java") diff --git a/tests/test_random_positions.py b/tests/test_random_positions.py index 55b7401..f1824f8 100644 --- a/tests/test_random_positions.py +++ b/tests/test_random_positions.py @@ -74,8 +74,8 @@ def test_random_position_membership(): contig = "1" base = Path("results") / "split" / f"combined.{contig}" inv_path = base.with_suffix(base.suffix + ".inv") - clean_path = base.with_suffix(base.suffix + ".clean") - filtered_bed = base.with_suffix(base.suffix + ".filtered.bed") + clean_path = base.with_suffix(base.suffix + ".clean.vcf") + filtered_bed = base.with_suffix(base.suffix + ".clean.mask.bed") missing_bed = base.with_suffix(base.suffix + ".missing.bed") fai_path = Path("results") / "refs" / "reference_gvcf.fa.fai" diff --git a/tests/test_snakefile_contig_resolution.py b/tests/test_snakefile_contig_resolution.py new file mode 100644 index 0000000..8510230 --- /dev/null +++ b/tests/test_snakefile_contig_resolution.py @@ -0,0 +1,76 @@ +import ast +import re +from pathlib import Path + + +def _extract_function_source(text: str, name: str) -> str: + lines = text.splitlines(keepends=True) + start = None + for i, line in enumerate(lines): + if line.startswith(f"def {name}("): + start = i + break + if start is None: + raise AssertionError(f"Function {name} not found in Snakefile") + + end = len(lines) + i = start + paren_balance = 0 + while i < len(lines): + paren_balance += lines[i].count("(") - lines[i].count(")") + if paren_balance <= 0 and lines[i].rstrip().endswith(":"): + i += 1 + break + i += 1 + + for i in range(i, len(lines)): + line = lines[i] + if not line.strip(): + continue + if line.startswith(" ") or line.startswith("\t"): + continue + if line.startswith("#"): + continue + if line.startswith("def "): + end = i + break + end = i + break + return "".join(lines[start:end]) + + +def _load_snakefile_functions(): + snakefile = Path(__file__).resolve().parents[1] / "Snakefile" + text = snakefile.read_text(encoding="utf-8") + snippet = ( + _extract_function_source(text, "_normalize_contig") + + "\n" + + _extract_function_source(text, "_resolve_requested_contigs") + ) + module = ast.parse(snippet, filename=str(snakefile)) + ast.fix_missing_locations(module) + ns = {"re": re} + exec(compile(module, str(snakefile), "exec"), ns) + return ns["_normalize_contig"], ns["_resolve_requested_contigs"] + + +def test_normalize_contig_handles_chr_prefix_and_zero_padding(): + normalize, _ = _load_snakefile_functions() + assert normalize("chr01") == "1" + assert normalize("001") == "1" + assert normalize("chrX") == "x" + assert normalize("chr000") == "0" + + +def test_resolve_requested_contigs_remaps_dedupes_and_drops_ambiguous(): + _, resolve = _load_snakefile_functions() + + kept, dropped, remapped = resolve(["chr1", "01", "2", "chr2", "chr3"], ["1", "2"]) + assert kept == ["1", "2"] + assert dropped == ["chr3"] + assert remapped == [("chr1", "1"), ("01", "1"), ("chr2", "2")] + + kept2, dropped2, remapped2 = resolve(["chr1"], ["1", "chr01"]) + assert kept2 == [] + assert dropped2 == ["chr1"] + assert remapped2 == [] diff --git a/tests/test_split.py b/tests/test_split.py index abc2ad1..870a623 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -60,7 +60,7 @@ def test_singer_keeps_genotypes_and_strips_nonref_alt(tmp_path: Path): gvcf.write_text( "##fileformat=VCFv4.2\n" "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\tS2\tS3\n" - "1\t2\t.\tA\tG,\t.\t.\tDP=30\tGT:AD\t0/1:0,1\t./.:.\t0/0:1,0\n", + "1\t2\t.\tA\tG,\t.\t.\tDP=30\tGT:AD\t0/1:0,1\t0/1:1,1\t0/0:1,0\n", encoding="utf-8", ) @@ -78,13 +78,13 @@ def test_singer_keeps_genotypes_and_strips_nonref_alt(tmp_path: Path): cwd=Path.cwd(), ) - clean = tmp_path / "out.clean" + clean = tmp_path / "out.clean.vcf" line = [l for l in clean.read_text(encoding="utf-8").splitlines() if not l.startswith("#")][0] parts = line.split("\t") # ALT should drop , but genotype/sample fields should be preserved. assert parts[4] == "G" assert parts[8] == "GT:AD" - assert parts[9:] == ["0/1:0,1", "./.:.", "0/0:1,0"] + assert parts[9:] == ["0/1:0,1", "0/1:1,1", "0/0:1,0"] def _count_vcf_records(path: Path) -> int: @@ -156,7 +156,7 @@ def test_split_alt_dot_and_nonref_only_are_invariant(tmp_path: Path): ) inv = tmp_path / "out.inv" - clean = tmp_path / "out.clean" + clean = tmp_path / "out.clean.vcf" inv_records = _count_vcf_records(inv) clean_records = _count_vcf_records(clean) @@ -164,3 +164,37 @@ def test_split_alt_dot_and_nonref_only_are_invariant(tmp_path: Path): # ALT="." and ALT=""-only records are invariant in current logic. assert inv_records == 3 assert clean_records == 0 + + +def test_missing_gt_exclusion_stats_written_per_sample(tmp_path: Path): + gvcf = tmp_path / "in.gvcf" + gvcf.write_text( + "##fileformat=VCFv4.2\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tbad\tgood\n" + "1\t1\t.\tA\tG\t.\t.\tDP=10\tGT\t.\t0/1\n" + "1\t2\t.\tA\tG\t.\t.\tDP=10\tGT\t.\t0/1\n" + "1\t3\t.\tA\tG\t.\t.\tDP=10\tGT\t.\t0/1\n" + "1\t4\t.\tA\tG\t.\t.\tDP=10\tGT\t0/1\t0/1\n", + encoding="utf-8", + ) + + prefix = tmp_path / "out" + proc = subprocess.run( + [ + sys.executable, + str(Path("scripts") / "split.py"), + "--out-prefix", + str(prefix), + str(gvcf), + ], + cwd=Path.cwd(), + capture_output=True, + text=True, + check=True, + ) + + assert "WARNING:" not in proc.stderr + stats = (tmp_path / "out.missing_gt_snp_by_sample.tsv").read_text(encoding="utf-8") + assert "sample\texcluded_snp_sites\ttotal_excluded_snp_sites" in stats + assert "bad\t3\t3" in stats + assert "good\t0\t3" in stats diff --git a/tests/test_summary_report.py b/tests/test_summary_report.py new file mode 100644 index 0000000..f58f34e --- /dev/null +++ b/tests/test_summary_report.py @@ -0,0 +1,105 @@ +import gzip +import runpy +from pathlib import Path +from types import SimpleNamespace + + +def _run_summary_report(tmp_path: Path, monkeypatch) -> str: + repo = Path(__file__).resolve().parents[1] + monkeypatch.chdir(tmp_path) + + # The report scans these locations for warnings. + (tmp_path / "logs").mkdir() + (tmp_path / ".snakemake").mkdir() + (tmp_path / "logs" / "test.log").write_text( + "Shell command:\n" + ' echo "WARNING: from shell command block should be ignored"\n' + 'echo "WARNING: explicit echo line should be ignored"\n' + "WARNING: real log warning should be kept\n", + encoding="utf-8", + ) + + maf_dir = tmp_path / "maf" + maf_dir.mkdir() + (maf_dir / "plain.maf").write_text( + "##maf version=1\n" + "a score=0\n" + "s chr1 0 4 + 4 ACGT\n" + "s sampleA 0 4 + 4 ACGT\n", + encoding="utf-8", + ) + with gzip.open(maf_dir / "extra.maf.gz", "wt", encoding="utf-8") as handle: + handle.write( + "##maf version=1\n" + "a score=0\n" + "s maf_only_contig 0 4 + 4 ACGT\n" + "s sampleB 0 4 + 4 ACGT\n" + ) + + ref_fa = tmp_path / "ref.fa" + ref_fa.write_text(">1\nACGTACGTAC\n>fai_only_contig\nACGT\n", encoding="utf-8") + ref_fai = tmp_path / "ref.fa.fai" + ref_fai.write_text("1\t10\t0\t0\t0\n", encoding="utf-8") + + status_tsv = tmp_path / "split.status.tsv" + status_tsv.write_text( + "sampleToRef\tchr1\tmissing\n" + "sampleToRef\tchr1\tmissing\n" + "sampleToRef\tchr2\tpresent\n", + encoding="utf-8", + ) + + report_path = tmp_path / "results" / "summary.html" + fake_snakemake = SimpleNamespace( + output=SimpleNamespace(report=str(report_path)), + input=SimpleNamespace(beds=[], invs=[], cleans=[], missing_gt_stats=[]), + params=SimpleNamespace( + contigs=["1"], + jobs=[], + temp_paths=[], + arg_outputs=[], + split_prefixes={}, + split_status_files=[str(status_tsv)], + dropped_contigs_not_in_ref=[], + requested_contigs=["chr1"], + remapped_contigs=[("chr1", "1")], + contigs_not_in_all_mafs=[], + ploidy=1, + ploidy_source="test", + ploidy_file_values={}, + ploidy_warnings=[], + maf_dir=str(maf_dir), + orig_ref_fasta=str(ref_fa), + ref_fai=str(ref_fai), + ), + ) + + runpy.run_path( + str(repo / "scripts" / "summary_report.py"), + init_globals={"snakemake": fake_snakemake}, + run_name="__main__", + ) + return report_path.read_text(encoding="utf-8", errors="ignore") + + +def test_summary_report_warning_filters_and_deduplicates(monkeypatch, tmp_path: Path): + html = _run_summary_report(tmp_path, monkeypatch) + + assert "Configured contigs were remapped to renamed-reference contigs" in html + assert "chr1->1" in html + + assert "MAF contigs not present in reference" in html + assert "maf_only_contig" in html + assert "MAF contigs not present in reference (showing up to 5): chr1" not in html + + assert "Reference contigs not present in MAFs" in html + assert "fai_only_contig" in html + assert "Reference contigs not present in MAFs (showing up to 5): 1" not in html + + assert "gVCF sampleToRef.gvcf.gz is missing configured contigs: chr1" in html + assert "gVCF sampleToRef.gvcf.gz is missing configured contigs: chr1, chr1" not in html + + assert "real log warning should be kept" in html + assert "from shell command block should be ignored" not in html + assert "explicit echo line should be ignored" not in html +