diff --git a/.gitignore b/.gitignore
index e741616..84dd975 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 gvcf/*
 results/*
 logs/*
+logs/**
 example_data/*
 .snakemake/
 slurm-*.out
diff --git a/README.md b/README.md
index 0a360ec..f54a080 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,7 @@ The pipeline can be run one of two ways, both from the repo root. **It is recomm
 A default SLURM profile is provided under `profiles/slurm/`. Edit `profiles/slurm/config.yaml` to customize sbatch options if needed.
 Defaults for account/partition and baseline resources are set in `config.yaml` (`slurm_account`, `slurm_partition`, `default_*`).
 SLURM stdout/stderr logs are written to `logs/slurm/` by default.
+The provided profile sets `jobs: 200` (Snakemake's max concurrent workflow jobs, equivalent to `-j 200`), which you can override on the command line if needed.
 
 ```bash
 snakemake --profile profiles/slurm
@@ -55,7 +56,7 @@ snakemake -j 8
 
 Common options:
 
-- `-j <N>`: number of parallel jobs
+- `-j <N>`: number of parallel Snakemake jobs (not per-job threads/CPUs)
 - `--rerun-incomplete`: clean up partial outputs
 - `--printshellcmds`: show executed commands
 - `--notemp`: keep temporary intermediates (see Notes)
@@ -71,21 +72,48 @@ By default the workflow uses these locations (override in `config.yaml`):
 - `results/combined/combined.<contig>.gvcf.gz` : merged gVCF per contig
 - `results/split/combined.<contig>.inv` : invariant sites
 - `results/split/combined.<contig>.filtered` : filtered sites
-- `results/split/combined.<contig>.clean` : clean sites
+- `results/split/combined.<contig>.clean.vcf.gz` : clean sites (bgzip-compressed by default)
 - `results/split/combined.<contig>.missing.bed` : missing positions
-- `results/split/combined.<contig>.filtered.bed` : merged mask bed
+- `results/split/combined.<contig>.clean.mask.bed` : merged mask bed
 - `results/split/combined.<contig>.coverage.txt` : split coverage validation summary
 - `results/split/combined.<contig>.accessible.npz` : boolean accessibility array (union of clean + invariant sites), for scikit-allel statistics
 - `results/summary.html` : HTML summary of jobs run, outputs created, and warnings
+  - Contig remaps (e.g., configured `chr1` remapped to reference contig `1`) are reported under the remap warning and are not repeated in the generic MAF-vs-reference mismatch warnings.
+
+## Testing
+
+Run from the repo root:
+
+```bash
+pytest -q
+```
+
+- Runs unit/fast tests by default.
+- Some integration tests are gated and skipped unless enabled.
+
+To run integration tests too:
+
+```bash
+RUN_INTEGRATION=1 pytest -q
+```
+
+- Requires the `argprep` conda environment and external tools used by the workflow (e.g., `snakemake`, `bcftools`, `tabix`, `bgzip`, `gatk`, `java`; some tests also need `samtools`).
+- These tests take longer and may run parts of the Snakemake workflow.
+
+For a quick SLURM-profile syntax/config sanity check without submitting jobs, use a dry run:
+
+```bash
+snakemake --profile profiles/slurm -n
+```
 
 ## Notes
 
-- If `bgzip_output: true`, the `.inv`, `.filtered`, `.clean`, and `.missing.bed` files will have a `.gz` suffix.
+- `bgzip_output` defaults to `true`; by default the `.inv`, `.filtered`, `.clean.vcf`, and `.missing.bed` files are bgzip-compressed (`.gz` suffix).
 - All gzipped outputs in this pipeline use bgzip (required for `tabix`).
 - `scripts/dropSV.py` removes indels larger than `drop_cutoff` (if set in `config.yaml`).
 - `scripts/split.py` supports `--filter-multiallelic` and `--bgzip-output` (toggle via `config.yaml`).
 - `scripts/filt_to_bed.py` merges `<prefix>.filtered`, `<prefix>.missing.bed`, and `dropped_indels.bed` into a final mask bed.
-- `make_accessibility` builds a per-contig accessibility array from the union of `combined.<contig>.clean` and `combined.<contig>.inv` using the reference `.fai` to size the array. The output is a compressed NumPy archive containing a boolean array named `mask`, intended for scikit-allel statistics.
+- `make_accessibility` builds a per-contig accessibility array from the union of `combined.<contig>.clean.vcf.gz` and `combined.<contig>.inv.gz` (default names) using the reference `.fai` to size the array. The output is a compressed NumPy archive containing a boolean array named `mask`, intended for scikit-allel statistics.
 - Ploidy is inferred from MAF block structure by default (max non-reference `s` lines per block, typically `1` for pairwise MAFs). You can override with `ploidy` in `config.yaml`.
 - Optional: enable `vt_normalize: true` in `config.yaml` to normalize merged gVCFs with `vt normalize` after `SelectVariants`.
 - If GenomicsDBImport fails with a buffer-size error, increase `genomicsdb_vcf_buffer_size` and `genomicsdb_segment_size` in `config.yaml` (set them above your longest gVCF line length).
@@ -104,7 +132,7 @@ By default the workflow uses these locations (override in `config.yaml`):
 - New/expanded validation and tests: split coverage checks, filtered‑bed tests, integration tests gated by `RUN_INTEGRATION=1`.
 - Example data regenerated via msprime with indels and missing data and AnchorWave‑style MAF formatting.
 - `check_split_coverage.py` now reports overlap intervals with file names to aid debugging.
-- `filt_to_bed.py` filters masks to the target contig, preventing cross‑contig lines in `combined.<contig>.filtered.bed`.
+- `filt_to_bed.py` filters masks to the target contig, preventing cross‑contig lines in `combined.<contig>.clean.mask.bed`.
 - SLURM default resources now read `default_*` from `config.yaml` instead of hardcoded profile values.
 
 ## Changes since v0.2
@@ -123,11 +151,11 @@ By default the workflow uses these locations (override in `config.yaml`):
 
 ### ARG estimation
 
-Use Nate Pope's SINGER Snakemake [pipeline](https://github.com/nspope/singer-snakemake) with `combined.<contig>.clean` and `combined.<contig>.filtered.bed` as inputs.
+Use Nate Pope's SINGER Snakemake [pipeline](https://github.com/nspope/singer-snakemake) with `combined.<contig>.clean.vcf.gz` and `combined.<contig>.clean.mask.bed` as inputs.
 
 ### Population genetic statistics 
 
-If you use scikit-allel, you can use the `combined.<contig>.clean` VCF and load the accessibility mask like this:
+If you use scikit-allel, you can use the `combined.<contig>.clean.vcf.gz` VCF and load the accessibility mask like this:
 
 ```python
 import numpy as np
diff --git a/Snakefile b/Snakefile
index bdaef9a..5dc0174 100644
--- a/Snakefile
+++ b/Snakefile
@@ -24,7 +24,7 @@ FILL_GAPS = str(config.get("fill_gaps", "false")).lower()
 OUTPUT_JUST_GT = bool(config.get("outputJustGT", False))
 DROP_CUTOFF = config.get("drop_cutoff", "")
 FILTER_MULTIALLELIC = bool(config.get("filter_multiallelic", False))
-BGZIP_OUTPUT = bool(config.get("bgzip_output", False))
+BGZIP_OUTPUT = bool(config.get("bgzip_output", True))
 GENOMICSDB_VCF_BUFFER_SIZE = int(config.get("genomicsdb_vcf_buffer_size", 1048576))
 GENOMICSDB_SEGMENT_SIZE = int(config.get("genomicsdb_segment_size", 1048576))
 MAF_TO_GVCF_THREADS = int(config.get("maf_to_gvcf_threads", 2))
@@ -39,7 +39,6 @@ MERGE_CONTIG_TIME = str(config.get("merge_contig_time", config.get("default_time
 DEFAULT_JAVA_MEM_MB = max(256, int(DEFAULT_MEM_MB * 0.9))
 VT_NORMALIZE = bool(config.get("vt_normalize", False))
 VT_PATH = str(config.get("vt_path", "vt"))
-MERGED_GENOTYPER = "selectvariants"
 
 workflow.global_resources["merge_gvcf_jobs"] = int(config.get("merge_gvcf_max_jobs", 4))
 
@@ -68,24 +67,49 @@ def _normalize_contig(name: str) -> str:
     return name if name else "0"
 
 
-def _read_maf_contigs() -> set[str]:
+def _maf_path_for_sample(sample: str) -> Path:
+    maf = MAF_DIR / f"{sample}.maf"
+    maf_gz = MAF_DIR / f"{sample}.maf.gz"
+    if maf.exists():
+        return maf
+    if maf_gz.exists():
+        return maf_gz
+    return maf
+
+
+def _read_maf_contigs(maf_path: Path) -> set[str]:
     contigs = set()
-    maf_files = list(MAF_DIR.glob("*.maf")) + list(MAF_DIR.glob("*.maf.gz"))
-    for maf in sorted(maf_files):
-        try:
-            if maf.name.endswith(".gz"):
-                handle = gzip.open(maf, "rt", encoding="utf-8")
-            else:
-                handle = maf.open("r", encoding="utf-8")
-            with handle:
-                for line in handle:
-                    if not line or line.startswith("#"):
-                        continue
-                    parts = line.split()
-                    if parts and parts[0] == "s" and len(parts) >= 2:
-                        contigs.add(parts[1])
-        except OSError:
-            continue
+    try:
+        if maf_path.name.endswith(".gz"):
+            handle = gzip.open(maf_path, "rt", encoding="utf-8")
+        else:
+            handle = maf_path.open("r", encoding="utf-8")
+        with handle:
+            first_src = None
+            for line in handle:
+                if not line or line.startswith("#"):
+                    continue
+                stripped = line.strip()
+                if not stripped:
+                    if first_src is not None:
+                        contigs.add(first_src)
+                        first_src = None
+                    continue
+                parts = stripped.split()
+                if not parts:
+                    continue
+                if parts[0] == "a":
+                    if first_src is not None:
+                        contigs.add(first_src)
+                    first_src = None
+                    continue
+                if parts[0] == "s" and len(parts) >= 2 and first_src is None:
+                    # Use first sequence src in each block as the reference-side contig.
+                    first_src = parts[1]
+            if first_src is not None:
+                contigs.add(first_src)
+    except OSError:
+        return set()
     return contigs
 
 
@@ -172,6 +196,23 @@ def _discover_samples():
     return sorted(samples)
 
 
+def _read_maf_contig_sets(samples: list[str]) -> dict[str, set[str]]:
+    contigs_by_sample: dict[str, set[str]] = {}
+    for sample in samples:
+        maf_path = _maf_path_for_sample(sample)
+        contigs_by_sample[sample] = _read_maf_contigs(maf_path)
+    return contigs_by_sample
+
+
+def _contigs_not_in_all_mafs(contigs_by_sample: dict[str, set[str]]) -> list[str]:
+    if not contigs_by_sample:
+        return []
+    all_sets = list(contigs_by_sample.values())
+    union = set().union(*all_sets) if all_sets else set()
+    intersection = set.intersection(*all_sets) if all_sets else set()
+    return sorted(union - intersection)
+
+
 def _infer_ploidy_from_maf(maf_path: Path, max_blocks: int = 10000) -> int:
     opener = gzip.open if maf_path.suffix == ".gz" else open
     max_non_ref = 0
@@ -282,34 +323,14 @@ def _resolve_requested_contigs(
     return kept, dropped, remapped
 
 
-def _read_contigs():
-    orig_fai = REF_FASTA.with_suffix(REF_FASTA.suffix + ".fai")
-    if not orig_fai.exists():
-        raise ValueError(
-            "Reference FASTA index (.fai) not found. "
-            "Either run 'samtools faidx' on the reference or set 'contigs' in config.yaml."
-        )
-    target_fai = Path(REF_FAI)
-    if target_fai.exists():
-        ref_contigs = _read_fai_contigs(target_fai)
-    else:
-        ref_contigs = _read_fai_contigs(orig_fai)
-
-    if "contigs" in config:
-        requested = [str(c) for c in config["contigs"]]
-        kept, dropped, remapped = _resolve_requested_contigs(requested, ref_contigs)
-        if not kept:
-            raise ValueError(
-                "None of the configured contigs are present in reference .fai: "
-                + ", ".join(requested[:10])
-            )
-        return kept, dropped, requested, remapped
-    return ref_contigs, [], list(ref_contigs), []
-
-
 SAMPLES = _discover_samples()
 PLOIDY, PLOIDY_SOURCE, PLOIDY_FILE_VALUES, PLOIDY_WARNINGS = _resolve_ploidy(SAMPLES)
-CONTIGS, DROPPED_CONTIGS_NOT_IN_REF, REQUESTED_CONTIGS, REMAPPED_CONTIGS = _read_contigs()
+MAF_CONTIGS_BY_SAMPLE = _read_maf_contig_sets(SAMPLES)
+if MAF_CONTIGS_BY_SAMPLE:
+    MAF_CONTIG_INTERSECTION = sorted(set.intersection(*MAF_CONTIGS_BY_SAMPLE.values()))
+else:
+    MAF_CONTIG_INTERSECTION = []
+CONTIGS_NOT_IN_ALL_MAFS = _contigs_not_in_all_mafs(MAF_CONTIGS_BY_SAMPLE)
 
 GVCF_BASES = [f"{sample}To{REF_BASE}" for sample in SAMPLES]
 
@@ -330,7 +351,20 @@ def _active_contig_resolution() -> tuple[list[str], list[str], list[str], list[t
                 + ", ".join(requested[:10])
             )
         return kept, dropped, requested, remapped
-    return available, [], list(available), []
+
+    requested = list(MAF_CONTIG_INTERSECTION)
+    if not requested:
+        raise ValueError(
+            "No contigs are shared across all MAF files. "
+            "Set explicit 'contigs' in config.yaml to override."
+        )
+    kept, dropped, remapped = _resolve_requested_contigs(requested, available)
+    if not kept:
+        raise ValueError(
+            "No shared MAF contigs are present in renamed reference .fai. "
+            "Set explicit 'contigs' in config.yaml to override."
+        )
+    return kept, dropped, requested, remapped
 
 
 def _active_contigs() -> list[str]:
@@ -341,7 +375,7 @@ def _all_targets(_wc):
     contigs = _active_contigs()
     return (
         [str(_combined_out(c)) for c in contigs]
-        + [str(_split_prefix(c)) + ".filtered.bed" for c in contigs]
+        + [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in contigs]
         + [str(_split_prefix(c)) + ".coverage.txt" for c in contigs]
         + [str(_accessibility_out(c)) for c in contigs]
         + [str(RESULTS_DIR / "summary.html")]
@@ -388,6 +422,8 @@ def _accessibility_out(contig):
 
 
 SPLIT_SUFFIX = ".gz" if BGZIP_OUTPUT else ""
+CLEAN_VCF_SUFFIX = ".clean.vcf" + SPLIT_SUFFIX
+MASK_BED_SUFFIX = ".clean.mask.bed"
 
 rule all:
     # Final targets: merged gVCFs plus filtered bed masks per contig.
@@ -460,11 +496,17 @@ def _summary_jobs() -> list[tuple[str, list[str]]]:
                 [
                     str(_split_prefix(c)) + suffix
                     for c in contigs
-                    for suffix in (".inv", ".filtered", ".clean", ".missing.bed")
+                    for suffix in (
+                        ".inv" + SPLIT_SUFFIX,
+                        ".filtered" + SPLIT_SUFFIX,
+                        CLEAN_VCF_SUFFIX,
+                        ".missing.bed" + SPLIT_SUFFIX,
+                        ".missing_gt_snp_by_sample.tsv",
+                    )
                 ],
             ),
             ("check_split_coverage", [str(_split_prefix(c)) + ".coverage.txt" for c in contigs]),
-            ("mask_bed", [str(_split_prefix(c)) + ".filtered.bed" for c in contigs]),
+            ("mask_bed", [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in contigs]),
             ("make_accessibility", [str(_accessibility_out(c)) for c in contigs]),
         ]
     )
@@ -491,8 +533,8 @@ def _summary_temp_paths() -> set[str]:
 def _summary_arg_outputs() -> list[str]:
     contigs = _active_contigs()
     return (
-        [str(_split_prefix(c)) + ".clean" for c in contigs]
-        + [str(_split_prefix(c)) + ".filtered.bed" for c in contigs]
+        [str(_split_prefix(c)) + CLEAN_VCF_SUFFIX for c in contigs]
+        + [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in contigs]
         + [str(_accessibility_out(c)) for c in contigs]
     )
 
@@ -501,10 +543,11 @@ rule summary_report:
     # Write an HTML summary of jobs, outputs, and warnings.
     input:
         combined=lambda wc: [str(_combined_out(c)) for c in _active_contigs()],
-        beds=lambda wc: [str(_split_prefix(c)) + ".filtered.bed" for c in _active_contigs()],
+        beds=lambda wc: [str(_split_prefix(c)) + MASK_BED_SUFFIX for c in _active_contigs()],
         invs=lambda wc: [str(_split_prefix(c)) + ".inv" + SPLIT_SUFFIX for c in _active_contigs()],
         filts=lambda wc: [str(_split_prefix(c)) + ".filtered" + SPLIT_SUFFIX for c in _active_contigs()],
-        cleans=lambda wc: [str(_split_prefix(c)) + ".clean" + SPLIT_SUFFIX for c in _active_contigs()],
+        cleans=lambda wc: [str(_split_prefix(c)) + CLEAN_VCF_SUFFIX for c in _active_contigs()],
+        missing_gt_stats=lambda wc: [str(_split_prefix(c)) + ".missing_gt_snp_by_sample.tsv" for c in _active_contigs()],
         dropped=str(GVCF_DIR / "cleangVCF" / "dropped_indels.bed"),
     output:
         report=str(RESULTS_DIR / "summary.html"),
@@ -526,6 +569,7 @@ rule summary_report:
         ploidy_source=PLOIDY_SOURCE,
         ploidy_file_values=PLOIDY_FILE_VALUES,
         ploidy_warnings=PLOIDY_WARNINGS,
+        contigs_not_in_all_mafs=CONTIGS_NOT_IN_ALL_MAFS,
         dropped_contigs_not_in_ref=lambda wc: _active_contig_resolution()[1],
         requested_contigs=lambda wc: _active_contig_resolution()[2],
         remapped_contigs=lambda wc: _active_contig_resolution()[3],
@@ -790,8 +834,9 @@ rule split_gvcf:
     output:
         inv=str(RESULTS_DIR / "split" / ("combined.{contig}.inv" + SPLIT_SUFFIX)),
         filt=str(RESULTS_DIR / "split" / ("combined.{contig}.filtered" + SPLIT_SUFFIX)),
-        clean=str(RESULTS_DIR / "split" / ("combined.{contig}.clean" + SPLIT_SUFFIX)),
+        clean=str(RESULTS_DIR / "split" / ("combined.{contig}.clean.vcf" + SPLIT_SUFFIX)),
         missing=str(RESULTS_DIR / "split" / ("combined.{contig}.missing.bed" + SPLIT_SUFFIX)),
+        missing_gt_stats=str(RESULTS_DIR / "split" / "combined.{contig}.missing_gt_snp_by_sample.tsv"),
     params:
         filter_multiallelic=FILTER_MULTIALLELIC,
         bgzip_output=BGZIP_OUTPUT,
@@ -807,6 +852,7 @@ rule split_gvcf:
         if [ "{params.bgzip_output}" = "True" ]; then
           cmd+=(--bgzip-output)
         fi
+        cmd+=(--missing-gt-stats-out "{output.missing_gt_stats}")
         cmd+=("{input.gvcf}")
         "${{cmd[@]}}"
         """
@@ -815,9 +861,9 @@ rule split_gvcf:
 rule check_split_coverage:
     # Validate that clean + inv + filtered bed sum to contig length.
     input:
-        clean=lambda wc: str(_split_prefix(wc.contig)) + ".clean" + SPLIT_SUFFIX,
+        clean=lambda wc: str(_split_prefix(wc.contig)) + CLEAN_VCF_SUFFIX,
         inv=lambda wc: str(_split_prefix(wc.contig)) + ".inv" + SPLIT_SUFFIX,
-        bed=lambda wc: str(_split_prefix(wc.contig)) + ".filtered.bed",
+        bed=lambda wc: str(_split_prefix(wc.contig)) + MASK_BED_SUFFIX,
         fai=REF_FAI,
     output:
         report=str(RESULTS_DIR / "split" / "combined.{contig}.coverage.txt"),
@@ -839,7 +885,7 @@ rule mask_bed:
         missing=lambda wc: str(_split_prefix(wc.contig)) + ".missing.bed" + SPLIT_SUFFIX,
         dropped=str(GVCF_DIR / "cleangVCF" / "dropped_indels.bed"),
     output:
-        bed=str(RESULTS_DIR / "split" / "combined.{contig}.filtered.bed"),
+        bed=str(RESULTS_DIR / "split" / "combined.{contig}.clean.mask.bed"),
     params:
         prefix=lambda wc: str(_split_prefix(wc.contig)),
     shell:
@@ -854,7 +900,7 @@ rule mask_bed:
 rule make_accessibility:
     # Build boolean accessibility array from clean + inv VCFs per contig.
     input:
-        clean=lambda wc: str(_split_prefix(wc.contig)) + ".clean" + SPLIT_SUFFIX,
+        clean=lambda wc: str(_split_prefix(wc.contig)) + CLEAN_VCF_SUFFIX,
         inv=lambda wc: str(_split_prefix(wc.contig)) + ".inv" + SPLIT_SUFFIX,
         fai=REF_FAI,
     output:
diff --git a/changelog.md b/changelog.md
new file mode 100644
index 0000000..8db0191
--- /dev/null
+++ b/changelog.md
@@ -0,0 +1,38 @@
+## Changes since v0.3
+
+- Added/expanded contig handling across the workflow: reference renaming from gVCF/MAF contigs, numeric-suffix normalization, configured-contig remapping (e.g. `chr1` -> `1`), filtering contigs to the reference `.fai`, and defaulting to the shared MAF-contig intersection when `contigs:` is not set.
+- Improved missing-contig behavior and reporting: split/merge steps now skip missing per-sample contigs more safely, preserve valid placeholders, and surface clearer warnings in the HTML summary.
+- Expanded HTML summary reporting with stronger warning parsing, MAF-vs-reference contig mismatch warnings, ploidy reporting, and a missing-genotype exclusion histogram.
+- Fixed summary warning de-duplication so contigs reported in "Configured contigs were remapped..." are not also repeated in generic MAF/reference mismatch warnings.
+- Improved split/filter/mask robustness: refined split classification and filtered BED span handling, gzip/bgzip split-output support, gzipped missing-BED input support, and better empty-coverage handling in `check_split_coverage.py`.
+- Added accessibility mask outputs (`combined.<contig>.accessible.npz`) for scikit-allel workflows and documented downstream use.
+- Added ploidy inference from MAF blocks (with summary reporting) while retaining config override support.
+- Added and refined resource/concurrency controls: configurable `maf_to_gvcf_*` and `merge_contig_*` resources, `merge_gvcf_max_jobs`, `maf_to_gvcf_array_max_jobs`, GenomicsDB buffer tuning, and `default_mem_mb` fallback behavior.
+- Improved SLURM profile/runtime behavior: numeric default resource values in the profile, SLURM stdout/stderr written to `logs/slurm/`, and clearer documentation of profile `jobs` (Snakemake parallelism) vs per-job CPUs.
+- Added substantial test coverage: pytest scaffolding, split/filt/dropSV unit tests, coverage checks, integration tests (including contig mismatch/default-contig behavior), random-position validation, summary-report regressions, Snakefile contig-resolution unit tests, and a SLURM-profile dry-run smoke test.
+- Documentation updates: `logic.md`, expanded README notes/options/outputs/testing instructions, and a standalone `changelog.md`.
+
+## Changes since v0.2
+
+- Moved HTML summary generation into `scripts/summary_report.py` and simplified `Snakefile`.
+- Corrected example MAF inputs so `example_data/*.maf.gz` are valid gzip files.
+- Updated split classification so `ALT=<NON_REF>`-only records are treated as invariant.
+- Updated SINGER clean-output formatting to strip `<NON_REF>` while preserving genotype/sample fields.
+- Added `logic.md` with detailed site-routing/filtering logic and concrete examples.
+- Added `results/split/combined.<contig>.coverage.txt` to documented workflow outputs.
+- Added split-test coverage for invariant/nonref and genotype-preserving clean formatting.
+- Updated SLURM profile default resources to numeric values to avoid resource conversion/submission errors.
+- Added merge_gvcf_max_jobs pipeline concurrency control
+
+## Changes since v0.1
+
+- Added HTML summary report with embedded SVG histograms and expanded output details.
+- Split logic tightened: clean sites now require all samples called; missing GTs are routed to filtered.
+- Invariant/filtered/clean outputs are enforced as mutually exclusive per position; filtered BED spans now respect END/REF lengths and subtract inv/clean.
+- Merged gVCFs are produced via GATK SelectVariants with genotype calling; TASSEL `outputJustGT` default set to `false` to retain likelihoods for calling.
+- Added accessibility mask generation (`combined.<contig>.accessible.npz`) for scikit‑allel workflows.
+- New/expanded validation and tests: split coverage checks, filtered‑bed tests, integration tests gated by `RUN_INTEGRATION=1`.
+- Example data regenerated via msprime with indels and missing data and AnchorWave‑style MAF formatting.
+- `check_split_coverage.py` now reports overlap intervals with file names to aid debugging.
+- `filt_to_bed.py` filters masks to the target contig, preventing cross‑contig lines in `combined.<contig>.filtered.bed`.
+- SLURM default resources now read `default_*` from `config.yaml` instead of hardcoded profile values.
diff --git a/config.yaml b/config.yaml
index a0798bc..35523ba 100644
--- a/config.yaml
+++ b/config.yaml
@@ -15,10 +15,10 @@ sample_suffix: _anchorwave
 fill_gaps: false
 outputJustGT: false
 
-# Optional: list of contigs to process (one per line). If specified, the workflow will only run on these contigs. Contig names should match those in reference_fasta.
-# If omitted (comment out "contigs" line), the workflow will run all contigs but expects reference_fasta.fai to exist.
-contigs:
-  - 1
+# Optional: list of contigs to process (one per line). If specified, the workflow will only run on these contigs. Contig names should match those in reference_fasta. To run this, uncomment lines below, adding one line for each contig you wish to run.
+# By default the workflow will run all contigs present in ALL maf files.
+#contigs:
+# - 1
 
 # Optional ploidy override. If omitted, ploidy is inferred from MAF block structure
 # as max(non-reference "s" lines per block), with a default of 1.
@@ -35,7 +35,7 @@ drop_cutoff: 4000
 
 # Optional: split.py switches. 
 filter_multiallelic: false
-bgzip_output: false
+bgzip_output: true
 
 # Optional: GenomicsDBImport tuning (buffer/segment sizes)
 genomicsdb_vcf_buffer_size: 10485760
diff --git a/profiles/slurm/config.yaml b/profiles/slurm/config.yaml
index c4b545c..df437f7 100644
--- a/profiles/slurm/config.yaml
+++ b/profiles/slurm/config.yaml
@@ -4,6 +4,7 @@
 executor: cluster-generic
 cluster-generic-submit-cmd: "sbatch --account {config[slurm_account]} --partition {config[slurm_partition]} --cpus-per-task={threads} --mem={resources.mem_mb} --time={resources.time} --output logs/slurm/slurm-%j.out --error logs/slurm/slurm-%j.err"
 
+# Snakemake max concurrent workflow jobs (equivalent to -j), not per-job SLURM CPUs.
 jobs: 200
 cores: 2
 use-conda: true
diff --git a/scripts/check_split_coverage.py b/scripts/check_split_coverage.py
index 59e91a8..a011463 100644
--- a/scripts/check_split_coverage.py
+++ b/scripts/check_split_coverage.py
@@ -3,9 +3,9 @@
 Check that split outputs cover the full contig length.
 
 This script sums:
-  - .clean records (1 bp each)
+  - .clean.vcf records (1 bp each)
   - .inv records (1 bp each; END spans are expanded by split.py)
-  - .filtered.bed intervals (0-based, half-open)
+  - .clean.mask.bed intervals (0-based, half-open)
 
 and compares the total to the contig length from a reference .fai.
 """
@@ -205,14 +205,22 @@ def main() -> None:
     args = ap.parse_args()
 
     prefix = Path(args.prefix)
-    clean = prefix.with_suffix(prefix.suffix + ".clean")
+    clean = prefix.with_suffix(prefix.suffix + ".clean.vcf")
     inv = prefix.with_suffix(prefix.suffix + ".inv")
-    filtered_bed = prefix.with_suffix(prefix.suffix + ".filtered.bed")
+    filtered_bed = prefix.with_suffix(prefix.suffix + ".clean.mask.bed")
 
     if clean.with_suffix(clean.suffix + ".gz").exists():
         clean = clean.with_suffix(clean.suffix + ".gz")
+    elif not clean.exists():
+        # Backward compatibility for legacy split output naming.
+        clean = prefix.with_suffix(prefix.suffix + ".clean")
+        if clean.with_suffix(clean.suffix + ".gz").exists():
+            clean = clean.with_suffix(clean.suffix + ".gz")
     if inv.with_suffix(inv.suffix + ".gz").exists():
         inv = inv.with_suffix(inv.suffix + ".gz")
+    if not filtered_bed.exists():
+        # Backward compatibility for legacy mask naming.
+        filtered_bed = prefix.with_suffix(prefix.suffix + ".filtered.bed")
 
     if not clean.exists():
         sys.exit(f"ERROR: clean file not found: {clean}")
@@ -248,7 +256,7 @@ def main() -> None:
             "filtered_bed_bp=0\n"
             "total_bp=0\n"
             f"chrom_len={chrom_len}\n"
-            "warning=No records found in clean/inv/filtered.bed; likely absent in all gVCFs.\n",
+            "warning=No records found in clean/inv/clean.mask.bed; likely absent in all gVCFs.\n",
             encoding="utf-8",
         )
         return
diff --git a/scripts/filt_to_bed.py b/scripts/filt_to_bed.py
index 9b71678..4d0e1eb 100755
--- a/scripts/filt_to_bed.py
+++ b/scripts/filt_to_bed.py
@@ -124,7 +124,7 @@ def read_bed_intervals(path: str, by_chrom: Dict[str, List[Tuple[int, int]]]) ->
     Load a BED file into a per-chromosome interval map.
     Assumes 0-based half-open intervals [start, end).
     """
-    with open(path, "rt", encoding="utf-8") as fin:
+    with open_maybe_gzip(path, "rt") as fin:
         for raw in fin:
             if not raw or raw.startswith("#"):
                 continue
@@ -218,9 +218,12 @@ def main() -> None:
     filtered_gz = filtered_path + ".gz"
     if os.path.isfile(filtered_gz):
         filtered_path = filtered_gz
-    out_path = prefix + ".filtered.bed"
+    out_path = prefix + ".clean.mask.bed"
     dropped_bed = args.dropped_bed or os.path.join(os.path.dirname(prefix), "cleangVCF", "dropped_indels.bed")
     missing_bed = prefix + ".missing.bed"
+    missing_bed_gz = missing_bed + ".gz"
+    if os.path.isfile(missing_bed_gz):
+        missing_bed = missing_bed_gz
     inv_path = prefix + ".inv"
     inv_gz = inv_path + ".gz"
     if os.path.isfile(inv_gz):
@@ -277,10 +280,16 @@ def main() -> None:
     # If filtered is empty, try to infer target from other inputs.
     if target_chrom is None and os.path.isfile(inv_path):
         target_chrom = first_chrom_from_vcf(inv_path)
-    clean_path = prefix + ".clean"
+    clean_path = prefix + ".clean.vcf"
     clean_gz = clean_path + ".gz"
     if os.path.isfile(clean_gz):
         clean_path = clean_gz
+    elif not os.path.isfile(clean_path):
+        # Backward compatibility for legacy split outputs.
+        clean_path = prefix + ".clean"
+        clean_gz = clean_path + ".gz"
+        if os.path.isfile(clean_gz):
+            clean_path = clean_gz
     if target_chrom is None and os.path.isfile(clean_path):
         target_chrom = first_chrom_from_vcf(clean_path)
     if target_chrom is None and os.path.isfile(missing_bed):
@@ -340,7 +349,7 @@ def first_chrom_from_vcf(path: str) -> str | None:
 
 
 def first_chrom_from_bed(path: str) -> str | None:
-    with open(path, "rt", encoding="utf-8") as fin:
+    with open_maybe_gzip(path, "rt") as fin:
         for raw in fin:
             if not raw or raw.startswith("#"):
                 continue
diff --git a/scripts/split.py b/scripts/split.py
index e546296..576dd0a 100755
--- a/scripts/split.py
+++ b/scripts/split.py
@@ -7,7 +7,7 @@
 Split a gVCF into three mutually exclusive outputs:
   <prefix>.inv       invariant sites (INFO="." or END=... spans)
   <prefix>.filtered  sites removed for quality/format reasons
-  <prefix>.clean     usable variant sites for downstream inference
+  <prefix>.clean.vcf usable variant sites for downstream inference
 
 The script also emits <prefix>.missing.bed, a mask of positions absent from
 the input gVCF (gaps between covered positions). This helps track accessiblity.
@@ -238,6 +238,9 @@ def main() -> None:
     clean_bp = 0
     missing_bp = 0
     singer_reformat: bool | None = None
+    sample_names: list[str] = []
+    missing_gt_snp_total = 0
+    missing_gt_snp_by_sample: dict[str, int] = {}
 
 
     # ---------------- Argument parsing ----------------
@@ -268,6 +271,14 @@ def main() -> None:
         default=None,
         help="Reference .fai to fill missing BED gaps at contig ends.",
     )
+    ap.add_argument(
+        "--missing-gt-stats-out",
+        default=None,
+        help=(
+            "Optional TSV output path for per-sample counts of SNP sites excluded "
+            "from clean due to missing genotype calls."
+        ),
+    )
     args = ap.parse_args()
 
     in_path = args.vcf
@@ -288,8 +299,9 @@ def main() -> None:
 
     out_inv = prefix + ".inv"
     out_filt = prefix + ".filtered"
-    out_clean = prefix + ".clean"
+    out_clean = prefix + ".clean.vcf"
     out_missing = prefix + ".missing.bed"
+    out_missing_gt_stats = args.missing_gt_stats_out or (prefix + ".missing_gt_snp_by_sample.tsv")
     bgzip_output = args.bgzip_output
 
     out_inv_final = out_inv + (".gz" if bgzip_output else "")
@@ -330,6 +342,7 @@ def main() -> None:
 
         def flush_group(records: list[dict]) -> None:
             nonlocal inv_bp, filtered_bp, clean_bp, singer_reformat
+            nonlocal missing_gt_snp_total, missing_gt_snp_by_sample
             if not records:
                 return
             has_inv = any(r["is_inv"] for r in records)
@@ -342,6 +355,18 @@ def flush_group(records: list[dict]) -> None:
                 records.clear()
                 return
             if has_filtered or has_missing_gt:
+                # Track SNP-like sites that were excluded only because sample GTs were missing.
+                if has_missing_gt and not has_filtered:
+                    missing_gt_snp_total += 1
+                    missing_samples: set[str] = set()
+                    for r in records:
+                        for idx in r["missing_gt_sample_idxs"]:
+                            if idx < len(sample_names):
+                                missing_samples.add(sample_names[idx])
+                            else:
+                                missing_samples.add(f"sample_{idx + 1}")
+                    for sample in missing_samples:
+                        missing_gt_snp_by_sample[sample] = missing_gt_snp_by_sample.get(sample, 0) + 1
                 for r in records:
                     f_filt.write(r["line"] + "\n")
                     filtered_bp += 1
@@ -386,6 +411,8 @@ def flush_group(records: list[dict]) -> None:
                     f_clean.write(raw)
                 # Buffer meta/header lines until we see the #CHROM header line.
                 elif raw.startswith("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"):
+                    header_cols = raw.rstrip("\n").split("\t")
+                    sample_names = header_cols[9:] if len(header_cols) > 9 else []
                     # First: inject provenance headers (##...).
                     for ln in prov_lines:
                         f_inv.write(ln)
@@ -554,7 +581,11 @@ def flush_group(records: list[dict]) -> None:
                 group_chrom = chrom
                 group_pos = int(cols[1])
             gts = [c.split(":", 1)[0] for c in cols[9:]] if len(cols) > 9 else ["."]
-            has_missing_gt = any(gt == "." for gt in gts)
+            missing_gt_sample_idxs = [
+                i for i, gt in enumerate(gts)
+                if ("." in gt) or (gt == "")
+            ]
+            has_missing_gt = len(missing_gt_sample_idxs) > 0
             group.append(
                 {
                     "line": line,
@@ -562,6 +593,7 @@ def flush_group(records: list[dict]) -> None:
                     "is_inv": is_inv,
                     "is_filtered": is_filtered,
                     "has_missing_gt": has_missing_gt,
+                    "missing_gt_sample_idxs": missing_gt_sample_idxs,
                 }
             )
 
@@ -580,6 +612,13 @@ def flush_group(records: list[dict]) -> None:
 
         print(f"  missing:  {missing_bp:,}",file=sys.stderr)
 
+    with open(out_missing_gt_stats, "w", encoding="utf-8") as f_stats:
+        f_stats.write("sample\texcluded_snp_sites\ttotal_excluded_snp_sites\n")
+        names = sample_names if sample_names else sorted(missing_gt_snp_by_sample.keys())
+        for sample in names:
+            count = missing_gt_snp_by_sample.get(sample, 0)
+            f_stats.write(f"{sample}\t{count}\t{missing_gt_snp_total}\n")
+
     if bgzip_output:
         for path in (out_inv, out_filt, out_clean, out_missing):
             proc = subprocess.run(["bgzip", "-f", path], check=False)
diff --git a/scripts/split_and_mask.sh b/scripts/split_and_mask.sh
index 4d4294e..26fd0cc 100644
--- a/scripts/split_and_mask.sh
+++ b/scripts/split_and_mask.sh
@@ -8,10 +8,10 @@
 
 set -euo pipefail
 
-# One job: split gVCF into .inv/.filtered/.clean and build mask bed.
+# One job: split gVCF into .inv/.filtered/.clean.vcf and build mask bed.
 usage() {
   echo "Usage: $0 -p <prefix> -d <depth> [--filter-multiallelic] [--no-gzip] [--no-merge]"
-  echo "  -p  Prefix for .inv/.filtered/.clean outputs"
+  echo "  -p  Prefix for .inv/.filtered/.clean.vcf outputs"
   echo "  -d  Depth cutoff passed to split.py"
   echo "  --filter-multiallelic  Filter multi-allelic SNPs in split.py"
   echo "  --no-gzip              Do not gzip split.py outputs (default)"
@@ -76,7 +76,7 @@ if [ "$NO_GZIP" != "true" ]; then
 fi
 SPLIT_ARGS+=("$INPUT_VCF")
 
-# Run split.py to produce .inv/.filtered/.clean/.missing.bed.
+# Run split.py to produce .inv/.filtered/.clean.vcf/.missing.bed.
 "${SPLIT_ARGS[@]}"
 
 # Build mask bed from filtered + missing + dropped indels.
diff --git a/scripts/summary_report.py b/scripts/summary_report.py
index f32ef21..82336d2 100644
--- a/scripts/summary_report.py
+++ b/scripts/summary_report.py
@@ -221,6 +221,86 @@ def y_scale(y_val: int) -> float:
     return "\n".join(parts)
 
 
+def _read_missing_gt_stats(paths: list[str]) -> dict[str, int]:
+    counts: dict[str, int] = {}
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8", errors="ignore") as handle:
+                header = handle.readline()
+                if not header:
+                    continue
+                for line in handle:
+                    if not line.strip():
+                        continue
+                    parts = line.rstrip("\n").split("\t")
+                    if len(parts) < 2:
+                        continue
+                    sample = parts[0]
+                    try:
+                        value = int(parts[1])
+                    except ValueError:
+                        continue
+                    counts[sample] = counts.get(sample, 0) + value
+        except OSError:
+            continue
+    return counts
+
+
+def _svg_horizontal_bar_chart(labels: list[str], values: list[int], width: int = 900) -> str:
+    if not labels or not values or len(labels) != len(values):
+        return "<p>No data available.</p>"
+    max_label_len = max(len(label) for label in labels)
+    label_space = min(max(120, max_label_len * 8), 360)
+    bar_h = 18
+    gap = 8
+    margin = {"left": label_space + 20, "right": 70, "top": 24, "bottom": 30}
+    n = len(labels)
+    height = margin["top"] + margin["bottom"] + n * bar_h + max(n - 1, 0) * gap
+    plot_w = width - margin["left"] - margin["right"]
+    max_val = max(values)
+    if max_val <= 0:
+        max_val = 1
+
+    parts = [
+        f'<svg width="{width}" height="{height}" viewBox="0 0 {width} {height}" '
+        'xmlns="http://www.w3.org/2000/svg" role="img">',
+        '<rect width="100%" height="100%" fill="white"/>',
+    ]
+    for idx, (label, value) in enumerate(zip(labels, values)):
+        y = margin["top"] + idx * (bar_h + gap)
+        bar_w = int(round((value / max_val) * plot_w))
+        parts.append(
+            f'<text x="{margin["left"] - 8}" y="{y + bar_h - 4}" text-anchor="end" '
+            f'font-size="11" font-family="sans-serif">{html.escape(label)}</text>'
+        )
+        parts.append(
+            f'<rect x="{margin["left"]}" y="{y}" width="{bar_w}" height="{bar_h}" fill="#4C78A8"/>'
+        )
+        parts.append(
+            f'<text x="{margin["left"] + bar_w + 6}" y="{y + bar_h - 4}" '
+            f'font-size="11" font-family="sans-serif">{value:,}</text>'
+        )
+
+    x0 = margin["left"]
+    y0 = height - margin["bottom"] + 2
+    parts.append(
+        f'<line x1="{x0}" y1="{y0}" x2="{x0 + plot_w}" y2="{y0}" stroke="#333" stroke-width="1"/>'
+    )
+    for i in range(5):
+        frac = i / 4
+        x = x0 + frac * plot_w
+        tick = int(round(frac * max_val))
+        parts.append(
+            f'<line x1="{x:.2f}" y1="{y0}" x2="{x:.2f}" y2="{y0 + 4}" stroke="#333" stroke-width="1"/>'
+        )
+        parts.append(
+            f'<text x="{x:.2f}" y="{y0 + 16}" text-anchor="middle" font-size="10" '
+            f'font-family="sans-serif">{tick:,}</text>'
+        )
+    parts.append("</svg>")
+    return "\n".join(parts)
+
+
 report_path = Path(snakemake.output.report)
 report_path.parent.mkdir(parents=True, exist_ok=True)
 
@@ -233,6 +313,7 @@ def y_scale(y_val: int) -> float:
 dropped_contigs_not_in_ref = [str(c) for c in snakemake.params.dropped_contigs_not_in_ref]
 requested_contigs = [str(c) for c in snakemake.params.requested_contigs]
 remapped_contigs = [(str(a), str(b)) for a, b in snakemake.params.remapped_contigs]
+contigs_not_in_all_mafs = [str(c) for c in getattr(snakemake.params, "contigs_not_in_all_mafs", [])]
 ploidy = int(getattr(snakemake.params, "ploidy", 1))
 ploidy_source = str(getattr(snakemake.params, "ploidy_source", "unknown"))
 ploidy_file_values = {
@@ -248,8 +329,21 @@ def y_scale(y_val: int) -> float:
 log_paths.extend(sorted(Path(".snakemake").rglob("*.log")))
 for log_path in log_paths:
     try:
+        in_shell_command_block = False
         with log_path.open("r", encoding="utf-8", errors="ignore") as handle:
             for line in handle:
+                # Snakemake logs include literal shell command text. Ignore WARNING
+                # tokens inside those blocks to avoid false positives from lines
+                # like: echo "WARNING: ..."
+                if line.startswith("Shell command:"):
+                    in_shell_command_block = True
+                    continue
+                if in_shell_command_block:
+                    if line.startswith(" ") or line.startswith("\t") or not line.strip():
+                        continue
+                    in_shell_command_block = False
+                if 'echo "WARNING:' in line or "echo 'WARNING:" in line:
+                    continue
                 if "WARNING" in line or "Warning" in line:
                     warnings.append(f"{log_path}: {line.rstrip()}")
     except OSError:
@@ -258,8 +352,10 @@ def y_scale(y_val: int) -> float:
 try:
     maf_contigs = _read_maf_contigs(Path(snakemake.params.maf_dir))
     ref_contigs = set(_read_fasta_contigs(Path(snakemake.params.orig_ref_fasta)))
-    missing_in_ref = sorted(set(maf_contigs) - ref_contigs)
-    missing_in_maf = sorted(ref_contigs - set(maf_contigs))
+    remapped_sources = {src for src, _dst in remapped_contigs}
+    remapped_targets = {dst for _src, dst in remapped_contigs}
+    missing_in_ref = sorted((set(maf_contigs) - ref_contigs) - remapped_sources)
+    missing_in_maf = sorted((ref_contigs - set(maf_contigs)) - remapped_targets)
     if missing_in_ref:
         warnings.append(
             "WARNING: MAF contigs not present in reference (showing up to 5): "
@@ -295,6 +391,17 @@ def y_scale(y_val: int) -> float:
         "WARNING: Configured contigs were remapped to renamed-reference contigs: "
         f"{preview_pairs}{extra}"
     )
+if contigs_not_in_all_mafs:
+    preview = ", ".join(contigs_not_in_all_mafs[:20])
+    extra = (
+        f" (+{len(contigs_not_in_all_mafs) - 20} more)"
+        if len(contigs_not_in_all_mafs) > 20
+        else ""
+    )
+    warnings.append(
+        "WARNING: Contigs not present in all MAF files were excluded from default "
+        f"processing: {preview}{extra}"
+    )
 for message in ploidy_warnings:
     warnings.append(f"WARNING: Ploidy inference: {message}")
 
@@ -358,6 +465,17 @@ def y_scale(y_val: int) -> float:
             )
         handle.write("</ul>\n")
 
+    if contigs_not_in_all_mafs:
+        handle.write("<h2>Contigs Not In All MAF Files</h2>\n")
+        handle.write(
+            "<p><em>These contigs are not present in every MAF and are excluded "
+            "from default contig selection unless explicitly listed in config.</em></p>\n"
+        )
+        handle.write("<ul>\n")
+        for contig in contigs_not_in_all_mafs:
+            handle.write(f"<li><code>{html.escape(contig)}</code></li>\n")
+        handle.write("</ul>\n")
+
     handle.write("<h2>Jobs run</h2>\n")
     handle.write("<ul>\n")
     for job, outputs in jobs:
@@ -520,6 +638,21 @@ def y_scale(y_val: int) -> float:
             )
         )
 
+    handle.write("<h2>SNP Sites Excluded From Clean By MAF File</h2>\n")
+    excluded_by_sample = _read_missing_gt_stats([str(p) for p in snakemake.input.missing_gt_stats])
+    if excluded_by_sample:
+        ranked = sorted(excluded_by_sample.items(), key=lambda kv: kv[1], reverse=True)
+        labels = [name for name, _ in ranked]
+        values = [value for _, value in ranked]
+        handle.write(
+            _svg_horizontal_bar_chart(
+                labels,
+                values,
+            )
+        )
+    else:
+        handle.write("<p>No missing-genotype SNP exclusions were recorded.</p>\n")
+
     handle.write("<h2>Warnings</h2>\n")
     if warnings:
         handle.write("<ul>\n")
diff --git a/tests/test_check_split_coverage.py b/tests/test_check_split_coverage.py
index 1fa2619..5516652 100644
--- a/tests/test_check_split_coverage.py
+++ b/tests/test_check_split_coverage.py
@@ -12,7 +12,7 @@ def test_check_split_coverage_pass(tmp_path: Path):
     fai = tmp_path / "ref.fa.fai"
     fai.write_text("1\t10\t0\t0\t0\n", encoding="utf-8")
 
-    (tmp_path / "out.clean").write_text(
+    (tmp_path / "out.clean.vcf").write_text(
         "##fileformat=VCFv4.2\n"
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
         "1\t1\t.\tA\tC\t.\t.\tDP=1\n"
@@ -26,7 +26,7 @@ def test_check_split_coverage_pass(tmp_path: Path):
         "1\t4\t.\tA\t.\t.\t.\tDP=1\n",
         encoding="utf-8",
     )
-    (tmp_path / "out.filtered.bed").write_text(
+    (tmp_path / "out.clean.mask.bed").write_text(
         "1\t4\t10\n",
         encoding="utf-8",
     )
@@ -51,7 +51,7 @@ def test_check_split_coverage_overlap_fails(tmp_path: Path):
     fai = tmp_path / "ref.fa.fai"
     fai.write_text("1\t5\t0\t0\t0\n", encoding="utf-8")
 
-    (tmp_path / "out.clean").write_text(
+    (tmp_path / "out.clean.vcf").write_text(
         "##fileformat=VCFv4.2\n"
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
         "1\t1\t.\tA\tC\t.\t.\tDP=1\n",
@@ -64,7 +64,7 @@ def test_check_split_coverage_overlap_fails(tmp_path: Path):
         encoding="utf-8",
     )
     # Overlaps clean at pos 1.
-    (tmp_path / "out.filtered.bed").write_text(
+    (tmp_path / "out.clean.mask.bed").write_text(
         "1\t0\t2\n",
         encoding="utf-8",
     )
@@ -90,7 +90,7 @@ def test_check_split_coverage_empty_inputs_warns_not_fails(tmp_path: Path):
     fai = tmp_path / "ref.fa.fai"
     fai.write_text("SCAFFOLD_24\t100\t0\t0\t0\n", encoding="utf-8")
 
-    (tmp_path / "combined.SCAFFOLD_24.clean").write_text(
+    (tmp_path / "combined.SCAFFOLD_24.clean.vcf").write_text(
         "##fileformat=VCFv4.2\n"
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
         encoding="utf-8",
@@ -100,7 +100,7 @@ def test_check_split_coverage_empty_inputs_warns_not_fails(tmp_path: Path):
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
         encoding="utf-8",
     )
-    (tmp_path / "combined.SCAFFOLD_24.filtered.bed").write_text("", encoding="utf-8")
+    (tmp_path / "combined.SCAFFOLD_24.clean.mask.bed").write_text("", encoding="utf-8")
 
     _run(
         [
@@ -116,4 +116,4 @@ def test_check_split_coverage_empty_inputs_warns_not_fails(tmp_path: Path):
     report = tmp_path / "combined.SCAFFOLD_24.coverage.txt"
     text = report.read_text(encoding="utf-8")
     assert "chrom=SCAFFOLD_24" in text
-    assert "warning=No records found in clean/inv/filtered.bed" in text
+    assert "warning=No records found in clean/inv/clean.mask.bed" in text
diff --git a/tests/test_drop_sv.py b/tests/test_drop_sv.py
index 1739eed..97a6afc 100644
--- a/tests/test_drop_sv.py
+++ b/tests/test_drop_sv.py
@@ -44,6 +44,15 @@ def _write_vcf(path: Path):
     )
 
 
+def _write_snp_only_vcf(path: Path):
+    path.write_text(
+        "##fileformat=VCFv4.2\n"
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
+        "1\t2\t.\tA\tT\t.\t.\t.\n",
+        encoding="utf-8",
+    )
+
+
 def _count_records(path: Path) -> int:
     count = 0
     with subprocess.Popen(_conda_cmd("bcftools", "view", "-H", str(path)), stdout=subprocess.PIPE, text=True) as proc:
@@ -85,3 +94,34 @@ def test_drop_sv_filters_large_indel(tmp_path: Path):
     assert dropped_bed.exists()
     bed_lines = [l for l in dropped_bed.read_text(encoding="utf-8").splitlines() if l.strip()]
     assert len(bed_lines) == 1
+
+
+def test_drop_sv_copies_and_indexes_when_no_large_indels(tmp_path: Path):
+    _require_conda_tools("bcftools", "bgzip", "tabix")
+    vcf = tmp_path / "sample.gvcf"
+    _write_snp_only_vcf(vcf)
+
+    _run(_conda_cmd("bgzip", "-f", str(vcf)))
+    gvcf = tmp_path / "sample.gvcf.gz"
+    _run(_conda_cmd("tabix", "-p", "vcf", str(gvcf)))
+
+    _run(_conda_cmd(
+        "python3",
+        str(Path("scripts") / "dropSV.py"),
+        "-d",
+        str(tmp_path),
+        "-c",
+        "1",
+    ))
+
+    cleaned = tmp_path / "cleangVCF" / "sample.gvcf.gz"
+    cleaned_tbi = tmp_path / "cleangVCF" / "sample.gvcf.gz.tbi"
+    assert cleaned.exists()
+    assert cleaned_tbi.exists()
+    assert _count_records(cleaned) == 1
+    # No filtering path should preserve the compressed payload byte-for-byte via copy.
+    assert cleaned.read_bytes() == gvcf.read_bytes()
+
+    dropped_bed = tmp_path / "cleangVCF" / "dropped_indels.bed"
+    assert dropped_bed.exists()
+    assert dropped_bed.read_text(encoding="utf-8").strip() == ""
diff --git a/tests/test_filt_to_bed.py b/tests/test_filt_to_bed.py
index 40e89f4..a0216c3 100644
--- a/tests/test_filt_to_bed.py
+++ b/tests/test_filt_to_bed.py
@@ -1,5 +1,6 @@
 import subprocess
 import sys
+import gzip
 from pathlib import Path
 
 
@@ -36,7 +37,7 @@ def test_filt_to_bed_uses_ref_len(tmp_path: Path):
     dropped.write_text("", encoding="utf-8")
 
     (tmp_path / "sample.inv").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8")
-    (tmp_path / "sample.clean").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8")
+    (tmp_path / "sample.clean.vcf").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8")
 
     _run(
         [
@@ -49,7 +50,7 @@ def test_filt_to_bed_uses_ref_len(tmp_path: Path):
         cwd=Path.cwd(),
     )
 
-    out_bed = tmp_path / "sample.filtered.bed"
+    out_bed = tmp_path / "sample.clean.mask.bed"
     assert out_bed.exists()
 
     # REF length 3 => interval length 3, plus missing interval length 1
@@ -79,7 +80,7 @@ def test_filt_to_bed_end_and_subtraction(tmp_path: Path):
         "1\t11\t.\tA\t.\t.\t.\tDP=10\tGT\t0\n",
         encoding="utf-8",
     )
-    (tmp_path / "sample.clean").write_text(
+    (tmp_path / "sample.clean.vcf").write_text(
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
         "1\t12\t.\tA\tG\t.\t.\tDP=10\tGT\t0/1\n",
         encoding="utf-8",
@@ -96,8 +97,44 @@ def test_filt_to_bed_end_and_subtraction(tmp_path: Path):
         cwd=Path.cwd(),
     )
 
-    out_bed = tmp_path / "sample.filtered.bed"
+    out_bed = tmp_path / "sample.clean.mask.bed"
     assert out_bed.exists()
 
     # Span 10-12 (3 bp), subtract 11 and 12 => only 1 bp remains.
     assert _sum_bed(out_bed) == 1
+
+
+def test_filt_to_bed_accepts_gz_missing_bed(tmp_path: Path):
+    prefix = tmp_path / "sample"
+
+    filtered = tmp_path / "sample.filtered.gz"
+    with gzip.open(filtered, "wt", encoding="utf-8") as handle:
+        handle.write(
+            "##fileformat=VCFv4.2\n"
+            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
+            "1\t10\t.\tA\tT\t.\t.\t.\n"
+        )
+
+    with gzip.open(tmp_path / "sample.missing.bed.gz", "wt", encoding="utf-8") as handle:
+        handle.write("1\t0\t1\n")
+
+    dropped = tmp_path / "dropped_indels.bed"
+    dropped.write_text("", encoding="utf-8")
+
+    (tmp_path / "sample.inv").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8")
+    (tmp_path / "sample.clean.vcf").write_text("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", encoding="utf-8")
+
+    _run(
+        [
+            sys.executable,
+            str(Path("scripts") / "filt_to_bed.py"),
+            str(prefix),
+            "--dropped-bed",
+            str(dropped),
+        ],
+        cwd=Path.cwd(),
+    )
+
+    out_bed = tmp_path / "sample.clean.mask.bed"
+    assert out_bed.exists()
+    assert _sum_bed(out_bed) == 2
diff --git a/tests/test_integration.py b/tests/test_integration.py
index b73bdc2..e6bcf21 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -2,6 +2,7 @@
 import subprocess
 from pathlib import Path
 import shutil
+import yaml
 
 import pytest
 
@@ -55,10 +56,7 @@ def _is_gzip(path: Path) -> bool:
         return False
 
 
-def test_snakemake_summary(tmp_path: Path):
-    _require_integration()
-    _require_conda_tools("argprep", "snakemake", "bcftools", "tabix", "bgzip", "gatk", "java")
-    repo = Path.cwd()
+def _run_example_summary(repo: Path, tmp_path: Path) -> str:
     ref = _config_reference_fasta(repo / "config.yaml")
     if ref is None or not ref.exists():
         pytest.skip(f"Missing reference FASTA from config.yaml: {ref}")
@@ -100,15 +98,10 @@ def test_snakemake_summary(tmp_path: Path):
         ],
         cwd=Path.cwd(),
     )
+    return Path(summary_target).read_text(encoding="utf-8", errors="ignore")
 
 
-def test_snakemake_contig_mismatch_handling(tmp_path: Path):
-    _require_integration()
-    _require_conda_tools(
-        "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java"
-    )
-    repo = Path.cwd()
-
+def _run_contig_mismatch_summary(repo: Path, tmp_path: Path) -> str:
     # Build a reference with one real contig and one contig absent from MAFs.
     ref_src = repo / "example_data" / "ref.fa"
     if not ref_src.exists():
@@ -173,8 +166,27 @@ def test_snakemake_contig_mismatch_handling(tmp_path: Path):
         ],
         cwd=repo,
     )
+    return Path(summary_target).read_text(encoding="utf-8", errors="ignore")
 
-    summary_text = Path(summary_target).read_text(encoding="utf-8", errors="ignore")
+
+def test_snakemake_summary(tmp_path: Path):
+    _require_integration()
+    _require_conda_tools("argprep", "snakemake", "bcftools", "tabix", "bgzip", "gatk", "java")
+    repo = Path.cwd()
+    summary_text = _run_example_summary(repo, tmp_path)
+    assert "Workflow summary" in summary_text
+    assert "Ploidy" in summary_text
+    assert "Jobs run" in summary_text
+    assert "Warnings" in summary_text
+
+
+def test_snakemake_contig_mismatch_handling(tmp_path: Path):
+    _require_integration()
+    _require_conda_tools(
+        "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java"
+    )
+    repo = Path.cwd()
+    summary_text = _run_contig_mismatch_summary(repo, tmp_path)
     assert "Configured contigs not present in reference .fai were skipped" in summary_text
     assert "requested_only_contig" in summary_text
     assert "Configured contigs were remapped to renamed-reference contigs" in summary_text
@@ -185,6 +197,109 @@ def test_snakemake_contig_mismatch_handling(tmp_path: Path):
     assert "fai_only_contig" in summary_text
 
 
+def test_snakemake_contig_mismatch_remapped_contigs_not_double_reported(tmp_path: Path):
+    _require_integration()
+    _require_conda_tools(
+        "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java"
+    )
+    summary_text = _run_contig_mismatch_summary(Path.cwd(), tmp_path)
+    assert "Configured contigs were remapped to renamed-reference contigs" in summary_text
+    assert "chr1-&gt;1" in summary_text
+    assert "MAF contigs not present in reference (showing up to 5): chr1" not in summary_text
+    assert "Reference contigs not present in MAFs (showing up to 5): 1" not in summary_text
+
+
+def test_snakemake_slurm_profile_dry_run_smoke(tmp_path: Path):
+    profile_path = Path.cwd() / "profiles" / "slurm" / "config.yaml"
+    profile = yaml.safe_load(profile_path.read_text(encoding="utf-8"))
+    assert profile["executor"] == "cluster-generic"
+    assert int(profile["jobs"]) >= 1
+    submit_cmd = str(profile["cluster-generic-submit-cmd"])
+    assert "sbatch" in submit_cmd
+    assert "{config[slurm_account]}" in submit_cmd
+    assert "{config[slurm_partition]}" in submit_cmd
+    assert "{threads}" in submit_cmd
+    assert "{resources.mem_mb}" in submit_cmd
+    assert "{resources.time}" in submit_cmd
+    defaults = profile.get("default-resources")
+    assert isinstance(defaults, list)
+    assert any(str(item).startswith("mem_mb=") for item in defaults)
+    assert any(str(item).startswith("time=") for item in defaults)
+
+
+def test_snakemake_default_contigs_use_maf_intersection(tmp_path: Path):
+    _require_integration()
+    _require_conda_tools(
+        "argprep", "snakemake", "samtools", "bcftools", "tabix", "bgzip", "gatk", "java"
+    )
+    repo = Path.cwd()
+
+    ref = tmp_path / "ref.fa"
+    ref.write_text(
+        ">1\n" + ("ACGT" * 50) + "\n" + ">2\n" + ("TGCA" * 50) + "\n",
+        encoding="utf-8",
+    )
+    _run([_conda_exe(), "run", "-n", "argprep", "samtools", "faidx", str(ref)], cwd=repo)
+
+    maf_dir = tmp_path / "maf"
+    maf_dir.mkdir()
+    (maf_dir / "s1.maf").write_text(
+        "##maf version=1\n"
+        "a score=0\n"
+        "s 1 0 12 + 200 ACGTACGTACGT\n"
+        "s s1 0 12 + 200 ACGTACGTACGT\n"
+        "\n"
+        "a score=0\n"
+        "s 2 0 12 + 200 TGCATGCATGCA\n"
+        "s s1 0 12 + 200 TGCATGCATGCA\n",
+        encoding="utf-8",
+    )
+    (maf_dir / "s2.maf").write_text(
+        "##maf version=1\n"
+        "a score=0\n"
+        "s 1 0 12 + 200 ACGTACGTACGT\n"
+        "s s2 0 12 + 200 ACGTACGTACGT\n",
+        encoding="utf-8",
+    )
+
+    gvcf_dir = tmp_path / "gvcf"
+    results_dir = tmp_path / "results"
+    summary_target = str(results_dir / "summary.html")
+    _run(
+        [
+            _conda_exe(),
+            "run",
+            "-n",
+            "argprep",
+            "env",
+            "PATH=/opt/anaconda3/envs/argprep/bin:/usr/bin:/bin:/usr/sbin:/sbin",
+            "HOME=/tmp",
+            "TMPDIR=/tmp",
+            "XDG_CACHE_HOME=/tmp",
+            "SNAKEMAKE_OUTPUT_CACHE=/tmp/snakemake",
+            "SNAKEMAKE_SOURCE_CACHE=/tmp/snakemake",
+            "snakemake",
+            "-j",
+            "2",
+            "--config",
+            f"maf_dir={maf_dir}",
+            f"reference_fasta={ref}",
+            f"gvcf_dir={gvcf_dir}",
+            f"results_dir={results_dir}",
+            "samples=['s1','s2']",
+            "--",
+            summary_target,
+        ],
+        cwd=repo,
+    )
+
+    summary_text = Path(summary_target).read_text(encoding="utf-8", errors="ignore")
+    assert "Contigs not present in all MAF files were excluded from default processing" in summary_text
+    assert "<code>2</code>" in summary_text
+    assert (results_dir / "combined" / "combined.1.gvcf.gz").exists()
+    assert not (results_dir / "combined" / "combined.2.gvcf.gz").exists()
+
+
 def test_maf_to_gvcf_single_sample(tmp_path: Path):
     _require_integration()
     _require_conda_tools("argprep", "java")
diff --git a/tests/test_random_positions.py b/tests/test_random_positions.py
index 55b7401..f1824f8 100644
--- a/tests/test_random_positions.py
+++ b/tests/test_random_positions.py
@@ -74,8 +74,8 @@ def test_random_position_membership():
     contig = "1"
     base = Path("results") / "split" / f"combined.{contig}"
     inv_path = base.with_suffix(base.suffix + ".inv")
-    clean_path = base.with_suffix(base.suffix + ".clean")
-    filtered_bed = base.with_suffix(base.suffix + ".filtered.bed")
+    clean_path = base.with_suffix(base.suffix + ".clean.vcf")
+    filtered_bed = base.with_suffix(base.suffix + ".clean.mask.bed")
     missing_bed = base.with_suffix(base.suffix + ".missing.bed")
     fai_path = Path("results") / "refs" / "reference_gvcf.fa.fai"
 
diff --git a/tests/test_snakefile_contig_resolution.py b/tests/test_snakefile_contig_resolution.py
new file mode 100644
index 0000000..8510230
--- /dev/null
+++ b/tests/test_snakefile_contig_resolution.py
@@ -0,0 +1,76 @@
+import ast
+import re
+from pathlib import Path
+
+
+def _extract_function_source(text: str, name: str) -> str:
+    lines = text.splitlines(keepends=True)
+    start = None
+    for i, line in enumerate(lines):
+        if line.startswith(f"def {name}("):
+            start = i
+            break
+    if start is None:
+        raise AssertionError(f"Function {name} not found in Snakefile")
+
+    end = len(lines)
+    i = start
+    paren_balance = 0
+    while i < len(lines):
+        paren_balance += lines[i].count("(") - lines[i].count(")")
+        if paren_balance <= 0 and lines[i].rstrip().endswith(":"):
+            i += 1
+            break
+        i += 1
+
+    for i in range(i, len(lines)):
+        line = lines[i]
+        if not line.strip():
+            continue
+        if line.startswith(" ") or line.startswith("\t"):
+            continue
+        if line.startswith("#"):
+            continue
+        if line.startswith("def "):
+            end = i
+            break
+        end = i
+        break
+    return "".join(lines[start:end])
+
+
+def _load_snakefile_functions():
+    snakefile = Path(__file__).resolve().parents[1] / "Snakefile"
+    text = snakefile.read_text(encoding="utf-8")
+    snippet = (
+        _extract_function_source(text, "_normalize_contig")
+        + "\n"
+        + _extract_function_source(text, "_resolve_requested_contigs")
+    )
+    module = ast.parse(snippet, filename=str(snakefile))
+    ast.fix_missing_locations(module)
+    ns = {"re": re}
+    exec(compile(module, str(snakefile), "exec"), ns)
+    return ns["_normalize_contig"], ns["_resolve_requested_contigs"]
+
+
+def test_normalize_contig_handles_chr_prefix_and_zero_padding():
+    normalize, _ = _load_snakefile_functions()
+    assert normalize("chr01") == "1"
+    assert normalize("001") == "1"
+    assert normalize("chrX") == "x"
+    assert normalize("chr000") == "0"
+
+
+def test_resolve_requested_contigs_remaps_dedupes_and_drops_ambiguous():
+    _, resolve = _load_snakefile_functions()
+
+    kept, dropped, remapped = resolve(["chr1", "01", "2", "chr2", "chr3"], ["1", "2"])
+    assert kept == ["1", "2"]
+    assert dropped == ["chr3"]
+    assert remapped == [("chr1", "1"), ("01", "1"), ("chr2", "2")]
+
+    kept2, dropped2, remapped2 = resolve(["chr1"], ["1", "chr01"])
+    assert kept2 == []
+    assert dropped2 == ["chr1"]
+    assert remapped2 == []
diff --git a/tests/test_split.py b/tests/test_split.py
index abc2ad1..870a623 100644
--- a/tests/test_split.py
+++ b/tests/test_split.py
@@ -60,7 +60,7 @@ def test_singer_keeps_genotypes_and_strips_nonref_alt(tmp_path: Path):
     gvcf.write_text(
         "##fileformat=VCFv4.2\n"
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\tS2\tS3\n"
-        "1\t2\t.\tA\tG,<NON_REF>\t.\t.\tDP=30\tGT:AD\t0/1:0,1\t./.:.\t0/0:1,0\n",
+        "1\t2\t.\tA\tG,<NON_REF>\t.\t.\tDP=30\tGT:AD\t0/1:0,1\t0/1:1,1\t0/0:1,0\n",
         encoding="utf-8",
     )
 
@@ -78,13 +78,13 @@ def test_singer_keeps_genotypes_and_strips_nonref_alt(tmp_path: Path):
         cwd=Path.cwd(),
     )
 
-    clean = tmp_path / "out.clean"
+    clean = tmp_path / "out.clean.vcf"
     line = [l for l in clean.read_text(encoding="utf-8").splitlines() if not l.startswith("#")][0]
     parts = line.split("\t")
     # ALT should drop <NON_REF>, but genotype/sample fields should be preserved.
     assert parts[4] == "G"
     assert parts[8] == "GT:AD"
-    assert parts[9:] == ["0/1:0,1", "./.:.", "0/0:1,0"]
+    assert parts[9:] == ["0/1:0,1", "0/1:1,1", "0/0:1,0"]
 
 
 def _count_vcf_records(path: Path) -> int:
@@ -156,7 +156,7 @@ def test_split_alt_dot_and_nonref_only_are_invariant(tmp_path: Path):
     )
 
     inv = tmp_path / "out.inv"
-    clean = tmp_path / "out.clean"
+    clean = tmp_path / "out.clean.vcf"
 
     inv_records = _count_vcf_records(inv)
     clean_records = _count_vcf_records(clean)
@@ -164,3 +164,37 @@ def test_split_alt_dot_and_nonref_only_are_invariant(tmp_path: Path):
     # ALT="." and ALT="<NON_REF>"-only records are invariant in current logic.
     assert inv_records == 3
     assert clean_records == 0
+
+
+def test_missing_gt_exclusion_stats_written_per_sample(tmp_path: Path):
+    gvcf = tmp_path / "in.gvcf"
+    gvcf.write_text(
+        "##fileformat=VCFv4.2\n"
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tbad\tgood\n"
+        "1\t1\t.\tA\tG\t.\t.\tDP=10\tGT\t.\t0/1\n"
+        "1\t2\t.\tA\tG\t.\t.\tDP=10\tGT\t.\t0/1\n"
+        "1\t3\t.\tA\tG\t.\t.\tDP=10\tGT\t.\t0/1\n"
+        "1\t4\t.\tA\tG\t.\t.\tDP=10\tGT\t0/1\t0/1\n",
+        encoding="utf-8",
+    )
+
+    prefix = tmp_path / "out"
+    proc = subprocess.run(
+        [
+            sys.executable,
+            str(Path("scripts") / "split.py"),
+            "--out-prefix",
+            str(prefix),
+            str(gvcf),
+        ],
+        cwd=Path.cwd(),
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+
+    assert "WARNING:" not in proc.stderr
+    stats = (tmp_path / "out.missing_gt_snp_by_sample.tsv").read_text(encoding="utf-8")
+    assert "sample\texcluded_snp_sites\ttotal_excluded_snp_sites" in stats
+    assert "bad\t3\t3" in stats
+    assert "good\t0\t3" in stats
diff --git a/tests/test_summary_report.py b/tests/test_summary_report.py
new file mode 100644
index 0000000..f58f34e
--- /dev/null
+++ b/tests/test_summary_report.py
@@ -0,0 +1,105 @@
+import gzip
+import runpy
+from pathlib import Path
+from types import SimpleNamespace
+
+
+def _run_summary_report(tmp_path: Path, monkeypatch) -> str:
+    repo = Path(__file__).resolve().parents[1]
+    monkeypatch.chdir(tmp_path)
+
+    # The report scans these locations for warnings.
+    (tmp_path / "logs").mkdir()
+    (tmp_path / ".snakemake").mkdir()
+    (tmp_path / "logs" / "test.log").write_text(
+        "Shell command:\n"
+        '  echo "WARNING: from shell command block should be ignored"\n'
+        'echo "WARNING: explicit echo line should be ignored"\n'
+        "WARNING: real log warning should be kept\n",
+        encoding="utf-8",
+    )
+
+    maf_dir = tmp_path / "maf"
+    maf_dir.mkdir()
+    (maf_dir / "plain.maf").write_text(
+        "##maf version=1\n"
+        "a score=0\n"
+        "s chr1 0 4 + 4 ACGT\n"
+        "s sampleA 0 4 + 4 ACGT\n",
+        encoding="utf-8",
+    )
+    with gzip.open(maf_dir / "extra.maf.gz", "wt", encoding="utf-8") as handle:
+        handle.write(
+            "##maf version=1\n"
+            "a score=0\n"
+            "s maf_only_contig 0 4 + 4 ACGT\n"
+            "s sampleB 0 4 + 4 ACGT\n"
+        )
+
+    ref_fa = tmp_path / "ref.fa"
+    ref_fa.write_text(">1\nACGTACGTAC\n>fai_only_contig\nACGT\n", encoding="utf-8")
+    ref_fai = tmp_path / "ref.fa.fai"
+    ref_fai.write_text("1\t10\t0\t0\t0\n", encoding="utf-8")
+
+    status_tsv = tmp_path / "split.status.tsv"
+    status_tsv.write_text(
+        "sampleToRef\tchr1\tmissing\n"
+        "sampleToRef\tchr1\tmissing\n"
+        "sampleToRef\tchr2\tpresent\n",
+        encoding="utf-8",
+    )
+
+    report_path = tmp_path / "results" / "summary.html"
+    fake_snakemake = SimpleNamespace(
+        output=SimpleNamespace(report=str(report_path)),
+        input=SimpleNamespace(beds=[], invs=[], cleans=[], missing_gt_stats=[]),
+        params=SimpleNamespace(
+            contigs=["1"],
+            jobs=[],
+            temp_paths=[],
+            arg_outputs=[],
+            split_prefixes={},
+            split_status_files=[str(status_tsv)],
+            dropped_contigs_not_in_ref=[],
+            requested_contigs=["chr1"],
+            remapped_contigs=[("chr1", "1")],
+            contigs_not_in_all_mafs=[],
+            ploidy=1,
+            ploidy_source="test",
+            ploidy_file_values={},
+            ploidy_warnings=[],
+            maf_dir=str(maf_dir),
+            orig_ref_fasta=str(ref_fa),
+            ref_fai=str(ref_fai),
+        ),
+    )
+
+    runpy.run_path(
+        str(repo / "scripts" / "summary_report.py"),
+        init_globals={"snakemake": fake_snakemake},
+        run_name="__main__",
+    )
+    return report_path.read_text(encoding="utf-8", errors="ignore")
+
+
+def test_summary_report_warning_filters_and_deduplicates(monkeypatch, tmp_path: Path):
+    html = _run_summary_report(tmp_path, monkeypatch)
+
+    assert "Configured contigs were remapped to renamed-reference contigs" in html
+    assert "chr1-&gt;1" in html
+
+    assert "MAF contigs not present in reference" in html
+    assert "maf_only_contig" in html
+    assert "MAF contigs not present in reference (showing up to 5): chr1" not in html
+
+    assert "Reference contigs not present in MAFs" in html
+    assert "fai_only_contig" in html
+    assert "Reference contigs not present in MAFs (showing up to 5): 1" not in html
+
+    assert "gVCF sampleToRef.gvcf.gz is missing configured contigs: chr1" in html
+    assert "gVCF sampleToRef.gvcf.gz is missing configured contigs: chr1, chr1" not in html
+
+    assert "real log warning should be kept" in html
+    assert "from shell command block should be ignored" not in html
+    assert "explicit echo line should be ignored" not in html
+