From f911ef46814a3deb7f92c1e80bb603d9d2178464 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Wed, 3 Dec 2025 10:17:35 -0500 Subject: [PATCH 01/29] feat: minimap2 tool wrapper --- tools/minimap2.wdl | 113 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 tools/minimap2.wdl diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl new file mode 100644 index 000000000..e9ef020d9 --- /dev/null +++ b/tools/minimap2.wdl @@ -0,0 +1,113 @@ +version 1.2 + +task align { + meta { + description: "Align DNA or mRNA sequences against a large reference database" + outputs: { + alignments: "The output alignment file in SAM or PAF format" + } + } + + parameter_meta { + reads: "The input reads file in FASTQ format" + reference_index: "The minimap2 index file for the reference genome" + read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'" + output_name: "The name of the output alignment file" + output_paf: "If true, output in PAF format instead of SAM" + cigar_in_paf: "If true and outputting PAF, include CIGAR strings in the PAF output" + ignore_base_quality: "If true, ignore base quality scores during alignment" + output_md_tag: "If true, include MD tags in the SAM output" + eqx: "If true, use =/X CIGAR operators instead of M" + soft_clip: "If true, use soft clipping for secondary alignments in SAM format" + secondary_alignments: "If true, report secondary alignments" + seed: "Seed value for the minimap2 aligner" + threads: "Number of threads to use for alignment" + } + + input { + File reads + File reference_index + String read_group + String output_name = "aligned.sam" + Boolean output_paf = false + Boolean cigar_in_paf = true + Boolean ignore_base_quality = false + Boolean output_md_tag = true + Boolean eqx = false + Boolean soft_clip = true + Boolean secondary_alignments = true + Int seed = 11 + Int threads = 3 + } + + command <<< + minimap2 \ + ~{if output_paf then "" else "-a"} \ 
+ ~{if output_paf && cigar_in_paf then "-c" else ""} \ + ~{if ignore_base_quality then "-Q" else ""} \ + ~{if output_md_tag then "--MD" else ""} \ + ~{if eqx then "--eqx" else ""} \ + ~{if soft_clip then "-Y" else ""} \ + ~{if secondary_alignments then "--secondary=yes" else "--secondary=no"} \ + -t ~{threads} \ + --seed ~{seed} \ + -R "~{read_group}" \ + "~{reference_index}" \ + "~{reads}" \ + > "~{output_name}" + >>> + + output { + File alignments = output_name + } + + requirements { + container: "quay.io/biocontainers/minimap2:2.30--h577a1d6_0" + cpu: threads + memory: "4 GB" + } +} + +task index { + meta { + description: "Create a minimap2 index for a reference genome" + outputs: { + reference_index: "The generated minimap2 index file" + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + alt_contigs: "Optional file containing a list of alternative contigs" + index_name: "The name of the output index file" + minimizer_kmer_size: "K-mer size for minimizer indexing" + minimizer_window_size: "Window size for minimizer indexing" + } + + input { + File reference_fasta + File? 
alt_contigs + String index_name = "reference.mmi" + Int minimizer_kmer_size = 15 + Int minimizer_window_size = 10 + } + + command <<< + minimap2 \ + -k ~{minimizer_kmer_size} \ + -w ~{minimizer_window_size} \ + ~{if defined(alt_contigs) then "--alt \"~{alt_contigs}\"" else ""} \ + -d "~{index_name}" \ + "~{reference_fasta}" + >>> + + output { + File reference_index = index_name + } + + requirements { + container: "quay.io/biocontainers/minimap2:2.30--h577a1d6_0" + cpu: 1 + memory: "4 GB" + } +} From fd2bca4ef925630aa02c5b853f45981d4b917814 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Wed, 3 Dec 2025 10:46:34 -0500 Subject: [PATCH 02/29] refactor: handle optionally gzipped reference --- tools/minimap2.wdl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index e9ef020d9..8912c42c0 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -93,12 +93,18 @@ task index { } command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + minimap2 \ -k ~{minimizer_kmer_size} \ -w ~{minimizer_window_size} \ ~{if defined(alt_contigs) then "--alt \"~{alt_contigs}\"" else ""} \ -d "~{index_name}" \ - "~{reference_fasta}" + "$ref_fasta" >>> output { From 1ebe2a064a19ea3cba37f1f914835e797a5eb49d Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Wed, 3 Dec 2025 11:20:08 -0500 Subject: [PATCH 03/29] chore: fill in options --- tools/minimap2.wdl | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index 8912c42c0..262e4e324 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -9,10 +9,31 @@ task align { } parameter_meta { - reads: "The input reads file in FASTQ format" + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with minimap2" + read_two_fastq_gz: "Input gzipped FASTQ read 
two file to align with minimap2" reference_index: "The minimap2 index file for the reference genome" read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'" output_name: "The name of the output alignment file" + preset: { + description: "Minimap2 preset for alignment", + external_help: "https://lh3.github.io/minimap2/minimap2.html#8", + options: [ + "sr", + "map-ont", + "lr:hq", + "map-hifi", + "map-pb", + "map-iclr", + "asm5", + "asm10", + "asm20", + "splice", + "splice:hq", + "splice:sr", + "ava-pb", + "ava-ont" + ], + } output_paf: "If true, output in PAF format instead of SAM" cigar_in_paf: "If true and outputting PAF, include CIGAR strings in the PAF output" ignore_base_quality: "If true, ignore base quality scores during alignment" @@ -25,9 +46,11 @@ task align { } input { - File reads + File read_one_fastq_gz File reference_index String read_group + File? read_two_fastq_gz + String? preset = "sr" String output_name = "aligned.sam" Boolean output_paf = false Boolean cigar_in_paf = true @@ -42,6 +65,7 @@ task align { command <<< minimap2 \ + ~{if defined(preset) then "-x \"~{preset}\"" else ""} \ ~{if output_paf then "" else "-a"} \ ~{if output_paf && cigar_in_paf then "-c" else ""} \ ~{if ignore_base_quality then "-Q" else ""} \ @@ -53,7 +77,8 @@ task align { --seed ~{seed} \ -R "~{read_group}" \ "~{reference_index}" \ - "~{reads}" \ + "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) then "\"~{read_two_fastq_gz}\"" else ""} \ > "~{output_name}" >>> @@ -64,7 +89,7 @@ task align { requirements { container: "quay.io/biocontainers/minimap2:2.30--h577a1d6_0" cpu: threads - memory: "4 GB" + memory: "16 GB" } } @@ -82,6 +107,7 @@ task index { index_name: "The name of the output index file" minimizer_kmer_size: "K-mer size for minimizer indexing" minimizer_window_size: "Window size for minimizer indexing" + threads: "Number of threads to use for indexing" } input { @@ -90,6 +116,7 @@ task index { String 
index_name = "reference.mmi" Int minimizer_kmer_size = 15 Int minimizer_window_size = 10 + Int threads = 3 } command <<< @@ -103,6 +130,7 @@ task index { -k ~{minimizer_kmer_size} \ -w ~{minimizer_window_size} \ ~{if defined(alt_contigs) then "--alt \"~{alt_contigs}\"" else ""} \ + -t ~{threads} \ -d "~{index_name}" \ "$ref_fasta" >>> @@ -113,7 +141,7 @@ task index { requirements { container: "quay.io/biocontainers/minimap2:2.30--h577a1d6_0" - cpu: 1 - memory: "4 GB" + cpu: threads + memory: "16 GB" } } From 9533656d9e4741e3f6a6133ecb8a521939b36051 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Wed, 3 Dec 2025 14:28:48 -0500 Subject: [PATCH 04/29] chore: add samtools to minimap2 image and convert to BAM --- docker/minimap2/Dockerfile | 8 ++++++++ docker/minimap2/package.json | 5 +++++ tools/minimap2.wdl | 16 +++++++++++----- 3 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 docker/minimap2/Dockerfile create mode 100644 docker/minimap2/package.json diff --git a/docker/minimap2/Dockerfile b/docker/minimap2/Dockerfile new file mode 100644 index 000000000..7fb04869b --- /dev/null +++ b/docker/minimap2/Dockerfile @@ -0,0 +1,8 @@ +FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools +FROM quay.io/biocontainers/minimap2:2.30--h577a1d6_0 + +COPY --from=samtools /usr/local/bin/ /usr/local/bin/ +COPY --from=samtools /usr/local/lib/ /usr/local/lib/ +COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/ + +ENTRYPOINT [ "minimap2" ] \ No newline at end of file diff --git a/docker/minimap2/package.json b/docker/minimap2/package.json new file mode 100644 index 000000000..a78b7377c --- /dev/null +++ b/docker/minimap2/package.json @@ -0,0 +1,5 @@ +{ + "name": "minimap2", + "version": "2.30", + "revision": "0" +} \ No newline at end of file diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index 262e4e324..5a300509f 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -34,7 +34,7 @@ task align { "ava-ont" ], } - output_paf: "If 
true, output in PAF format instead of SAM" + output_paf: "If true, output in PAF format instead of BAM" cigar_in_paf: "If true and outputting PAF, include CIGAR strings in the PAF output" ignore_base_quality: "If true, ignore base quality scores during alignment" output_md_tag: "If true, include MD tags in the SAM output" @@ -51,7 +51,7 @@ task align { String read_group File? read_two_fastq_gz String? preset = "sr" - String output_name = "aligned.sam" + String output_name = "aligned.bam" Boolean output_paf = false Boolean cigar_in_paf = true Boolean ignore_base_quality = false @@ -79,7 +79,13 @@ task align { "~{reference_index}" \ "~{read_one_fastq_gz}" \ ~{if defined(read_two_fastq_gz) then "\"~{read_two_fastq_gz}\"" else ""} \ - > "~{output_name}" + > output + + if ~{output_paf}; then + mv output "~{output_name}" + else + samtools view -b output > "~{output_name}" + fi >>> output { @@ -87,7 +93,7 @@ task align { } requirements { - container: "quay.io/biocontainers/minimap2:2.30--h577a1d6_0" + container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0" cpu: threads memory: "16 GB" } @@ -140,7 +146,7 @@ task index { } requirements { - container: "quay.io/biocontainers/minimap2:2.30--h577a1d6_0" + container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0" cpu: threads memory: "16 GB" } From 1b28075c8c8ebecf2eb5cbea2fc837d645eb6b9a Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Wed, 3 Dec 2025 14:29:48 -0500 Subject: [PATCH 05/29] chore: lint --- tools/minimap2.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index 5a300509f..58b0cf15b 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -10,10 +10,9 @@ task align { parameter_meta { read_one_fastq_gz: "Input gzipped FASTQ read one file to align with minimap2" - read_two_fastq_gz: "Input gzipped FASTQ read two file to align with minimap2" reference_index: "The minimap2 index file for the reference genome" read_group: "The read 
group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'" - output_name: "The name of the output alignment file" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with minimap2" preset: { description: "Minimap2 preset for alignment", external_help: "https://lh3.github.io/minimap2/minimap2.html#8", @@ -31,9 +30,10 @@ task align { "splice:hq", "splice:sr", "ava-pb", - "ava-ont" + "ava-ont", ], } + output_name: "The name of the output alignment file" output_paf: "If true, output in PAF format instead of BAM" cigar_in_paf: "If true and outputting PAF, include CIGAR strings in the PAF output" ignore_base_quality: "If true, ignore base quality scores during alignment" From 0ac46dbd6183ecd1ac2d986c2e112fd4ee1bd04a Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Wed, 3 Dec 2025 17:00:24 -0500 Subject: [PATCH 06/29] chore: add disk specification --- tools/minimap2.wdl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index 58b0cf15b..d953a5e39 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -43,6 +43,7 @@ task align { secondary_alignments: "If true, report secondary alignments" seed: "Seed value for the minimap2 aligner" threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" } input { @@ -61,8 +62,17 @@ task align { Boolean secondary_alignments = true Int seed = 11 Int threads = 3 + Int modify_disk_size_gb = 0 } + Int disk_size_gb = ceil( + ( + size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB") + ) * 2) + + ceil(size(reference_index, "GiB")) + + 10 + + modify_disk_size_gb + command <<< minimap2 \ ~{if defined(preset) then "-x \"~{preset}\"" else ""} \ @@ -96,6 +106,7 @@ task align { container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0" cpu: threads memory: "16 GB" + disks: "~{disk_size_gb} GB" } } @@ -114,6 +125,7 @@ task index { minimizer_kmer_size: "K-mer size for minimizer 
indexing" minimizer_window_size: "Window size for minimizer indexing" threads: "Number of threads to use for indexing" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" } input { @@ -123,8 +135,11 @@ task index { Int minimizer_kmer_size = 15 Int minimizer_window_size = 10 Int threads = 3 + Int modify_disk_size_gb = 0 } + Int disk_size_gb = ceil(size(reference_fasta, "GiB")) + 10 + modify_disk_size_gb + command <<< set -euo pipefail @@ -149,5 +164,6 @@ task index { container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0" cpu: threads memory: "16 GB" + disks: "~{disk_size_gb} GB" } } From 5161669f78a05a84231edb352bdaf07f2900039e Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Wed, 3 Dec 2025 17:02:24 -0500 Subject: [PATCH 07/29] feat: add bwa-mem2 task --- docker/bwamem2/Dockerfile | 8 +++ docker/bwamem2/package.json | 5 ++ tools/bwamem2.wdl | 127 ++++++++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 docker/bwamem2/Dockerfile create mode 100644 docker/bwamem2/package.json create mode 100644 tools/bwamem2.wdl diff --git a/docker/bwamem2/Dockerfile b/docker/bwamem2/Dockerfile new file mode 100644 index 000000000..a79bab258 --- /dev/null +++ b/docker/bwamem2/Dockerfile @@ -0,0 +1,8 @@ +FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools +FROM quay.io/biocontainers/bwa-mem2:2.3--he70b90d_0 + +COPY --from=samtools /usr/local/bin/ /usr/local/bin/ +COPY --from=samtools /usr/local/lib/ /usr/local/lib/ +COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/ + +ENTRYPOINT [ "bwa-mem2" ] \ No newline at end of file diff --git a/docker/bwamem2/package.json b/docker/bwamem2/package.json new file mode 100644 index 000000000..8a483d363 --- /dev/null +++ b/docker/bwamem2/package.json @@ -0,0 +1,5 @@ +{ + "name": "bwamem2", + "version": "2.3", + "revision": "0" +} \ No newline at end of file diff --git a/tools/bwamem2.wdl b/tools/bwamem2.wdl new file mode 100644 index 000000000..bdba1e258 --- 
/dev/null +++ b/tools/bwamem2.wdl @@ -0,0 +1,127 @@ +version 1.2 + +task align { + meta { + description: "Align DNA sequences against a large reference database using BWA-MEM2" + outputs: { + alignments: "The output alignment file in SAM format" + } + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with BWA-MEM2" + reference_index: "The BWA-MEM2 index file for the reference genome" + read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with BWA-MEM2" + prefix: "Prefix for the BAM file. The extension `.bam` will be added." + seed: "Seed value for the BWA-MEM2 aligner" + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File reference_index + String read_group + File? read_two_fastq_gz + String prefix = sub( + basename(read_one_fastq_gz), + "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$", + "" + ) + Int threads = 4 + Int modify_disk_size_gb = 0 + Int seed_length = 19 + Int min_score = 30 + Boolean smart_pairing = false + Boolean skip_mate_rescue = false + } + + String output_name = prefix + ".bam" + Int disk_size_gb = ceil(( + size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB") + ) * 2) + + ceil(size(reference_index, "GiB")) + + 10 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + mkdir bwa_db + tar -C bwa_db -xzf "~{reference_index}" --no-same-owner + PREFIX=$(basename bwa_db/*.ann ".ann") + + bwa-mem2 mem \ + -t ~{threads} \ + -R "~{read_group}" \ + -k ~{seed_length} \ + -T ~{min_score} \ + ~{if smart_pairing then "-p" else ""} \ + ~{if skip_mate_rescue then "-S" else ""} \ + bwa_db/"$PREFIX" \ + "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) then "~{read_two_fastq_gz}" else ""} | + samtools view -b -o "~{output_name}" - + >>> + + output { + File alignments = 
output_name + } + + requirements { + container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0" + cpu: threads + memory: "~{disk_size_gb * 2} GB" + disks: "~{disk_size_gb} GB" + } +} + +task index { + meta { + description: "Index a reference genome for alignment with BWA-MEM2" + outputs: { + reference_index: "The BWA-MEM2 index file for the reference genome" + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + db_name: "The base name for the output index files" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File reference_fasta + String db_name = "reference" + Int modify_disk_size_gb = 0 + } + + Float input_fasta_size = size(reference_fasta, "GiB") + Int disk_size_gb = ceil(input_fasta_size * 2) + 10 + modify_disk_size_gb + String bwa_db_out_name = db_name + ".tar.gz" + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + bwa-mem2 index \ + "$ref_fasta" + + tar -czf "~{bwa_db_out_name}" "$ref_fasta"* + >>> + + output { + File reference_index = bwa_db_out_name + } + + requirements { + container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0" + cpu: 1 + memory: "16 GB" + disks: "~{disk_size_gb} GB" + } +} From af994e2029101b4c977aaf7d26f31cf02fe8a1ae Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 4 Dec 2025 12:55:40 -0500 Subject: [PATCH 08/29] chore: lint --- tools/bwamem2.wdl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/bwamem2.wdl b/tools/bwamem2.wdl index bdba1e258..15a499fd2 100644 --- a/tools/bwamem2.wdl +++ b/tools/bwamem2.wdl @@ -14,9 +14,12 @@ task align { read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'" read_two_fastq_gz: "Input gzipped FASTQ read two file to align with BWA-MEM2" prefix: "Prefix for the BAM file. 
The extension `.bam` will be added." - seed: "Seed value for the BWA-MEM2 aligner" + smart_pairing: "If true, enable smart pairing mode for paired-end reads" + skip_mate_rescue: "If true, skip mate rescue for paired-end reads" threads: "Number of threads to use for alignment" modify_disk_size_gb: "Additional disk space to allocate (in GB)" + seed_length: "Minimum seed length for the BWA-MEM2 aligner" + min_score: "Minimum score threshold for reporting alignments" } input { @@ -29,12 +32,12 @@ task align { "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$", "" ) + Boolean smart_pairing = false + Boolean skip_mate_rescue = false Int threads = 4 Int modify_disk_size_gb = 0 Int seed_length = 19 Int min_score = 30 - Boolean smart_pairing = false - Boolean skip_mate_rescue = false } String output_name = prefix + ".bam" @@ -61,7 +64,7 @@ task align { ~{if skip_mate_rescue then "-S" else ""} \ bwa_db/"$PREFIX" \ "~{read_one_fastq_gz}" \ - ~{if defined(read_two_fastq_gz) then "~{read_two_fastq_gz}" else ""} | + ~{if defined(read_two_fastq_gz) then "\"~{read_two_fastq_gz}\"" else ""} | samtools view -b -o "~{output_name}" - >>> @@ -121,7 +124,7 @@ task index { requirements { container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0" cpu: 1 - memory: "16 GB" + memory: "120 GB" disks: "~{disk_size_gb} GB" } } From 00deed43a5564a4b7c1c1231a73e017df46a376c Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 4 Dec 2025 12:56:02 -0500 Subject: [PATCH 09/29] feat: add hisat2 task --- docker/hisat2/Dockerfile | 8 ++ docker/hisat2/package.json | 5 ++ tools/hisat2.wdl | 169 +++++++++++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+) create mode 100644 docker/hisat2/Dockerfile create mode 100644 docker/hisat2/package.json create mode 100644 tools/hisat2.wdl diff --git a/docker/hisat2/Dockerfile b/docker/hisat2/Dockerfile new file mode 100644 index 000000000..a57d0c75a --- /dev/null +++ b/docker/hisat2/Dockerfile @@ -0,0 +1,8 @@ +FROM 
quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools +FROM quay.io/biocontainers/hisat2:2.2.1--h503566f_8 + +COPY --from=samtools /usr/local/bin/ /usr/local/bin/ +COPY --from=samtools /usr/local/lib/ /usr/local/lib/ +COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/ + +ENTRYPOINT [ "hisat2" ] \ No newline at end of file diff --git a/docker/hisat2/package.json b/docker/hisat2/package.json new file mode 100644 index 000000000..e17679260 --- /dev/null +++ b/docker/hisat2/package.json @@ -0,0 +1,5 @@ +{ + "name": "hisat2", + "version": "2.2.1", + "revision": "0" +} \ No newline at end of file diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl new file mode 100644 index 000000000..bd06d2eaa --- /dev/null +++ b/tools/hisat2.wdl @@ -0,0 +1,169 @@ +version 1.2 + +task align { + meta { + description: "Align RNA-seq reads against a reference genome using HISAT2" + outputs: { + alignments: "The output alignment file in SAM format" + } + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with HISAT2" + reference_index: "The HISAT2 index files for the reference genome" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with HISAT2" + output_name: "The name of the output alignment file" + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File reference_index + File? 
read_two_fastq_gz + String output_name = "aligned.sam" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(( + size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB") + ) * 2) + + ceil(size(reference_index, "GiB")) + + 10 + + modify_disk_size_gb + + command <<< + hisat2 \ + -q \ + -p ~{threads} \ + -x "~{reference_index}" \ + -1 "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} \ + -S "~{output_name}" + >>> + + output { + File alignments = "~{output_name}" + } + + requirements { + cpu: threads + memory: "16 GB" + disks: "~{disk_size_gb} GB" + container: "ghcr.io/stjudecloud/hisat2:branch-minimap2-2.2.1-0" + } +} + +task index { + meta { + description: "Index a reference genome for alignment with HISAT2" + outputs: { + reference_index: "The HISAT2 index files for the reference genome" + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + snp: "List of SNPs" + haplotype: "List of haplotypes" + splice_site: "List of splice sites. Use with `exon`." + exon: "List of exons. Use with `splice_site`." + repeat_ref: "" + repeat_info: "" + repeat_snp: "" + repeat_haplotype: "" + bmax: "Maximum number of suffixes allowed in a block" + seed: "Seed for pseudo-random number generator" + bmaxdivn: "Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference" + index_base_name: "The base name for the output index files" + force_large_index: "Force creation of a large index" + disable_auto_fitting: "Disable automatic fitting of index parameters" + nodc: "Disable difference-cover sample" + no_ref: "Do not build bitpacked version of reference sequence for paired-end alignment" + just_ref: "Build only the bitpacked version of reference sequence for paired-end alignment" + threads: "Number of threads to use for indexing" + dcv: "Period for the difference-cover sample. 
A larger period uses less memory, but may be slower. Must be a power of 2, no greater than 4096." + offrate: "The off-rate for the FM index" + ftabchars: "The lookup table to calculate initial BW range with respect to the first N characters of the query" + localoffrate: "The off-rate for the local FM index" + localftabchars: "The lookup table to calculate initial BW range for the local FM" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File reference_fasta + File? snp + File? haplotype + File? splice_site + File? exon + File? repeat_ref + File? repeat_info + File? repeat_snp + File? repeat_haplotype + Int? bmax + Int? seed + Int? bmaxdivn = 4 + String index_base_name = "hisat2_index" + Boolean force_large_index = false + Boolean disable_auto_fitting = false + Boolean nodc = false + Boolean no_ref = false + Boolean just_ref = false + Int threads = 1 + Int dcv = 1024 + Int offrate = 5 + Int ftabchars = 10 + Int localoffrate = 3 + Int localftabchars = 6 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + 10 + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + hisat2-build \ + ~{if force_large_index then "--large-index" else ""} \ + ~{if disable_auto_fitting then "--disable-auto-fitting" else ""} \ + -p ~{threads} \ + ~{if defined(bmax) then "--bmax \"~{bmax}\"" else ""} \ + ~{if defined(bmaxdivn) then "--bmaxdivn \"~{bmaxdivn}\"" else ""} \ + ~{if !nodc then "--dcv \"~{dcv}\"" else ""} \ + ~{if no_ref then "--no-ref" else ""} \ + ~{if just_ref then "--just-ref" else ""} \ + --offrate "~{offrate}" \ + --ftabchars "~{ftabchars}" \ + --localoffrate "~{localoffrate}" \ + --localftabchars "~{localftabchars}" \ + ~{if defined(snp) then "--snp \"~{snp}\"" else ""} \ + ~{if defined(haplotype) then "--haplotype \"~{haplotype}\"" else ""} \ + 
~{if defined(splice_site) then "--ss \"~{splice_site}\"" else ""} \ + ~{if defined(exon) then "--exon \"~{exon}\"" else ""} \ + ~{if defined(repeat_ref) then "--repeat-ref \"~{repeat_ref}\"" else ""} \ + ~{if defined(repeat_info) then "--repeat-info \"~{repeat_info}\"" else ""} \ + ~{if defined(repeat_snp) then "--repeat-snp \"~{repeat_snp}\"" else ""} \ + ~{if defined(repeat_haplotype) then "--repeat-haplotype \"~{repeat_haplotype}\"" else ""} \ + ~{if defined(seed) then "--seed \"~{seed}\"" else ""} \ + "$ref_fasta" \ + "~{index_base_name}" + + tar -czf "~{index_base_name}.tar.gz" "~{index_base_name}"* + >>> + + output { + File reference_index = "~{index_base_name}.tar.gz" + } + + requirements { + cpu: threads + memory: "16 GB" + disks: "~{disk_size_gb} GB" + container: "ghcr.io/stjudecloud/hisat2:branch-minimap2-2.2.1-0" + } +} From 443466617858f7359ba329a618133a1513d74125 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 4 Dec 2025 13:56:15 -0500 Subject: [PATCH 10/29] chore: change base image as other segfaults --- docker/hisat2/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/hisat2/Dockerfile b/docker/hisat2/Dockerfile index a57d0c75a..85b3898dd 100644 --- a/docker/hisat2/Dockerfile +++ b/docker/hisat2/Dockerfile @@ -1,5 +1,5 @@ FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools -FROM quay.io/biocontainers/hisat2:2.2.1--h503566f_8 +FROM quay.io/biocontainers/hisat2:2.2.1--hdbdd923_7 COPY --from=samtools /usr/local/bin/ /usr/local/bin/ COPY --from=samtools /usr/local/lib/ /usr/local/lib/ From b07d7690d6f2de6903dc92b96ed520c824d77bb6 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 5 Dec 2025 11:47:35 -0500 Subject: [PATCH 11/29] feat: add `vg` indexing --- tools/vg.wdl | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tools/vg.wdl diff --git a/tools/vg.wdl b/tools/vg.wdl new file mode 100644 index 000000000..f002533ce --- /dev/null +++ 
b/tools/vg.wdl @@ -0,0 +1,75 @@ +version 1.2 + +task index { + meta { + description: "Index a reference genome for alignment with vg giraffe" + outputs: { + reference_index: "The vg giraffe index file for the reference genome" + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + vcf_files: "VCF(s) containing variants to augment the graph" + transcript_gff: "GFF(s) containing transcript annotations" + db_prefix: "The base name for the output index files" + gff_feature: "The feature type in the GFF to use for transcripts" + gff_id_tag: "The attribute tag in the GFF to use as transcript ID" + workflow: { + description: "The vg autoindex workflow to use", + choices: [ + "map", + "mpmap", + "rpvg", + "giraffe", + "sr-giraffe", + "lr-giraffe", + ], + } + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + threads: "Number of threads to use for indexing" + } + + input { + File reference_fasta + Array[File] vcf_files = [] + Array[File] transcript_gff = [] + String db_prefix = "reference" + String gff_feature = "exon" + String gff_id_tag = "transcript_id" + String workflow = "giraffe" + Int modify_disk_size_gb = 0 + Int threads = 4 + } + + Float input_fasta_size = size(reference_fasta, "GiB") + Float vcf_size = size(vcf_files, "GiB") + Float transcript_gff_size = size(transcript_gff, "GiB") + Int disk_size_gb = ceil(input_fasta_size * 2) + + ceil(vcf_size * 2) + + ceil(transcript_gff_size * 2) + + 10 + modify_disk_size_gb + + command <<< + vg autoindex \ + --workflow "~{workflow}" \ + -r "~{reference_fasta}" \ + -p "~{db_prefix}" \ + ~{sep(" ", prefix("-v ", quote(vcf_files)))} \ + ~{sep(" ", prefix("-x ", quote(transcript_gff)))} \ + -t ~{threads} \ + --gff-feature "~{gff_feature}" \ + --gff-tx-tag "~{gff_id_tag}" + >>> + + output { + Array[File] reference_index = glob("~{db_prefix}*") + } + + requirements { + container: "quay.io/biocontainers/vg:1.70.0--h9ee0642_0" + cpu: threads + memory: "120 GB" + disks: 
"~{disk_size_gb} GB" + } +} From cb47772356a169eebcfbc278948428cf1c6ececf Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 5 Dec 2025 12:41:18 -0500 Subject: [PATCH 12/29] chore: localize fasta for indexing --- tools/vg.wdl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/vg.wdl b/tools/vg.wdl index f002533ce..6407a79a9 100644 --- a/tools/vg.wdl +++ b/tools/vg.wdl @@ -51,9 +51,15 @@ task index { + 10 + modify_disk_size_gb command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + vg autoindex \ --workflow "~{workflow}" \ - -r "~{reference_fasta}" \ + -r "$ref_fasta" \ -p "~{db_prefix}" \ ~{sep(" ", prefix("-v ", quote(vcf_files)))} \ ~{sep(" ", prefix("-x ", quote(transcript_gff)))} \ From 39ca901c99e728c4da2ffdf60029dbdd220aecfb Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 5 Dec 2025 13:54:52 -0500 Subject: [PATCH 13/29] feat: add vg giraffe task --- tools/vg.wdl | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/tools/vg.wdl b/tools/vg.wdl index 6407a79a9..86f2a95f2 100644 --- a/tools/vg.wdl +++ b/tools/vg.wdl @@ -1,5 +1,110 @@ version 1.2 +task giraffe { + meta { + description: "Align DNA sequences against a variation graph using vg giraffe" + outputs: { + alignments: "The output alignment file in GAM format" + } + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with vg giraffe" + gbz_graph: "The vg GBZ graph file for the reference genome" + minimizer_index: "The vg minimizer index file for the reference genome" + zipcode_name: "The vg zipcode name file for the reference genome" + distance_index: "The vg distance index file for the reference genome" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with vg giraffe" + haploytype: "The haplotype information file" + kff: "The KFF file 
containing kmer counts" + sample_name: "The sample name to include" + read_group: "The read group" + output_name: "The name of the output alignment file" + output_format: { + description: "The output format for alignments", + options: [ + "gam", + "gaf", + "json", + "tsv", + "SAM", + "BAM", + "CRAM", + ], + } + preset: { + description: "vg giraffe preset for alignment", + options: [ + "chaining-sr", + "default", + "fast", + "hifi", + "r10", + "srold", + ], + } + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File gbz_graph + File minimizer_index + File zipcode_name + File distance_index + File? read_two_fastq_gz + File? haploytype + File? kff + String? sample_name + String? read_group + String output_name = "aligned.bam" + String output_format = "BAM" + String preset = "default" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(( + size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB") + ) * 2) + + ceil(size(gbz_graph, "GiB")) + + ceil(size(minimizer_index, "GiB")) + + ceil(size(distance_index, "GiB")) + + ceil(size(zipcode_name, "GiB")) + + 10 + + modify_disk_size_gb + + command <<< + vg giraffe \ + -t ~{threads} \ + -Z "~{gbz_graph}" \ + -m "~{minimizer_index}" \ + -d "~{distance_index}" \ + -z "~{zipcode_name}" \ + -f "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) then "-f \"~{read_two_fastq_gz}\"" else ""} \ + -o "~{output_format}" \ + ~{if defined(sample_name) then "--sample \"~{sample_name}\"" else ""} \ + ~{if defined(read_group) then "--read-group \"~{read_group}\"" else ""} \ + ~{if defined(haploytype) then "--haplotype-name \"~{haploytype}\"" else ""} \ + ~{if defined(kff) then "--kff-name \"~{kff}\"" else ""} \ + --parameter-preset "~{preset}" \ + > "~{output_name}" + >>> + + output { + File alignments = "~{output_name}" + } + + requirements { + container: 
"quay.io/biocontainers/vg:1.70.0--h9ee0642_0" + cpu: threads + memory: "120 GB" + disks: "~{disk_size_gb} GB" + } +} + task index { meta { description: "Index a reference genome for alignment with vg giraffe" From a921e17f160ec24c8bbb432cf082c2532fb1b914 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Tue, 9 Dec 2025 14:23:38 -0600 Subject: [PATCH 14/29] chore: avoid writing intermediate SAM to disk --- tools/minimap2.wdl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index d953a5e39..3a8dd364d 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -89,12 +89,10 @@ task align { "~{reference_index}" \ "~{read_one_fastq_gz}" \ ~{if defined(read_two_fastq_gz) then "\"~{read_two_fastq_gz}\"" else ""} \ - > output - - if ~{output_paf}; then - mv output "~{output_name}" + | if ~{output_paf}; then + cat - > "~{output_name}" else - samtools view -b output > "~{output_name}" + samtools view -b - > "~{output_name}" fi >>> From 2dc478a40cc8b05a350767768c6d555ba57d4db0 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 12 Dec 2025 09:33:30 -0500 Subject: [PATCH 15/29] chore: use database prefix --- tools/hisat2.wdl | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl index bd06d2eaa..e61c75439 100644 --- a/tools/hisat2.wdl +++ b/tools/hisat2.wdl @@ -34,13 +34,21 @@ task align { + modify_disk_size_gb command <<< + set -euo pipefail + + mkdir hisat2_db + tar -C hisat2_db -xzf "~{hisat2_db_tar_gz}" --no-same-owner + PREFIX=$(basename hisat2_db/*.1.ht2 ".1.ht2") + hisat2 \ -q \ -p ~{threads} \ - -x "~{reference_index}" \ + -S "~{output_name}" \ + -x "$PREFIX" \ -1 "~{read_one_fastq_gz}" \ - ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} \ - -S "~{output_name}" + ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} + + rm -r hisat2_db >>> output { From 
1c832fcc43fd2b078ca6d949c8c92d5eed0d4f7c Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 12 Dec 2025 13:23:10 -0600 Subject: [PATCH 16/29] chore: bump resources for azure --- tools/hisat2.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl index e61c75439..56d143556 100644 --- a/tools/hisat2.wdl +++ b/tools/hisat2.wdl @@ -29,7 +29,7 @@ task align { Int disk_size_gb = ceil(( size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB") ) * 2) - + ceil(size(reference_index, "GiB")) + + ceil(size(reference_index, "GiB") * 5) + 10 + modify_disk_size_gb @@ -37,14 +37,14 @@ task align { set -euo pipefail mkdir hisat2_db - tar -C hisat2_db -xzf "~{hisat2_db_tar_gz}" --no-same-owner + tar -C hisat2_db -xzf "~{reference_index}" --no-same-owner PREFIX=$(basename hisat2_db/*.1.ht2 ".1.ht2") hisat2 \ -q \ -p ~{threads} \ -S "~{output_name}" \ - -x "$PREFIX" \ + -x "hisat2_db/$PREFIX" \ -1 "~{read_one_fastq_gz}" \ ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} @@ -57,7 +57,7 @@ task align { requirements { cpu: threads - memory: "16 GB" + memory: "64 GB" disks: "~{disk_size_gb} GB" container: "ghcr.io/stjudecloud/hisat2:branch-minimap2-2.2.1-0" } From bd841324406b61c7fd5ddf842c8228496a17d943 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 12 Dec 2025 14:28:50 -0500 Subject: [PATCH 17/29] chore: format+lint --- tools/hisat2.wdl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl index 56d143556..a049ec07e 100644 --- a/tools/hisat2.wdl +++ b/tools/hisat2.wdl @@ -4,7 +4,7 @@ task align { meta { description: "Align RNA-seq reads against a reference genome using HISAT2" outputs: { - alignments: "The output alignment file in SAM format" + alignments: "The output alignment file in SAM format", } } @@ -67,7 +67,7 @@ task index { meta { description: "Index a reference genome for alignment with HISAT2" outputs: { - 
reference_index: "The HISAT2 index files for the reference genome" + reference_index: "The HISAT2 index files for the reference genome", } } @@ -156,7 +156,11 @@ task index { ~{if defined(repeat_ref) then "--repeat-ref \"~{repeat_ref}\"" else ""} \ ~{if defined(repeat_info) then "--repeat-info \"~{repeat_info}\"" else ""} \ ~{if defined(repeat_snp) then "--repeat-snp \"~{repeat_snp}\"" else ""} \ - ~{if defined(repeat_haplotype) then "--repeat-haplotype \"~{repeat_haplotype}\"" else ""} \ + ~{( + if defined(repeat_haplotype) + then "--repeat-haplotype \"~{repeat_haplotype}\"" + else "" + )} \ ~{if defined(seed) then "--seed \"~{seed}\"" else ""} \ "$ref_fasta" \ "~{index_base_name}" From 8abaf9c35b7d2a81537ef0e411d35fd5ea4d9a52 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 18 Dec 2025 08:09:43 -0600 Subject: [PATCH 18/29] chore: remove memory oversubscribe --- tools/bwamem2.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bwamem2.wdl b/tools/bwamem2.wdl index 15a499fd2..c34634a73 100644 --- a/tools/bwamem2.wdl +++ b/tools/bwamem2.wdl @@ -75,7 +75,7 @@ task align { requirements { container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0" cpu: threads - memory: "~{disk_size_gb * 2} GB" + memory: "64 GB" disks: "~{disk_size_gb} GB" } } From d42fbfb0e14867b77939912ba268ddcfb10ea8d3 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 18 Dec 2025 13:52:47 -0500 Subject: [PATCH 19/29] feat: add strelka and manta wrappers --- tools/manta.wdl | 125 ++++++++++++++++++++++++++++++++++++++ tools/strelka.wdl | 151 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 tools/manta.wdl create mode 100644 tools/strelka.wdl diff --git a/tools/manta.wdl b/tools/manta.wdl new file mode 100644 index 000000000..29946b749 --- /dev/null +++ b/tools/manta.wdl @@ -0,0 +1,125 @@ +version 1.2 + +task manta_germline { + meta { + description: "Run Manta structural variant and indel caller" + outputs: 
{ + manta_output: "Directory containing Manta variant calls", + log_file: "Log file from the Manta workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + bam: "Input BAM file with aligned reads" + output_dir: "Directory to store Manta output" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File bam + String output_dir = "manta_output" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(bam, "GiB")) + + 20 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + configManta.py \ + --bam "~{bam}" \ + --referenceFasta "$ref_fasta" \ + --runDir "~{output_dir}" + + "~{output_dir}/runWorkflow.py" -j "~{threads}" + + rm -rf "$ref_fasta" + >>> + + output { + Directory manta_output = output_dir + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/manta:1.6.0--py27h9948957_6" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} + +task manta_somatic { + meta { + description: "Run Manta structural variant and indel caller in somatic mode" + outputs: { + manta_output: "Directory containing Manta variant calls", + log_file: "Log file from the Manta workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + tumor_bam: "Input BAM file with aligned reads from tumor sample" + normal_bam: "Input BAM file with aligned reads from normal sample" + output_dir: "Directory to store Manta output" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File tumor_bam + File normal_bam + String output_dir = "manta_output" +
Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(tumor_bam, "GiB")) + + ceil(size(normal_bam, "GiB")) + + 20 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + configManta.py \ + --normalBam "~{normal_bam}" \ + --tumorBam "~{tumor_bam}" \ + --referenceFasta "$ref_fasta" \ + --runDir "~{output_dir}" + + "~{output_dir}/runWorkflow.py" -j "~{threads}" + + rm -rf "$ref_fasta" + >>> + + output { + Directory manta_output = output_dir + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/manta:1.6.0--py27h9948957_6" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/tools/strelka.wdl b/tools/strelka.wdl new file mode 100644 index 000000000..b986693a4 --- /dev/null +++ b/tools/strelka.wdl @@ -0,0 +1,151 @@ +version 1.2 + +task somatic { + meta { + description: "Run Strelka somatic variant calling workflow" + outputs: { + strelka_output: "Directory containing Strelka somatic variant calls", + log_file: "Log file from the Strelka workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + normal_bam: "Input BAM file with aligned reads for normal sample" + tumor_bam: "Input BAM file with aligned reads for tumor sample" + indel_candidates: "Optional VCF file with candidate indels, recommended to be generated by Manta" + output_dir: "Directory to store Strelka output" + exome: "Boolean indicating if the data is exome sequencing" + rna: "Boolean indicating if the data is RNA sequencing" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File normal_bam + File tumor_bam + File? 
indel_candidates + String output_dir = "strelka_somatic_output" + Boolean exome = false + Boolean rna = false + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(normal_bam, "GiB")) + + ceil(size(tumor_bam, "GiB")) + + ( + if defined(indel_candidates) + then ceil(size(indel_candidates, "GiB")) + else 0 + ) + + 20 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + configureStrelkaSomaticWorkflow.py \ + --referenceFasta "$ref_fasta" \ + --runDir "~{output_dir}" \ + --tumorBam "~{tumor_bam}" \ + --normalBam "~{normal_bam}" \ + ~{if (exome) then "--exome" else ""} \ + ~{if (rna) then "--rna" else ""} \ + ~{( + if (defined(indel_candidates)) + then "--indelCandidates '~{indel_candidates}'" + else "" + )} + + + "~{output_dir}/runWorkflow.py" -m local -j ~{threads} + + rm -rf "$ref_fasta" + >>> + + output { + Directory strelka_output = output_dir + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/strelka:2.9.10--hdfd78af_2" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} + +task germline { + meta { + description: "Run Strelka germline variant calling workflow" + outputs: { + strelka_output: "Directory containing Strelka germline variant calls", + log_file: "Log file from the Strelka workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + bam: "Input BAM file with aligned reads" + output_dir: "Directory to store Strelka output" + exome: "Boolean indicating if the data is exome sequencing" + rna: "Boolean indicating if the data is RNA sequencing" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + 
File bam + String output_dir = "strelka_germline_output" + Boolean exome = false + Boolean rna = false + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(bam, "GiB")) + + 20 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + configureStrelkaGermlineWorkflow.py \ + --referenceFasta "$ref_fasta" \ + --runDir "~{output_dir}" \ + --bam "~{bam}" \ + ~{if (exome) then "--exome" else ""} \ + ~{if (rna) then "--rna" else ""} + + + "~{output_dir}/runWorkflow.py" -m local -j ~{threads} + + rm -rf "$ref_fasta" + >>> + + output { + Directory strelka_output = output_dir + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/strelka:2.9.10--hdfd78af_2" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} From 589190dc9061fa72bedf9aa90b5832904038ce52 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 18 Dec 2025 14:25:59 -0500 Subject: [PATCH 20/29] feat: add clair3 wrapper --- tools/clair3.wdl | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tools/clair3.wdl diff --git a/tools/clair3.wdl b/tools/clair3.wdl new file mode 100644 index 000000000..6b23f1de7 --- /dev/null +++ b/tools/clair3.wdl @@ -0,0 +1,90 @@ +version 1.2 + +task clair3 { + meta { + description: "Run Clair3 variant caller for small variants using deep neural networks" + outputs: { + pileup_vcf: "VCF file with variants called using pileup model", + full_alignment_vcf: "VCF file with variants called using full-alignment model", + merged_vcf: "Final merged VCF file with variants from both models", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + bam: "Input BAM file with aligned reads" + 
model: "Pre-trained Clair3 model to use for variant calling" + bed_regions: "Optional BED file specifying regions to call variants in" + vcf_candidates: "Optional VCF file with candidate variants to consider" + output_dir: "Directory to store Clair3 output" + platform: { + description: "Sequencing platform used to generate the reads", + choices: [ + "ont", + "hifi", + "ilmn", + ], + } + all_contigs: "Boolean indicating whether to include all contigs in variant calling. If false only chr{1..22,X,Y} are called." + print_ref_calls: "Boolean indicating whether to print reference calls in the output VCF" + gvcf: "Boolean indicating whether to output gVCF format" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File bam + File model + File? bed_regions + File? vcf_candidates + String output_dir = "clair3_output" + String platform = "ilmn" + Boolean all_contigs = false + Boolean print_ref_calls = false + Boolean gvcf = false + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(bam, "GiB")) + + 20 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + ./run_clair3.sh \ + --bam_fn="~{bam}" \ + --ref_fn="$ref_fasta" \ + --threads="~{threads}" \ + --platform="~{platform}" \ + --model_path="~{model}" \ + --output="~{output_dir}" \ + ~{if all_contigs then "--include_all_ctgs" else ""} \ + ~{if print_ref_calls then "--print_ref_calls" else ""} \ + ~{if defined(bed_regions) then "--bed_fn='~{bed_regions}'" else ""} \ + ~{if defined(vcf_candidates) then "--vcf_fn='~{vcf_candidates}'" else ""} \ + ~{if gvcf then "--gvcf" else ""} + + rm -rf "$ref_fasta" + >>> + + output { + File pileup_vcf = "~{output_dir}/pileup.vcf.gz" + File full_alignment_vcf = 
"~{output_dir}/full_alignment.vcf.gz" + File merged_vcf = "~{output_dir}/merge_output.vcf.gz" + } + + requirements { + container: "quay.io/biocontainers/clair3:1.2.0--py310h779eee5_0" + cpu: threads + memory: "16 GB" + disks: "~{disk_size_gb} GB" + } +} From efb1d50cc9d9542d309795e79b7d471ef51f5dfc Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 18 Dec 2025 14:51:13 -0500 Subject: [PATCH 21/29] chore: fix invocation --- tools/{clair3.wdl => clair.wdl} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tools/{clair3.wdl => clair.wdl} (99%) diff --git a/tools/clair3.wdl b/tools/clair.wdl similarity index 99% rename from tools/clair3.wdl rename to tools/clair.wdl index 6b23f1de7..20225f61c 100644 --- a/tools/clair3.wdl +++ b/tools/clair.wdl @@ -59,7 +59,7 @@ task clair3 { gunzip -c "~{reference_fasta}" > "$ref_fasta" \ || ln -sf "~{reference_fasta}" "$ref_fasta" - ./run_clair3.sh \ + run_clair3.sh \ --bam_fn="~{bam}" \ --ref_fn="$ref_fasta" \ --threads="~{threads}" \ From 25681c72a9546488bed2848dedb3a8178f2126c6 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 18 Dec 2025 15:48:41 -0500 Subject: [PATCH 22/29] feat: add NGSEP wrapper --- docker/ngsep/Dockerfile | 5 ++++ docker/ngsep/package.json | 5 ++++ tools/ngsep.wdl | 59 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 docker/ngsep/Dockerfile create mode 100644 docker/ngsep/package.json create mode 100644 tools/ngsep.wdl diff --git a/docker/ngsep/Dockerfile b/docker/ngsep/Dockerfile new file mode 100644 index 000000000..d7c3ada65 --- /dev/null +++ b/docker/ngsep/Dockerfile @@ -0,0 +1,5 @@ +FROM eclipse-temurin:8 + +RUN wget https://github.com/NGSEP/NGSEPcore/releases/download/v5.1.0/NGSEPcore_5.1.0.jar -O /usr/local/bin/NGSEPcore.jar + +ENTRYPOINT [ "java", "-jar", "/usr/local/bin/NGSEPcore.jar" ] \ No newline at end of file diff --git a/docker/ngsep/package.json b/docker/ngsep/package.json new file mode 100644 index 000000000..19a575139 --- 
/dev/null +++ b/docker/ngsep/package.json @@ -0,0 +1,5 @@ +{ + "name": "ngsep", + "version": "5.1.0", + "revision": "0" +} diff --git a/tools/ngsep.wdl b/tools/ngsep.wdl new file mode 100644 index 000000000..bf93f9472 --- /dev/null +++ b/tools/ngsep.wdl @@ -0,0 +1,59 @@ +version 1.2 + +task germline_variant { + meta { + description: "Call germline variants using NGSEP" + outputs: { + vcf_output: "VCF file containing called germline variants" + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + bam: "Input BAM file with aligned reads" + output_prefix: "Prefix for the output file with called variants" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File bam + String output_prefix = "ngsep_germline_output" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(bam, "GiB")) + + 20 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + java -Xmx16g -jar /usr/local/bin/NGSEPcore.jar \ + SingleSampleVariantsDetector \ + -r "$ref_fasta" \ + -i "~{bam}" \ + -o "~{output_prefix}" \ + -t "~{threads}" + + rm -rf "$ref_fasta" + >>> + + output { + Array[File] vcf_output = glob("~{output_prefix}*") + } + + requirements { + container: "ghcr.io/stjude/ngsep:5.1.0-0" + cpu: threads + memory: "20 GB" + disks: "~{disk_size_gb} GB" + } +} From 1083326853229804e8d2675c0f353a0800c412de Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 18 Dec 2025 17:27:15 -0500 Subject: [PATCH 23/29] feat: add deepsomatic and deepvariant wrappers with GPU support --- tools/deepvariant.wdl | 216 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 tools/deepvariant.wdl diff --git a/tools/deepvariant.wdl 
b/tools/deepvariant.wdl new file mode 100644 index 000000000..c5c1a3ad9 --- /dev/null +++ b/tools/deepvariant.wdl @@ -0,0 +1,216 @@ +version 1.2 + +task deepsomatic { + meta { + description: "Call variants using DeepSomatic" + outputs: { + vcf_output: "VCF file containing called somatic variants", + gvcf_output: "gVCF file containing called somatic variants", + runtime: "Optional HTML report of runtime metrics", + vcf_stats: "Optional HTML report of VCF statistics", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + tumor_bam: "Input BAM file with aligned reads for tumor sample" + normal_bam: "Input BAM file with aligned reads for normal sample" + output_prefix: "Prefix for output VCF and gVCF files" + tumor_sample_name: "Sample name for the tumor sample" + normal_sample_name: "Sample name for the normal sample" + model_type: { + description: "Type of model to use for variant calling", + choices: [ + "WGS", + "WES", + "PACBIO", + "ONT", + "FFPE_WGS", + "FFPE_WES", + "FFPE_WGS_TUMOR_ONLY", + "FFPE_WES_TUMOR_ONLY", + "WGS_TUMOR_ONLY", + "WES_TUMOR_ONLY", + "PACBIO_TUMOR_ONLY", + "ONT_TUMOR_ONLY", + ], + } + runtime_report: "Output make_examples_somatic runtime metrics and create a visual runtime report using runtime_by_region_vis." + vcf_stats_report: "Output a visual report (HTML) of statistics about the output VCF." 
+ threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File tumor_bam + File normal_bam + String output_prefix = "deepsomatic_output" + String tumor_sample_name = "tumor" + String normal_sample_name = "normal" + String model_type = "WGS" + Boolean runtime_report = false + Boolean vcf_stats_report = false + Int threads = 8 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(tumor_bam, "GiB")) + + ceil(size(normal_bam, "GiB")) + + 50 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + run_deepsomatic \ + --model_type="~{model_type}" \ + --ref="$ref_fasta" \ + --tumor_bam="~{tumor_bam}" \ + --normal_bam="~{normal_bam}" \ + --output_vcf="~{output_prefix}.vcf.gz" \ + --output_gvcf="~{output_prefix}.g.vcf.gz" \ + --tumor_sample_name="~{tumor_sample_name}" \ + --normal_sample_name="~{normal_sample_name}" \ + --num_shards="~{threads}" \ + --logging_dir="logs" \ + --intermediate_results_dir="intermediate_results" \ + ~{if runtime_report then "--runtime_report" else ""} \ + ~{if vcf_stats_report then "--vcf_stats_report" else ""} + + + rm -rf "$ref_fasta" + + >>> + + output { + File vcf_output = "~{output_prefix}.vcf.gz" + File gvcf_output = "~{output_prefix}.g.vcf.gz" + File? runtime = "logs/runtime_by_region_vis.html" + File? 
vcf_stats = "logs/vcf_stats_report.html" + } + + requirements { + container: "google/deepsomatic:1.9.0-gpu" + cpu: threads + memory: "32 GB" + disks: "~{disk_size_gb} GB" + gpu: true + } + + hints { + gpu: 1 + } +} + +task deepvariant { + meta { + description: "Call variants using DeepVariant" + outputs: { + vcf_output: "VCF file containing called variants", + gvcf_output: "gVCF file containing called variants", + runtime: "Optional HTML report of runtime metrics", + vcf_stats: "Optional HTML report of VCF statistics", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + bam: "Input BAM file with aligned reads for sample" + haploid_chromosomes: "List of chromosomes to be treated as haploid during variant calling" + output_prefix: "Prefix for output VCF and gVCF files" + model_type: { + description: "Type of model to use for variant calling", + choices: [ + "WGS", + "WES", + "PACBIO", + "ONT", + "FFPE_WGS", + "FFPE_WES", + "FFPE_WGS_TUMOR_ONLY", + "FFPE_WES_TUMOR_ONLY", + "WGS_TUMOR_ONLY", + "WES_TUMOR_ONLY", + "PACBIO_TUMOR_ONLY", + "ONT_TUMOR_ONLY", + ], + } + runtime_report: "Output make_examples_somatic runtime metrics and create a visual runtime report using runtime_by_region_vis." + vcf_stats_report: "Output a visual report (HTML) of statistics about the output VCF." 
+ threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File bam + Array[String] haploid_chromosomes = ["chrX", "chrY"] + String output_prefix = "deepsomatic_output" + String model_type = "WGS" + Boolean runtime_report = false + Boolean vcf_stats_report = false + Int threads = 8 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + + ceil(size(bam, "GiB")) + + 50 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + run_deepvariant \ + --model_type="~{model_type}" \ + --ref="$ref_fasta" \ + --reads="~{bam}" \ + --output_vcf="~{output_prefix}.vcf.gz" \ + --output_gvcf="~{output_prefix}.g.vcf.gz" \ + --num_shards="~{threads}" \ + --logging_dir="logs" \ + --intermediate_results_dir="intermediate_results" \ + ~{if runtime_report then "--runtime_report" else ""} \ + ~{if vcf_stats_report then "--vcf_stats_report" else ""} \ + --haploid_contigs="~{sep(",", haploid_chromosomes)}" + + + rm -rf "$ref_fasta" + + >>> + + output { + File vcf_output = "~{output_prefix}.vcf.gz" + File gvcf_output = "~{output_prefix}.g.vcf.gz" + File? runtime = "logs/runtime_by_region_vis.html" + File? 
vcf_stats = "logs/vcf_stats_report.html" + } + + requirements { + container: "google/deepvariant:1.9.0-gpu" + cpu: threads + memory: "32 GB" + disks: "~{disk_size_gb} GB" + gpu: true + } + + hints { + gpu: 1 + } +} From 30d11593c4ecf571ce4fb9b07bf03b1367e4cd76 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 19 Dec 2025 12:45:46 -0500 Subject: [PATCH 24/29] chore: change hisat2 output to BAM --- tools/hisat2.wdl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl index a049ec07e..9e9dcb250 100644 --- a/tools/hisat2.wdl +++ b/tools/hisat2.wdl @@ -21,7 +21,7 @@ task align { File read_one_fastq_gz File reference_index File? read_two_fastq_gz - String output_name = "aligned.sam" + String output_name = "aligned.bam" Int threads = 4 Int modify_disk_size_gb = 0 } @@ -40,13 +40,15 @@ task align { tar -C hisat2_db -xzf "~{reference_index}" --no-same-owner PREFIX=$(basename hisat2_db/*.1.ht2 ".1.ht2") + mkfifo hisat2_stdout_pipe hisat2 \ -q \ -p ~{threads} \ - -S "~{output_name}" \ + -S hisat2_stdout_pipe \ -x "hisat2_db/$PREFIX" \ -1 "~{read_one_fastq_gz}" \ - ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} + ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} & + samtools view -bS hisat2_stdout_pipe > "~{output_name}" rm -r hisat2_db >>> From e58c2edecbe0c6b65cca1ced3ff650a32d4cf39d Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Fri, 19 Dec 2025 12:47:43 -0500 Subject: [PATCH 25/29] chore: write to stdout instead of fifo --- tools/hisat2.wdl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl index 9e9dcb250..aa3077859 100644 --- a/tools/hisat2.wdl +++ b/tools/hisat2.wdl @@ -40,15 +40,13 @@ task align { tar -C hisat2_db -xzf "~{reference_index}" --no-same-owner PREFIX=$(basename hisat2_db/*.1.ht2 ".1.ht2") - mkfifo hisat2_stdout_pipe hisat2 \ -q \ -p ~{threads} \ - -S hisat2_stdout_pipe \ -x 
"hisat2_db/$PREFIX" \ -1 "~{read_one_fastq_gz}" \ - ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} & - samtools view -bS hisat2_stdout_pipe > "~{output_name}" + ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} \ + | samtools view -bS - > "~{output_name}" rm -r hisat2_db >>> From e680a5b4248e69ecfe9d58145d0dbef7489df3d0 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Mon, 22 Dec 2025 16:07:33 -0500 Subject: [PATCH 26/29] chore: add undocumented FAI requirement --- tools/strelka.wdl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/strelka.wdl b/tools/strelka.wdl index b986693a4..73b0236b0 100644 --- a/tools/strelka.wdl +++ b/tools/strelka.wdl @@ -11,6 +11,7 @@ task somatic { parameter_meta { reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" normal_bam: "Input BAM file with aligned reads for normal sample" tumor_bam: "Input BAM file with aligned reads for tumor sample" indel_candidates: "Optional VCF file with candidate indels, recommended to be generated by Manta" @@ -23,6 +24,7 @@ task somatic { input { File reference_fasta + File reference_fasta_index File normal_bam File tumor_bam File? 
indel_candidates @@ -50,6 +52,7 @@ task somatic { ref_fasta=~{basename(reference_fasta, ".gz")} gunzip -c "~{reference_fasta}" > "$ref_fasta" \ || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" configureStrelkaSomaticWorkflow.py \ --referenceFasta "$ref_fasta" \ @@ -94,6 +97,7 @@ task germline { parameter_meta { reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" bam: "Input BAM file with aligned reads" output_dir: "Directory to store Strelka output" exome: "Boolean indicating if the data is exome sequencing" @@ -104,6 +108,7 @@ task germline { input { File reference_fasta + File reference_fasta_index File bam String output_dir = "strelka_germline_output" Boolean exome = false @@ -123,6 +128,7 @@ task germline { ref_fasta=~{basename(reference_fasta, ".gz")} gunzip -c "~{reference_fasta}" > "$ref_fasta" \ || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" configureStrelkaGermlineWorkflow.py \ --referenceFasta "$ref_fasta" \ From 60dce466b9861c5cfb5851852a4b0281af1d28c6 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Tue, 23 Dec 2025 09:33:09 -0500 Subject: [PATCH 27/29] chore: add error checking to minimap2 --- tools/minimap2.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index 3a8dd364d..d2118a630 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -74,6 +74,8 @@ task align { + modify_disk_size_gb command <<< + set -euo pipefail + minimap2 \ ~{if defined(preset) then "-x \"~{preset}\"" else ""} \ ~{if output_paf then "" else "-a"} \ From f614485d9a24a43467fddc588d9a25d680427262 Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Thu, 15 Jan 2026 11:55:48 -0500 Subject: [PATCH 28/29] chore: cleanup reference files --- tools/bwamem2.wdl | 4 ++++ tools/deepvariant.wdl | 1 - tools/hisat2.wdl | 2 ++ tools/minimap2.wdl | 4 +++- 4 files changed, 9 
insertions(+), 2 deletions(-) diff --git a/tools/bwamem2.wdl b/tools/bwamem2.wdl index c34634a73..765fc44a5 100644 --- a/tools/bwamem2.wdl +++ b/tools/bwamem2.wdl @@ -66,6 +66,8 @@ task align { "~{read_one_fastq_gz}" \ ~{if defined(read_two_fastq_gz) then "\"~{read_two_fastq_gz}\"" else ""} | samtools view -b -o "~{output_name}" - + + rm -r bwa_db >>> output { @@ -115,6 +117,8 @@ task index { "$ref_fasta" tar -czf "~{bwa_db_out_name}" "$ref_fasta"* + + rm -r "$ref_fasta" >>> output { diff --git a/tools/deepvariant.wdl b/tools/deepvariant.wdl index c5c1a3ad9..24d789fb6 100644 --- a/tools/deepvariant.wdl +++ b/tools/deepvariant.wdl @@ -192,7 +192,6 @@ task deepvariant { rm -rf "$ref_fasta" - >>> output { diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl index aa3077859..f63dd49b9 100644 --- a/tools/hisat2.wdl +++ b/tools/hisat2.wdl @@ -166,6 +166,8 @@ task index { "~{index_base_name}" tar -czf "~{index_base_name}.tar.gz" "~{index_base_name}"* + + rm -r "$ref_fasta" >>> output { diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl index d2118a630..41975e1ed 100644 --- a/tools/minimap2.wdl +++ b/tools/minimap2.wdl @@ -68,7 +68,7 @@ task align { Int disk_size_gb = ceil( ( size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB") - ) * 2) + ) * 3) + ceil(size(reference_index, "GiB")) + 10 + modify_disk_size_gb @@ -154,6 +154,8 @@ task index { -t ~{threads} \ -d "~{index_name}" \ "$ref_fasta" + + rm -r "$ref_fasta" >>> output { From 352fa6f11ef27a9856c63807eb396bfc2d1fcd7c Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Tue, 20 Jan 2026 11:44:08 -0500 Subject: [PATCH 29/29] chore: minimum cores to 1 --- tools/bwa.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bwa.wdl b/tools/bwa.wdl index dbba3f2e7..3c407a326 100644 --- a/tools/bwa.wdl +++ b/tools/bwa.wdl @@ -62,7 +62,7 @@ task bwa_aln { n_cores=$(nproc) fi # -1 because samtools uses one more core than `--threads` specifies - (( samtools_cores = n_cores - 1 )) + (( 
samtools_cores = n_cores > 1 ? n_cores - 1 : 1 )) mkdir bwa_db tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner @@ -160,7 +160,7 @@ task bwa_aln_pe { n_cores=$(nproc) fi # -1 because samtools uses one more core than `--threads` specifies - (( samtools_cores = n_cores - 1 )) + (( samtools_cores = n_cores > 1 ? n_cores - 1 : 1 )) mkdir bwa_db tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner @@ -257,7 +257,7 @@ task bwa_mem { n_cores=$(nproc) fi # -1 because samtools uses one more core than `--threads` specifies - (( samtools_cores = n_cores - 1 )) + (( samtools_cores = n_cores > 1 ? n_cores - 1 : 1 )) mkdir bwa_db tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner