diff --git a/docker/bwamem2/Dockerfile b/docker/bwamem2/Dockerfile
new file mode 100644
index 000000000..a79bab258
--- /dev/null
+++ b/docker/bwamem2/Dockerfile
@@ -0,0 +1,8 @@
+FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools
+FROM quay.io/biocontainers/bwa-mem2:2.3--he70b90d_0
+
+COPY --from=samtools /usr/local/bin/ /usr/local/bin/
+COPY --from=samtools /usr/local/lib/ /usr/local/lib/
+COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/
+
+ENTRYPOINT [ "bwa-mem2" ]
\ No newline at end of file
diff --git a/docker/bwamem2/package.json b/docker/bwamem2/package.json
new file mode 100644
index 000000000..8a483d363
--- /dev/null
+++ b/docker/bwamem2/package.json
@@ -0,0 +1,5 @@
+{
+    "name": "bwamem2",
+    "version": "2.3",
+    "revision": "0"
+}
\ No newline at end of file
diff --git a/docker/hisat2/Dockerfile b/docker/hisat2/Dockerfile
new file mode 100644
index 000000000..85b3898dd
--- /dev/null
+++ b/docker/hisat2/Dockerfile
@@ -0,0 +1,8 @@
+FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools
+FROM quay.io/biocontainers/hisat2:2.2.1--hdbdd923_7
+
+COPY --from=samtools /usr/local/bin/ /usr/local/bin/
+COPY --from=samtools /usr/local/lib/ /usr/local/lib/
+COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/
+
+ENTRYPOINT [ "hisat2" ]
\ No newline at end of file
diff --git a/docker/hisat2/package.json b/docker/hisat2/package.json
new file mode 100644
index 000000000..e17679260
--- /dev/null
+++ b/docker/hisat2/package.json
@@ -0,0 +1,5 @@
+{
+    "name": "hisat2",
+    "version": "2.2.1",
+    "revision": "0"
+}
\ No newline at end of file
diff --git a/docker/minimap2/Dockerfile b/docker/minimap2/Dockerfile
new file mode 100644
index 000000000..7fb04869b
--- /dev/null
+++ b/docker/minimap2/Dockerfile
@@ -0,0 +1,8 @@
+FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools
+FROM quay.io/biocontainers/minimap2:2.30--h577a1d6_0
+
+COPY --from=samtools /usr/local/bin/ /usr/local/bin/
+COPY --from=samtools /usr/local/lib/ /usr/local/lib/
+COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/
+
+ENTRYPOINT [ "minimap2" ]
\ No newline at end of file
diff --git a/docker/minimap2/package.json b/docker/minimap2/package.json
new file mode 100644
index 000000000..a78b7377c
--- /dev/null
+++ b/docker/minimap2/package.json
@@ -0,0 +1,5 @@
+{
+    "name": "minimap2",
+    "version": "2.30",
+    "revision": "0"
+}
\ No newline at end of file
diff --git a/docker/ngsep/Dockerfile b/docker/ngsep/Dockerfile
new file mode 100644
index 000000000..d7c3ada65
--- /dev/null
+++ b/docker/ngsep/Dockerfile
@@ -0,0 +1,5 @@
+FROM eclipse-temurin:8
+
+RUN wget https://github.com/NGSEP/NGSEPcore/releases/download/v5.1.0/NGSEPcore_5.1.0.jar -O /usr/local/bin/NGSEPcore.jar
+
+ENTRYPOINT [ "java", "-jar", "/usr/local/bin/NGSEPcore.jar" ]
\ No newline at end of file
diff --git a/docker/ngsep/package.json b/docker/ngsep/package.json
new file mode 100644
index 000000000..19a575139
--- /dev/null
+++ b/docker/ngsep/package.json
@@ -0,0 +1,5 @@
+{
+    "name": "ngsep",
+    "version": "5.1.0",
+    "revision": "0"
+}
diff --git a/tools/bwa.wdl b/tools/bwa.wdl
index dbba3f2e7..3c407a326 100644
--- a/tools/bwa.wdl
+++ b/tools/bwa.wdl
@@ -62,7 +62,7 @@ task bwa_aln {
             n_cores=$(nproc)
         fi
         # -1 because samtools uses one more core than `--threads` specifies
-        (( samtools_cores = n_cores - 1 ))
+        samtools_cores=$(( n_cores - 1 ))  # assignment form exits 0 even when the result is 0
 
         mkdir bwa_db
         tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner
@@ -160,7 +160,7 @@ task bwa_aln_pe {
             n_cores=$(nproc)
         fi
         # -1 because samtools uses one more core than `--threads` specifies
-        (( samtools_cores = n_cores - 1 ))
+        samtools_cores=$(( n_cores - 1 ))  # assignment form exits 0 even when the result is 0
 
         mkdir bwa_db
         tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner
@@ -257,7 +257,7 @@ task bwa_mem {
             n_cores=$(nproc)
         fi
         # -1 because samtools uses one more core than `--threads` specifies
-        (( samtools_cores = n_cores - 1 ))
+        samtools_cores=$(( n_cores - 1 ))  # assignment form exits 0 even when the result is 0
 
         mkdir bwa_db
         tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner
diff --git a/tools/bwamem2.wdl b/tools/bwamem2.wdl
new file mode 100644
index 000000000..765fc44a5
--- /dev/null
+++ b/tools/bwamem2.wdl
@@ -0,0 +1,134 @@
+version 1.2
+
+task align {
+    meta {
+        description: "Align DNA sequences against a large reference database using BWA-MEM2"
+        outputs: {
+            alignments: "The output alignment file in BAM format"
+        }
+    }
+
+    parameter_meta {
+        read_one_fastq_gz: "Input gzipped FASTQ read one file to align with BWA-MEM2"
+        reference_index: "The BWA-MEM2 index file for the reference genome"
+        read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'"
+        read_two_fastq_gz: "Input gzipped FASTQ read two file to align with BWA-MEM2"
+        prefix: "Prefix for the BAM file. The extension `.bam` will be added."
+        smart_pairing: "If true, enable smart pairing mode for paired-end reads"
+        skip_mate_rescue: "If true, skip mate rescue for paired-end reads"
+        threads: "Number of threads to use for alignment"
+        modify_disk_size_gb: "Additional disk space to allocate (in GB)"
+        seed_length: "Minimum seed length (`-k`) for the BWA-MEM2 aligner"
+        min_score: "Minimum score threshold for reporting alignments"
+    }
+
+    input {
+        File read_one_fastq_gz
+        File reference_index
+        String read_group
+        File? read_two_fastq_gz
+        String prefix = sub(
+            basename(read_one_fastq_gz),
+            "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$",
+            ""
+        )
+        Boolean smart_pairing = false
+        Boolean skip_mate_rescue = false
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+        Int seed_length = 19
+        Int min_score = 30
+    }
+
+    String output_name = prefix + ".bam"
+    Int disk_size_gb = ceil((
+        size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB")
+    ) * 2)
+        + ceil(size(reference_index, "GiB"))
+        + 10
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        mkdir bwa_db
+        tar -C bwa_db -xzf "~{reference_index}" --no-same-owner
+        PREFIX=$(basename bwa_db/*.ann ".ann")
+
+        bwa-mem2 mem \
+            -t ~{threads} \
+            -R "~{read_group}" \
+            -k ~{seed_length} \
+            -T ~{min_score} \
+            ~{if smart_pairing then "-p" else ""} \
+            ~{if skip_mate_rescue then "-S" else ""} \
+            bwa_db/"$PREFIX" \
+            "~{read_one_fastq_gz}" \
+            ~{if defined(read_two_fastq_gz) then "\"~{read_two_fastq_gz}\"" else ""} |
+            samtools view -b -o "~{output_name}" -
+
+        rm -r bwa_db
+    >>>
+
+    output {
+        File alignments = output_name
+    }
+
+    requirements {
+        container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0"
+        cpu: threads
+        memory: "64 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
+
+task index {
+    meta {
+        description: "Index a reference genome for alignment with BWA-MEM2"
+        outputs: {
+            reference_index: "Gzipped tarball containing the BWA-MEM2 index files for the reference genome"
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "The reference genome in FASTA format to be indexed"
+        db_name: "The base name for the output index files"
+        modify_disk_size_gb: "Additional disk space to allocate (in GB)"
+    }
+
+    input {
+        File reference_fasta
+        String db_name = "reference"
+        Int modify_disk_size_gb = 0
+    }
+
+    Float input_fasta_size = size(reference_fasta, "GiB")
+    Int disk_size_gb = ceil(input_fasta_size * 2) + 10 + modify_disk_size_gb
+    String bwa_db_out_name = db_name + ".tar.gz"
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+
+        bwa-mem2 index \
+            "$ref_fasta"
+
+        tar -czf "~{bwa_db_out_name}" "$ref_fasta"*
+
+        rm -r "$ref_fasta"
+    >>>
+
+    output {
+        File reference_index = bwa_db_out_name
+    }
+
+    requirements {
+        container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0"
+        cpu: 1
+        memory: "120 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
diff --git a/tools/clair.wdl b/tools/clair.wdl
new file mode 100644
index 000000000..20225f61c
--- /dev/null
+++ b/tools/clair.wdl
@@ -0,0 +1,90 @@
+version 1.2
+
+task clair3 {
+    meta {
+        description: "Run Clair3 variant caller for small variants using deep neural networks"
+        outputs: {
+            pileup_vcf: "VCF file with variants called using pileup model",
+            full_alignment_vcf: "VCF file with variants called using full-alignment model",
+            merged_vcf: "Final merged VCF file with variants from both models",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        bam: "Input BAM file with aligned reads"
+        model: "Pre-trained Clair3 model to use for variant calling"
+        bed_regions: "Optional BED file specifying regions to call variants in"
+        vcf_candidates: "Optional VCF file with candidate variants to consider"
+        output_dir: "Directory to store Clair3 output"
+        platform: {
+            description: "Sequencing platform used to generate the reads",
+            choices: [
+                "ont",
+                "hifi",
+                "ilmn",
+            ],
+        }
+        all_contigs: "Boolean indicating whether to include all contigs in variant calling. If false only chr{1..22,X,Y} are called."
+        print_ref_calls: "Boolean indicating whether to print reference calls in the output VCF"
+        gvcf: "Boolean indicating whether to output gVCF format"
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File bam
+        File model
+        File? bed_regions
+        File? vcf_candidates
+        String output_dir = "clair3_output"
+        String platform = "ilmn"
+        Boolean all_contigs = false
+        Boolean print_ref_calls = false
+        Boolean gvcf = false
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(bam, "GiB"))
+        + 20
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+
+        run_clair3.sh \
+            --bam_fn="~{bam}" \
+            --ref_fn="$ref_fasta" \
+            --threads="~{threads}" \
+            --platform="~{platform}" \
+            --model_path="~{model}" \
+            --output="~{output_dir}" \
+            ~{if all_contigs then "--include_all_ctgs" else ""} \
+            ~{if print_ref_calls then "--print_ref_calls" else ""} \
+            ~{if defined(bed_regions) then "--bed_fn='~{bed_regions}'" else ""} \
+            ~{if defined(vcf_candidates) then "--vcf_fn='~{vcf_candidates}'" else ""} \
+            ~{if gvcf then "--gvcf" else ""}
+
+        rm -rf "$ref_fasta"
+    >>>
+
+    output {
+        File pileup_vcf = "~{output_dir}/pileup.vcf.gz"
+        File full_alignment_vcf = "~{output_dir}/full_alignment.vcf.gz"
+        File merged_vcf = "~{output_dir}/merge_output.vcf.gz"
+    }
+
+    requirements {
+        container: "quay.io/biocontainers/clair3:1.2.0--py310h779eee5_0"
+        cpu: threads
+        memory: "16 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
diff --git a/tools/deepvariant.wdl b/tools/deepvariant.wdl
new file mode 100644
index 000000000..24d789fb6
--- /dev/null
+++ b/tools/deepvariant.wdl
@@ -0,0 +1,209 @@
+version 1.2
+
+task deepsomatic {
+    meta {
+        description: "Call variants using DeepSomatic"
+        outputs: {
+            vcf_output: "VCF file containing called somatic variants",
+            gvcf_output: "gVCF file containing called somatic variants",
+            runtime: "Optional HTML report of runtime metrics",
+            vcf_stats: "Optional HTML report of VCF statistics",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        reference_fasta_index: "Index file for the reference genome FASTA"
+        tumor_bam: "Input BAM file with aligned reads for tumor sample"
+        normal_bam: "Input BAM file with aligned reads for normal sample"
+        output_prefix: "Prefix for output VCF and gVCF files"
+        tumor_sample_name: "Sample name for the tumor sample"
+        normal_sample_name: "Sample name for the normal sample"
+        model_type: {
+            description: "Type of model to use for variant calling",
+            choices: [
+                "WGS",
+                "WES",
+                "PACBIO",
+                "ONT",
+                "FFPE_WGS",
+                "FFPE_WES",
+                "FFPE_WGS_TUMOR_ONLY",
+                "FFPE_WES_TUMOR_ONLY",
+                "WGS_TUMOR_ONLY",
+                "WES_TUMOR_ONLY",
+                "PACBIO_TUMOR_ONLY",
+                "ONT_TUMOR_ONLY",
+            ],
+        }
+        runtime_report: "Output make_examples_somatic runtime metrics and create a visual runtime report using runtime_by_region_vis."
+        vcf_stats_report: "Output a visual report (HTML) of statistics about the output VCF."
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File reference_fasta_index
+        File tumor_bam
+        File normal_bam
+        String output_prefix = "deepsomatic_output"
+        String tumor_sample_name = "tumor"
+        String normal_sample_name = "normal"
+        String model_type = "WGS"
+        Boolean runtime_report = false
+        Boolean vcf_stats_report = false
+        Int threads = 8
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(tumor_bam, "GiB"))
+        + ceil(size(normal_bam, "GiB"))
+        + 50
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+        ln -sf "~{reference_fasta_index}" "$ref_fasta.fai"
+
+        run_deepsomatic \
+            --model_type="~{model_type}" \
+            --ref="$ref_fasta" \
+            --reads_tumor="~{tumor_bam}" \
+            --reads_normal="~{normal_bam}" \
+            --output_vcf="~{output_prefix}.vcf.gz" \
+            --output_gvcf="~{output_prefix}.g.vcf.gz" \
+            --sample_name_tumor="~{tumor_sample_name}" \
+            --sample_name_normal="~{normal_sample_name}" \
+            --num_shards="~{threads}" \
+            --logging_dir="logs" \
+            --intermediate_results_dir="intermediate_results" \
+            ~{if runtime_report then "--runtime_report" else ""} \
+            ~{if vcf_stats_report then "--vcf_stats_report" else ""}
+
+
+        rm -rf "$ref_fasta"
+
+    >>>
+
+    output {
+        File vcf_output = "~{output_prefix}.vcf.gz"
+        File gvcf_output = "~{output_prefix}.g.vcf.gz"
+        File? runtime = "logs/runtime_by_region_vis.html"
+        File? vcf_stats = "logs/vcf_stats_report.html"
+    }
+
+    requirements {
+        container: "google/deepsomatic:1.9.0-gpu"
+        cpu: threads
+        memory: "32 GB"
+        disks: "~{disk_size_gb} GB"
+        gpu: true
+    }
+
+    hints {
+        gpu: 1
+    }
+}
+
+task deepvariant {
+    meta {
+        description: "Call variants using DeepVariant"
+        outputs: {
+            vcf_output: "VCF file containing called variants",
+            gvcf_output: "gVCF file containing called variants",
+            runtime: "Optional HTML report of runtime metrics",
+            vcf_stats: "Optional HTML report of VCF statistics",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        reference_fasta_index: "Index file for the reference genome FASTA"
+        bam: "Input BAM file with aligned reads for sample"
+        haploid_chromosomes: "List of chromosomes to be treated as haploid during variant calling"
+        output_prefix: "Prefix for output VCF and gVCF files"
+        model_type: {
+            description: "Type of model to use for variant calling",
+            choices: [
+                "WGS",
+                "WES",
+                "PACBIO",
+                "ONT_R104",
+                "HYBRID_PACBIO_ILLUMINA",
+                "MASSEQ",
+            ],
+        }
+        runtime_report: "Output make_examples runtime metrics and create a visual runtime report using runtime_by_region_vis."
+        vcf_stats_report: "Output a visual report (HTML) of statistics about the output VCF."
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File reference_fasta_index
+        File bam
+        Array[String] haploid_chromosomes = ["chrX", "chrY"]
+        String output_prefix = "deepvariant_output"
+        String model_type = "WGS"
+        Boolean runtime_report = false
+        Boolean vcf_stats_report = false
+        Int threads = 8
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(bam, "GiB"))
+        + 50
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+        ln -sf "~{reference_fasta_index}" "$ref_fasta.fai"
+
+        run_deepvariant \
+            --model_type="~{model_type}" \
+            --ref="$ref_fasta" \
+            --reads="~{bam}" \
+            --output_vcf="~{output_prefix}.vcf.gz" \
+            --output_gvcf="~{output_prefix}.g.vcf.gz" \
+            --num_shards="~{threads}" \
+            --logging_dir="logs" \
+            --intermediate_results_dir="intermediate_results" \
+            ~{if runtime_report then "--runtime_report" else ""} \
+            ~{if vcf_stats_report then "--vcf_stats_report" else ""} \
+            --haploid_contigs="~{sep(",", haploid_chromosomes)}"
+
+
+        rm -rf "$ref_fasta"
+    >>>
+
+    output {
+        File vcf_output = "~{output_prefix}.vcf.gz"
+        File gvcf_output = "~{output_prefix}.g.vcf.gz"
+        File? runtime = "logs/runtime_by_region_vis.html"
+        File? vcf_stats = "logs/vcf_stats_report.html"
+    }
+
+    requirements {
+        container: "google/deepvariant:1.9.0-gpu"
+        cpu: threads
+        memory: "32 GB"
+        disks: "~{disk_size_gb} GB"
+        gpu: true
+    }
+
+    hints {
+        gpu: 1
+    }
+}
diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl
new file mode 100644
index 000000000..f63dd49b9
--- /dev/null
+++ b/tools/hisat2.wdl
@@ -0,0 +1,183 @@
+version 1.2
+
+task align {
+    meta {
+        description: "Align RNA-seq reads against a reference genome using HISAT2"
+        outputs: {
+            alignments: "The output alignment file in BAM format",
+        }
+    }
+
+    parameter_meta {
+        read_one_fastq_gz: "Input gzipped FASTQ read one file to align with HISAT2"
+        reference_index: "The HISAT2 index files for the reference genome"
+        read_two_fastq_gz: "Input gzipped FASTQ read two file to align with HISAT2"
+        output_name: "The name of the output alignment file"
+        threads: "Number of threads to use for alignment"
+        modify_disk_size_gb: "Additional disk space to allocate (in GB)"
+    }
+
+    input {
+        File read_one_fastq_gz
+        File reference_index
+        File? read_two_fastq_gz
+        String output_name = "aligned.bam"
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil((
+        size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB")
+    ) * 2)
+        + ceil(size(reference_index, "GiB") * 5)
+        + 10
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        mkdir hisat2_db
+        tar -C hisat2_db -xzf "~{reference_index}" --no-same-owner
+        PREFIX=$(basename hisat2_db/*.1.ht2 ".1.ht2")
+
+        hisat2 \
+            -q \
+            -p ~{threads} \
+            -x "hisat2_db/$PREFIX" \
+            ~{if defined(read_two_fastq_gz) then "-1" else "-U"} "~{read_one_fastq_gz}" \
+            ~{if defined(read_two_fastq_gz) then "-2 \"~{read_two_fastq_gz}\"" else ""} \
+            | samtools view -bS - > "~{output_name}"
+
+        rm -r hisat2_db
+    >>>
+
+    output {
+        File alignments = "~{output_name}"
+    }
+
+    requirements {
+        cpu: threads
+        memory: "64 GB"
+        disks: "~{disk_size_gb} GB"
+        container: "ghcr.io/stjudecloud/hisat2:branch-minimap2-2.2.1-0"
+    }
+}
+
+task index {
+    meta {
+        description: "Index a reference genome for alignment with HISAT2"
+        outputs: {
+            reference_index: "The HISAT2 index files for the reference genome",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "The reference genome in FASTA format to be indexed"
+        snp: "List of SNPs"
+        haplotype: "List of haplotypes"
+        splice_site: "List of splice sites. Use with `exon`."
+        exon: "List of exons. Use with `splice_site`."
+        repeat_ref: "Repeat reference file passed to `hisat2-build --repeat-ref`"
+        repeat_info: "Repeat information file passed to `hisat2-build --repeat-info`"
+        repeat_snp: "Repeat SNP file passed to `hisat2-build --repeat-snp`"
+        repeat_haplotype: "Repeat haplotype file passed to `hisat2-build --repeat-haplotype`"
+        bmax: "Maximum number of suffixes allowed in a block"
+        seed: "Seed for pseudo-random number generator"
+        bmaxdivn: "Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference"
+        index_base_name: "The base name for the output index files"
+        force_large_index: "Force creation of a large index"
+        disable_auto_fitting: "Disable automatic fitting of index parameters"
+        nodc: "Disable difference-cover sample"
+        no_ref: "Do not build bitpacked version of reference sequence for paired-end alignment"
+        just_ref: "Build only the bitpacked version of reference sequence for paired-end alignment"
+        threads: "Number of threads to use for indexing"
+        dcv: "Period for the difference-cover sample. A larger period uses less memory, but may be slower. Must be a power of 2, no greater than 4096."
+        offrate: "The off-rate for the FM index"
+        ftabchars: "The lookup table to calculate initial BW range with respect to the first N characters of the query"
+        localoffrate: "The off-rate for the local FM index"
+        localftabchars: "The lookup table to calculate initial BW range for the local FM"
+        modify_disk_size_gb: "Additional disk space to allocate (in GB)"
+    }
+
+    input {
+        File reference_fasta
+        File? snp
+        File? haplotype
+        File? splice_site
+        File? exon
+        File? repeat_ref
+        File? repeat_info
+        File? repeat_snp
+        File? repeat_haplotype
+        Int? bmax
+        Int? seed
+        Int? bmaxdivn = 4
+        String index_base_name = "hisat2_index"
+        Boolean force_large_index = false
+        Boolean disable_auto_fitting = false
+        Boolean nodc = false
+        Boolean no_ref = false
+        Boolean just_ref = false
+        Int threads = 1
+        Int dcv = 1024
+        Int offrate = 5
+        Int ftabchars = 10
+        Int localoffrate = 3
+        Int localftabchars = 6
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2) + 10 + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+
+        hisat2-build \
+            ~{if force_large_index then "--large-index" else ""} \
+            ~{if disable_auto_fitting then "--noauto" else ""} \
+            -p ~{threads} \
+            ~{if defined(bmax) then "--bmax \"~{bmax}\"" else ""} \
+            ~{if defined(bmaxdivn) then "--bmaxdivn \"~{bmaxdivn}\"" else ""} \
+            ~{if !nodc then "--dcv \"~{dcv}\"" else "--nodc"} \
+            ~{if no_ref then "--noref" else ""} \
+            ~{if just_ref then "--justref" else ""} \
+            --offrate "~{offrate}" \
+            --ftabchars "~{ftabchars}" \
+            --localoffrate "~{localoffrate}" \
+            --localftabchars "~{localftabchars}" \
+            ~{if defined(snp) then "--snp \"~{snp}\"" else ""} \
+            ~{if defined(haplotype) then "--haplotype \"~{haplotype}\"" else ""} \
+            ~{if defined(splice_site) then "--ss \"~{splice_site}\"" else ""} \
+            ~{if defined(exon) then "--exon \"~{exon}\"" else ""} \
+            ~{if defined(repeat_ref) then "--repeat-ref \"~{repeat_ref}\"" else ""} \
+            ~{if defined(repeat_info) then "--repeat-info \"~{repeat_info}\"" else ""} \
+            ~{if defined(repeat_snp) then "--repeat-snp \"~{repeat_snp}\"" else ""} \
+            ~{(
+                if defined(repeat_haplotype)
+                then "--repeat-haplotype \"~{repeat_haplotype}\""
+                else ""
+            )} \
+            ~{if defined(seed) then "--seed \"~{seed}\"" else ""} \
+            "$ref_fasta" \
+            "~{index_base_name}"
+
+        tar -czf "~{index_base_name}.tar.gz" "~{index_base_name}"*
+
+        rm -r "$ref_fasta"
+    >>>
+
+    output {
+        File reference_index = "~{index_base_name}.tar.gz"
+    }
+
+    requirements {
+        cpu: threads
+        memory: "16 GB"
+        disks: "~{disk_size_gb} GB"
+        container: "ghcr.io/stjudecloud/hisat2:branch-minimap2-2.2.1-0"
+    }
+}
diff --git a/tools/manta.wdl b/tools/manta.wdl
new file mode 100644
index 000000000..29946b749
--- /dev/null
+++ b/tools/manta.wdl
@@ -0,0 +1,125 @@
+version 1.2
+
+task manta_germline {
+    meta {
+        description: "Run Manta structural variant and indel caller"
+        outputs: {
+            manta_output: "Directory containing Manta variant calls",
+            log_file: "Log file from the Manta workflow execution",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        bam: "Input BAM file with aligned reads"
+        output_dir: "Directory to store Manta output"
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File bam
+        String output_dir = "manta_output"
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(bam, "GiB"))
+        + 20
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+
+        configManta.py \
+            --bam "~{bam}" \
+            --referenceFasta "$ref_fasta" \
+            --runDir "~{output_dir}"
+
+        "~{output_dir}/runWorkflow.py" -j "~{threads}"
+
+        rm -rf "$ref_fasta"
+    >>>
+
+    output {
+        Directory manta_output = output_dir
+        File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt"
+    }
+
+    requirements {
+        container: "quay.io/biocontainers/manta:1.6.0--py27h9948957_6"
+        cpu: threads
+        memory: "25 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
+
+task manta_somatic {
+    meta {
+        description: "Run Manta structural variant and indel caller in somatic mode"
+        outputs: {
+            manta_output: "Directory containing Manta variant calls",
+            log_file: "Log file from the Manta workflow execution",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        tumor_bam: "Input BAM file with aligned reads from tumor sample"
+        normal_bam: "Input BAM file with aligned reads from normal sample"
+        output_dir: "Directory to store Manta output"
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File tumor_bam
+        File normal_bam
+        String output_dir = "manta_output"
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(tumor_bam, "GiB"))
+        + ceil(size(normal_bam, "GiB"))
+        + 20
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+
+        configManta.py \
+            --normalBam "~{normal_bam}" \
+            --tumorBam "~{tumor_bam}" \
+            --referenceFasta "$ref_fasta" \
+            --runDir "~{output_dir}"
+
+        "~{output_dir}/runWorkflow.py" -j "~{threads}"
+
+        rm -rf "$ref_fasta"
+    >>>
+
+    output {
+        Directory manta_output = output_dir
+        File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt"
+    }
+
+    requirements {
+        container: "quay.io/biocontainers/manta:1.6.0--py27h9948957_6"
+        cpu: threads
+        memory: "25 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl
new file mode 100644
index 000000000..41975e1ed
--- /dev/null
+++ b/tools/minimap2.wdl
@@ -0,0 +1,171 @@
+version 1.2
+
+task align {
+    meta {
+        description: "Align DNA or mRNA sequences against a large reference database"
+        outputs: {
+            alignments: "The output alignment file in BAM or PAF format"
+        }
+    }
+
+    parameter_meta {
+        read_one_fastq_gz: "Input gzipped FASTQ read one file to align with minimap2"
+        reference_index: "The minimap2 index file for the reference genome"
+        read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'"
+        read_two_fastq_gz: "Input gzipped FASTQ read two file to align with minimap2"
+        preset: {
+            description: "Minimap2 preset for alignment",
+            external_help: "https://lh3.github.io/minimap2/minimap2.html#8",
+            choices: [
+                "sr",
+                "map-ont",
+                "lr:hq",
+                "map-hifi",
+                "map-pb",
+                "map-iclr",
+                "asm5",
+                "asm10",
+                "asm20",
+                "splice",
+                "splice:hq",
+                "splice:sr",
+                "ava-pb",
+                "ava-ont",
+            ],
+        }
+        output_name: "The name of the output alignment file"
+        output_paf: "If true, output in PAF format instead of BAM"
+        cigar_in_paf: "If true and outputting PAF, include CIGAR strings in the PAF output"
+        ignore_base_quality: "If true, ignore base quality scores during alignment"
+        output_md_tag: "If true, include MD tags in the SAM output"
+        eqx: "If true, use =/X CIGAR operators instead of M"
+        soft_clip: "If true, use soft clipping for supplementary alignments in SAM format"
+        secondary_alignments: "If true, report secondary alignments"
+        seed: "Integer seed used by minimap2 to randomize the choice between equally best hits"
+        threads: "Number of threads to use for alignment"
+        modify_disk_size_gb: "Additional disk space to allocate (in GB)"
+    }
+
+    input {
+        File read_one_fastq_gz
+        File reference_index
+        String read_group
+        File? read_two_fastq_gz
+        String? preset = "sr"
+        String output_name = "aligned.bam"
+        Boolean output_paf = false
+        Boolean cigar_in_paf = true
+        Boolean ignore_base_quality = false
+        Boolean output_md_tag = true
+        Boolean eqx = false
+        Boolean soft_clip = true
+        Boolean secondary_alignments = true
+        Int seed = 11
+        Int threads = 3
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(
+        (
+            size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB")
+        ) * 3)
+        + ceil(size(reference_index, "GiB"))
+        + 10
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        minimap2 \
+            ~{if defined(preset) then "-x \"~{preset}\"" else ""} \
+            ~{if output_paf then "" else "-a"} \
+            ~{if output_paf && cigar_in_paf then "-c" else ""} \
+            ~{if ignore_base_quality then "-Q" else ""} \
+            ~{if output_md_tag then "--MD" else ""} \
+            ~{if eqx then "--eqx" else ""} \
+            ~{if soft_clip then "-Y" else ""} \
+            ~{if secondary_alignments then "--secondary=yes" else "--secondary=no"} \
+            -t ~{threads} \
+            --seed ~{seed} \
+            -R "~{read_group}" \
+            "~{reference_index}" \
+            "~{read_one_fastq_gz}" \
+            ~{if defined(read_two_fastq_gz) then "\"~{read_two_fastq_gz}\"" else ""} \
+            | if ~{output_paf}; then
+                cat - > "~{output_name}"
+            else
+                samtools view -b - > "~{output_name}"
+            fi
+    >>>
+
+    output {
+        File alignments = output_name
+    }
+
+    requirements {
+        container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0"
+        cpu: threads
+        memory: "16 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
+
+task index {
+    meta {
+        description: "Create a minimap2 index for a reference genome"
+        outputs: {
+            reference_index: "The generated minimap2 index file"
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "The reference genome in FASTA format to be indexed"
+        alt_contigs: "Optional file containing a list of alternative contigs"
+        index_name: "The name of the output index file"
+        minimizer_kmer_size: "K-mer size for minimizer indexing"
+        minimizer_window_size: "Window size for minimizer indexing"
+        threads: "Number of threads to use for indexing"
+        modify_disk_size_gb: "Additional disk space to allocate (in GB)"
+    }
+
+    input {
+        File reference_fasta
+        File? alt_contigs
+        String index_name = "reference.mmi"
+        Int minimizer_kmer_size = 15
+        Int minimizer_window_size = 10
+        Int threads = 3
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB")) + 10 + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+
+        minimap2 \
+            -k ~{minimizer_kmer_size} \
+            -w ~{minimizer_window_size} \
+            ~{if defined(alt_contigs) then "--alt \"~{alt_contigs}\"" else ""} \
+            -t ~{threads} \
+            -d "~{index_name}" \
+            "$ref_fasta"
+
+        rm -r "$ref_fasta"
+    >>>
+
+    output {
+        File reference_index = index_name
+    }
+
+    requirements {
+        container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0"
+        cpu: threads
+        memory: "16 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
diff --git a/tools/ngsep.wdl b/tools/ngsep.wdl
new file mode 100644
index 000000000..bf93f9472
--- /dev/null
+++ b/tools/ngsep.wdl
@@ -0,0 +1,59 @@
+version 1.2
+
+task germline_variant {
+    meta {
+        description: "Call germline variants using NGSEP"
+        outputs: {
+            vcf_output: "VCF file containing called germline variants"
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        bam: "Input BAM file with aligned reads"
+        output_prefix: "Prefix for the output file with called variants"
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File bam
+        String output_prefix = "ngsep_germline_output"
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(bam, "GiB"))
+        + 20
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+
+        java -Xmx16g -jar /usr/local/bin/NGSEPcore.jar \
+            SingleSampleVariantsDetector \
+            -r "$ref_fasta" \
+            -i "~{bam}" \
+            -o "~{output_prefix}" \
+            -t "~{threads}"
+
+        rm -rf "$ref_fasta"
+    >>>
+
+    output {
+        Array[File] vcf_output = glob("~{output_prefix}*")
+    }
+
+    requirements {
+        container: "ghcr.io/stjudecloud/ngsep:branch-minimap2-5.1.0-0"
+        cpu: threads
+        memory: "20 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
diff --git a/tools/strelka.wdl b/tools/strelka.wdl
new file mode 100644
index 000000000..73b0236b0
--- /dev/null
+++ b/tools/strelka.wdl
@@ -0,0 +1,157 @@
+version 1.2
+
+task somatic {
+    meta {
+        description: "Run Strelka somatic variant calling workflow"
+        outputs: {
+            strelka_output: "Directory containing Strelka somatic variant calls",
+            log_file: "Log file from the Strelka workflow execution",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        reference_fasta_index: "Index file for the reference genome FASTA"
+        normal_bam: "Input BAM file with aligned reads for normal sample"
+        tumor_bam: "Input BAM file with aligned reads for tumor sample"
+        indel_candidates: "Optional VCF file with candidate indels, recommended to be generated by Manta"
+        output_dir: "Directory to store Strelka output"
+        exome: "Boolean indicating if the data is exome sequencing"
+        rna: "Boolean indicating if the data is RNA sequencing"
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File reference_fasta_index
+        File normal_bam
+        File tumor_bam
+        File? indel_candidates
+        String output_dir = "strelka_somatic_output"
+        Boolean exome = false
+        Boolean rna = false
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(normal_bam, "GiB"))
+        + ceil(size(tumor_bam, "GiB"))
+        + (
+            if defined(indel_candidates)
+            then ceil(size(indel_candidates, "GiB"))
+            else 0
+        )
+        + 20
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+        ln -sf "~{reference_fasta_index}" "$ref_fasta.fai"
+
+        configureStrelkaSomaticWorkflow.py \
+            --referenceFasta "$ref_fasta" \
+            --runDir "~{output_dir}" \
+            --tumorBam "~{tumor_bam}" \
+            --normalBam "~{normal_bam}" \
+            ~{if (exome) then "--exome" else ""} \
+            ~{if (rna) then "--rna" else ""} \
+            ~{(
+                if (defined(indel_candidates))
+                then "--indelCandidates '~{indel_candidates}'"
+                else ""
+            )}
+
+
+        "~{output_dir}/runWorkflow.py" -m local -j ~{threads}
+
+        rm -rf "$ref_fasta"
+    >>>
+
+    output {
+        Directory strelka_output = output_dir
+        File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt"
+    }
+
+    requirements {
+        container: "quay.io/biocontainers/strelka:2.9.10--hdfd78af_2"
+        cpu: threads
+        memory: "25 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
+
+task germline {
+    meta {
+        description: "Run Strelka germline variant calling workflow"
+        outputs: {
+            strelka_output: "Directory containing Strelka germline variant calls",
+            log_file: "Log file from the Strelka workflow execution",
+        }
+    }
+
+    parameter_meta {
+        reference_fasta: "Reference genome in FASTA format"
+        reference_fasta_index: "Index file for the reference genome FASTA"
+        bam: "Input BAM file with aligned reads"
+        output_dir: "Directory to store Strelka output"
+        exome: "Boolean indicating if the data is exome sequencing"
+        rna: "Boolean indicating if the data is RNA sequencing"
+        threads: "Number of threads to use"
+        modify_disk_size_gb: "Additional disk size in GB to allocate"
+    }
+
+    input {
+        File reference_fasta
+        File reference_fasta_index
+        File bam
+        String output_dir = "strelka_germline_output"
+        Boolean exome = false
+        Boolean rna = false
+        Int threads = 4
+        Int modify_disk_size_gb = 0
+    }
+
+    Int disk_size_gb = ceil(size(reference_fasta, "GiB") * 2)
+        + ceil(size(bam, "GiB"))
+        + 20
+        + modify_disk_size_gb
+
+    command <<<
+        set -euo pipefail
+
+        ref_fasta="~{basename(reference_fasta, ".gz")}"
+        gunzip -c "~{reference_fasta}" > "$ref_fasta" \
+            || ln -sf "~{reference_fasta}" "$ref_fasta"
+        ln -sf "~{reference_fasta_index}" "$ref_fasta.fai"
+
+        configureStrelkaGermlineWorkflow.py \
+            --referenceFasta "$ref_fasta" \
+            --runDir "~{output_dir}" \
+            --bam "~{bam}" \
+            ~{if (exome) then "--exome" else ""} \
+            ~{if (rna) then "--rna" else ""}
+
+
+        "~{output_dir}/runWorkflow.py" -m local -j ~{threads}
+
+        rm -rf "$ref_fasta"
+    >>>
+
+    output {
+        Directory strelka_output = output_dir
+        File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt"
+    }
+
+    requirements {
+        container: "quay.io/biocontainers/strelka:2.9.10--hdfd78af_2"
+        cpu: threads
+        memory: "25 GB"
+        disks: "~{disk_size_gb} GB"
+    }
+}
diff --git a/tools/vg.wdl b/tools/vg.wdl
new file mode 100644
index 000000000..86f2a95f2
--- /dev/null
+++ b/tools/vg.wdl
@@ -0,0 +1,186 @@
+version 1.2
+
+task giraffe {
+    meta {
+        description: "Align DNA sequences against a variation graph using vg giraffe"
+        outputs: {
+            alignments: "The output alignment file in GAM format"
+        }
+    }
+
+    parameter_meta {
+        read_one_fastq_gz: "Input gzipped FASTQ read one file to align with vg giraffe"
+        gbz_graph: "The vg GBZ graph file for the reference genome"
+        minimizer_index: "The vg minimizer index file for the reference genome"
+        zipcode_name: "The vg zipcode name file for the reference genome"
+        distance_index: "The vg distance index file for the reference genome"
+        read_two_fastq_gz: "Input gzipped FASTQ read
two file to align with vg giraffe" + haploytype: "The haplotype information file" + kff: "The KFF file containing kmer counts" + sample_name: "The sample name to include" + read_group: "The read group" + output_name: "The name of the output alignment file" + output_format: { + description: "The output format for alignments", + options: [ + "gam", + "gaf", + "json", + "tsv", + "SAM", + "BAM", + "CRAM", + ], + } + preset: { + description: "vg giraffe preset for alignment", + options: [ + "chaining-sr", + "default", + "fast", + "hifi", + "r10", + "srold", + ], + } + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File gbz_graph + File minimizer_index + File zipcode_name + File distance_index + File? read_two_fastq_gz + File? haploytype + File? kff + String? sample_name + String? read_group + String output_name = "aligned.bam" + String output_format = "BAM" + String preset = "default" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(( + size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB") + ) * 2) + + ceil(size(gbz_graph, "GiB")) + + ceil(size(minimizer_index, "GiB")) + + ceil(size(distance_index, "GiB")) + + ceil(size(zipcode_name, "GiB")) + + 10 + + modify_disk_size_gb + + command <<< + vg giraffe \ + -t ~{threads} \ + -Z "~{gbz_graph}" \ + -m "~{minimizer_index}" \ + -d "~{distance_index}" \ + -z "~{zipcode_name}" \ + -f "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) then "-f \"~{read_two_fastq_gz}\"" else ""} \ + -o "~{output_format}" \ + ~{if defined(sample_name) then "--sample \"~{sample_name}\"" else ""} \ + ~{if defined(read_group) then "--read-group \"~{read_group}\"" else ""} \ + ~{if defined(haploytype) then "--haplotype-name \"~{haploytype}\"" else ""} \ + ~{if defined(kff) then "--kff-name \"~{kff}\"" else ""} \ + --parameter-preset "~{preset}" \ + > "~{output_name}" + >>> + + output { + File 
alignments = "~{output_name}" + } + + requirements { + container: "quay.io/biocontainers/vg:1.70.0--h9ee0642_0" + cpu: threads + memory: "120 GB" + disks: "~{disk_size_gb} GB" + } +} + +task index { + meta { + description: "Index a reference genome for alignment with vg giraffe" + outputs: { + reference_index: "The vg giraffe index file for the reference genome" + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + vcf_files: "VCF(s) containing variants to augment the graph" + transcript_gff: "GFF(s) containing transcript annotations" + db_prefix: "The base name for the output index files" + gff_feature: "The feature type in the GFF to use for transcripts" + gff_id_tag: "The attribute tag in the GFF to use as transcript ID" + workflow: { + description: "The vg autoindex workflow to use", + choices: [ + "map", + "mpmap", + "rpvg", + "giraffe", + "sr-giraffe", + "lr-giraffe", + ], + } + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + threads: "Number of threads to use for indexing" + } + + input { + File reference_fasta + Array[File] vcf_files = [] + Array[File] transcript_gff = [] + String db_prefix = "reference" + String gff_feature = "exon" + String gff_id_tag = "transcript_id" + String workflow = "giraffe" + Int modify_disk_size_gb = 0 + Int threads = 4 + } + + Float input_fasta_size = size(reference_fasta, "GiB") + Float vcf_size = size(vcf_files, "GiB") + Float transcript_gff_size = size(transcript_gff, "GiB") + Int disk_size_gb = ceil(input_fasta_size * 2) + + ceil(vcf_size * 2) + + ceil(transcript_gff_size * 2) + + 10 + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + vg autoindex \ + --workflow "~{workflow}" \ + -r "$ref_fasta" \ + -p "~{db_prefix}" \ + ~{sep(" ", prefix("-v ", quote(vcf_files)))} \ + ~{sep(" ", prefix("-x ", 
quote(transcript_gff)))} \ + -t ~{threads} \ + --gff-feature "~{gff_feature}" \ + --gff-tx-tag "~{gff_id_tag}" + >>> + + output { + Array[File] reference_index = glob("~{db_prefix}*") + } + + requirements { + container: "quay.io/biocontainers/vg:1.70.0--h9ee0642_0" + cpu: threads + memory: "120 GB" + disks: "~{disk_size_gb} GB" + } +}