From 0ac42c25fc7673859e527aed6a0dbeff6b2b4119 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 13 Nov 2023 17:57:58 +0000 Subject: [PATCH 01/23] allow multimapping genome output bams --- conf/modules.config | 24 +++-- modules.json | 5 + modules/nf-core/samtools/view/environment.yml | 6 ++ modules/nf-core/samtools/view/main.nf | 69 ++++++++++++++ modules/nf-core/samtools/view/meta.yml | 89 ++++++++++++++++++ nextflow.config | 2 +- subworkflows/goodwright/rna_align/main.nf | 92 +++++++++++-------- 7 files changed, 241 insertions(+), 46 deletions(-) create mode 100644 modules/nf-core/samtools/view/environment.yml create mode 100644 modules/nf-core/samtools/view/main.nf create mode 100644 modules/nf-core/samtools/view/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 0ec7ef2..53b5cfb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -300,6 +300,7 @@ if(params.run_alignment) { withName: 'CLIPSEQ:RNA_ALIGN:STAR_ALIGN' { ext.args = { "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM ${params.star_params}" } + ext.prefix = { "${meta.id}_multi" } publishDir = [ [ path: { "${params.outdir}/02_alignment/genome/log" }, @@ -321,7 +322,7 @@ if(params.run_alignment) { ] } - withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' { + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_TRANSCRIPT' { publishDir = [ path: { "${params.outdir}/02_alignment/genome" }, mode: "${params.publish_dir_mode}", @@ -329,8 +330,8 @@ if(params.run_alignment) { ] } - withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_TRANSCRIPT' { - ext.prefix = { "${meta.id}_Aligned.toTranscriptome_sorted.out" } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_TRANSCRIPT' { + ext.prefix = { "${meta.id}_multi.Aligned.toTranscriptome_sorted.out" } publishDir = [ path: { "${params.outdir}/02_alignment/genome" }, mode: "${params.publish_dir_mode}", @@ -338,8 +339,10 @@ if(params.run_alignment) { ] } - withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_TRANSCRIPT' 
{ - ext.prefix = { "${meta.id}_Aligned.toTranscriptome_sorted.out" } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_VIEW_GENOME' { + ext.prefix = { "${meta.id}_unique_genome" } + ext.args = "-q 5 --output-fmt bam --write-index" + ext.index_type = "bai" publishDir = [ path: { "${params.outdir}/02_alignment/genome" }, mode: "${params.publish_dir_mode}", @@ -347,7 +350,16 @@ if(params.run_alignment) { ] } - + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_VIEW_TRANSCRIPT' { + ext.prefix = { "${meta.id}_unique_transcriptome" } + ext.args = "-q 5 --output-fmt bam --write-index" + ext.index_type = "bai" + publishDir = [ + path: { "${params.outdir}/02_alignment/genome" }, + mode: "${params.publish_dir_mode}", + enabled: params.save_align_intermed + ] + } } } diff --git a/modules.json b/modules.json index 54d14a8..e450015 100644 --- a/modules.json +++ b/modules.json @@ -176,6 +176,11 @@ "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": ["bam_stats_samtools"] }, + "samtools/view": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, "star/align": { "branch": "master", "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 0000000..04c82f1 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 0000000..a41b876 --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,69 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(index) + tuple val(meta2), path(fasta) + path qname + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def readnames = qname ? "--qname-file ${qname}": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + def index_type = task.ext.index_type ?: '' + def output_name = args.contains("--write-index") ? "${prefix}.${file_type}##idx##${prefix}.${file_type}.${index_type}" : + "${prefix}.${file_type}" + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + ${readnames} \\ + $args \\ + -o ${output_name}\\ + $input \\ + $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 0000000..3dadafa --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,89 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) + pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - qname: + type: file + description: Optional file with read names to output only select alignments + pattern: "*.{txt,list}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: optional filtered/converted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: optional filtered/converted CRAM file + pattern: "*.{cram}" + - sam: + type: file + description: optional filtered/converted SAM file + pattern: "*.{sam}" + # bai, csi, and crai are created with `--write-index` + - bai: + type: file + description: optional BAM file index + pattern: "*.{bai}" + - csi: + type: file + description: optional tabix BAM file index + pattern: "*.{csi}" + - crai: + type: file + description: optional CRAM file index + pattern: "*.{crai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/nextflow.config b/nextflow.config index 3dc2f22..f516083 100644 --- a/nextflow.config +++ b/nextflow.config @@ -76,7 +76,7 @@ params { umi_separator = "rbc:" paraclu_min_value = 10 bowtie_params = "-v 2 -m 100 --norc --best --strata" - star_params = "--outFilterMultimapNmax 1 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic" + star_params = "--outFilterMultimapNmax 100 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout 
--alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic" clippy_params = "" icount_peaks_params = "" peka_params = "" diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf index 0e85370..7786d4e 100644 --- a/subworkflows/goodwright/rna_align/main.nf +++ b/subworkflows/goodwright/rna_align/main.nf @@ -5,16 +5,14 @@ /* * MODULES -*/ -include { BOWTIE_ALIGN } from '../../../modules/nf-core/bowtie/align/main.nf' -include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main.nf' -include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' - -/* -* SUBWORKFLOWS -*/ -include { BAM_STATS_SAMTOOLS as BAM_STATS_SAMTOOLS_GENOME } from '../../nf-core/bam_stats_samtools/main.nf' -include { BAM_SORT_STATS_SAMTOOLS as BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT } from '../../nf-core/bam_sort_stats_samtools/main.nf' +*/ +include { BOWTIE_ALIGN } from '../../../modules/nf-core/bowtie/align/main.nf' +include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main.nf' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_GENOME } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_GENOME } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_TRANSCRIPT } from '../../../modules/nf-core/samtools/view/main' workflow RNA_ALIGN { take: @@ -50,17 +48,27 @@ workflow RNA_ALIGN { ch_versions = ch_versions.mix(STAR_ALIGN.out.versions) /* - * MODULE: Index genome-level BAM file + * MODULE: Index genome-level BAM file */ - SAMTOOLS_INDEX ( STAR_ALIGN.out.bam_sorted ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + SAMTOOLS_INDEX_GENOME ( 
STAR_ALIGN.out.bam_sorted ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_GENOME.out.versions.first()) + + /* + * MODULE: Index transcript-level BAM file + */ + SAMTOOLS_SORT_TRANSCRIPT ( STAR_ALIGN.out.bam_transcript ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_TRANSCRIPT.out.versions.first()) + + SAMTOOLS_INDEX_TRANSCRIPT ( SAMTOOLS_SORT_TRANSCRIPT.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_TRANSCRIPT.out.versions.first()) + /* * CHANNEL: Join bam and bai files */ ch_bam_bai = STAR_ALIGN.out.bam_sorted - .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) - .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) + .join(SAMTOOLS_INDEX_GENOME.out.bai, by: [0], remainder: true) + .join(SAMTOOLS_INDEX_GENOME.out.csi, by: [0], remainder: true) .map { meta, bam, bai, csi -> if (bai) { @@ -71,38 +79,44 @@ workflow RNA_ALIGN { } /* - * SUBWORKFLOW: Stats on genome-level bam + * CHANNEL: Join bam and bai files */ - BAM_STATS_SAMTOOLS_GENOME ( - ch_bam_bai, - fasta - ) - ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS_GENOME.out.versions) + ch_transcript_bam_bai = SAMTOOLS_SORT_TRANSCRIPT.out.bam + .join(SAMTOOLS_INDEX_TRANSCRIPT.out.bai, by: [0], remainder: true) + .join(SAMTOOLS_INDEX_TRANSCRIPT.out.csi, by: [0], remainder: true) + .map { + meta, bam, bai, csi -> + if (bai) { + [ meta, bam, bai ] + } else { + [ meta, bam, csi ] + } + } + /* - * SUBWORKFLOW: Sort, index and stats on transcript-level bam + * CHANNEL: Filter for uniquely mapping reads for downstream analysis; samtools view -b -q 5 -o output.bam alignments.bam */ - BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT ( - STAR_ALIGN.out.bam_transcript, - fasta + SAMTOOLS_VIEW_GENOME ( + ch_bam_bai, + [[],[]], + [] + ) + + SAMTOOLS_VIEW_TRANSCRIPT ( + ch_transcript_bam_bai, + [[],[]], + [] ) emit: - bt_bam = BOWTIE_ALIGN.out.bam // channel: [ val(meta), [ bam ] ] - bt_log = BOWTIE_ALIGN.out.log // channel: [ val(meta), [ txt ] ] - star_bam = STAR_ALIGN.out.bam_sorted // channel: [ val(meta), [ bam ] ] - 
star_bam_transcript = STAR_ALIGN.out.bam_transcript // channel: [ val(meta), [ bam ] ] + bt_bam = BOWTIE_ALIGN.out.bam // channel: [ val(meta), [ bam ] ] + bt_log = BOWTIE_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log = STAR_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log_final = STAR_ALIGN.out.log_final // channel: [ val(meta), [ txt ] ] - genome_bam = STAR_ALIGN.out.bam_sorted // channel: [ val(meta), [ bam ] ] - genome_bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - genome_stats = BAM_STATS_SAMTOOLS_GENOME.out.stats // channel: [ val(meta), [ stats ] ] - genome_flagstat = BAM_STATS_SAMTOOLS_GENOME.out.flagstat // channel: [ val(meta), [ flagstat ] ] - genome_idxstats = BAM_STATS_SAMTOOLS_GENOME.out.idxstats // channel: [ val(meta), [ idxstats ] ] - transcript_bam = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.bam // channel: [ val(meta), [ bam ] ] - transcript_bai = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.bai // channel: [ val(meta), [ bai ] ] - transcript_stats = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.stats // channel: [ val(meta), [ stats ] ] - transcript_flagstat = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.flagstat // channel: [ val(meta), [ flagstat ] ] - transcript_idxstats = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.idxstats // channel: [ val(meta), [ idxstats ] ] + genome_bam = SAMTOOLS_VIEW_GENOME.out.bam // channel: [ val(meta), [ bam ] ] + genome_bai = SAMTOOLS_VIEW_GENOME.out.bai // channel: [ val(meta), [ bai ] ] + transcript_bam = SAMTOOLS_VIEW_TRANSCRIPT.out.bam // channel: [ val(meta), [ bam ] ] + transcript_bai = SAMTOOLS_VIEW_TRANSCRIPT.out.bai // channel: [ val(meta), [ bai ] ] versions = ch_versions // channel: [ versions.yml ] } From 76545449ce6f1383b4349f66c46ffbecdd62dbc6 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 14 Nov 2023 11:32:47 +0000 Subject: [PATCH 02/23] publish bowtie premapping bam --- subworkflows/goodwright/rna_align/main.nf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf index 7786d4e..5f3f854 100644 --- a/subworkflows/goodwright/rna_align/main.nf +++ b/subworkflows/goodwright/rna_align/main.nf @@ -9,6 +9,8 @@ include { BOWTIE_ALIGN } from '../../../modules/nf-core/bowtie/align/main.nf' include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main.nf' include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_GENOME } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_GENOME } from '../../../modules/nf-core/samtools/view/main' @@ -34,6 +36,12 @@ workflow RNA_ALIGN { ) ch_versions = ch_versions.mix(BOWTIE_ALIGN.out.versions) + SAMTOOLS_SORT_SMRNA ( BOWTIE_ALIGN.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_SMRNA.out.versions) + + SAMTOOLS_INDEX_SMRNA ( SAMTOOLS_SORT_SMRNA.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA.out.versions) + /* * MODULE: Align reads that did not align to the smrna genome to the primary genome */ From 46a1203cc7e2c8c9d998d3444623a9e6811e7b74 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 14 Nov 2023 12:22:23 +0000 Subject: [PATCH 03/23] dedup multi bam and smrna bam --- conf/modules.config | 41 +++++++++++++--- main.nf | 58 ++++++++++++++++------- subworkflows/goodwright/rna_align/main.nf | 8 +++- 3 files changed, 82 insertions(+), 25 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 53b5cfb..5cc1a77 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -268,7 +268,7 @@ if(params.run_alignment) { path: { 
"${params.outdir}/02_alignment/smrna" }, mode: "${params.publish_dir_mode}", pattern: '*.bam', - enabled: false + enabled: params.save_align_intermed ], [ path: { "${params.outdir}/02_alignment/smrna/unmapped" }, @@ -405,9 +405,9 @@ if(params.run_read_filter) { if(params.run_umi_dedup) { process { - withName: 'CLIPSEQ:GENOME_DEDUP:UMICOLLAPSE' { + withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:UMICOLLAPSE' { ext.args = { "--umi-sep ${params.umi_separator}" } - ext.prefix = { "${meta.id}.genome.dedup" } + ext.prefix = { "${meta.id}.unique_genome.dedup" } publishDir = [ path: { "${params.outdir}/03_filt_dedup" }, mode: "${params.publish_dir_mode}", @@ -415,7 +415,9 @@ if(params.run_umi_dedup) { ] } - withName: 'CLIPSEQ:GENOME_DEDUP:SAMTOOLS_INDEX' { + withName: 'CLIPSEQ:GENOME_MULTI_DEDUP:UMICOLLAPSE' { + ext.args = { "--umi-sep ${params.umi_separator}" } + ext.prefix = { "${meta.id}.multi_genome.dedup" } publishDir = [ path: { "${params.outdir}/03_filt_dedup" }, mode: "${params.publish_dir_mode}", @@ -423,12 +425,37 @@ if(params.run_umi_dedup) { ] } - withName: 'CLIPSEQ:GENOME_DEDUP:BAM_STATS_SAMTOOLS:.*' { - ext.prefix = { "${meta.id}.genome.dedup" } + withName: 'CLIPSEQ:SMRNA_DEDUP:UMICOLLAPSE' { + ext.args = { "--umi-sep ${params.umi_separator}" } + ext.prefix = { "${meta.id}.smrna.dedup" } publishDir = [ path: { "${params.outdir}/03_filt_dedup" }, mode: "${params.publish_dir_mode}", - pattern: "*.{stats,flagstat,idxstats}" + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:GENOME_MULTI_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'CLIPSEQ:SMRNA_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } diff --git a/main.nf b/main.nf index df58fce..d60e54b 100644 --- a/main.nf +++ b/main.nf @@ -125,7 +125,9 @@ include { PREPARE_CLIPSEQ } from './subwor include { PARSE_FASTQ_INPUT } from './subworkflows/goodwright/parse_fastq_input/main' include { FASTQC_TRIMGALORE } from './subworkflows/goodwright/fastqc_trimgalore/main' include { RNA_ALIGN } from './subworkflows/goodwright/rna_align/main' -include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_UNIQUE_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' @@ -315,13 +317,17 @@ workflow CLIPSEQ { ch_filtered_gtf, ch_fasta ) - ch_versions = ch_versions.mix(RNA_ALIGN.out.versions) - ch_genome_bam = RNA_ALIGN.out.genome_bam - ch_genome_bai = RNA_ALIGN.out.genome_bai - ch_transcript_bam = RNA_ALIGN.out.transcript_bam - ch_transcript_bai = RNA_ALIGN.out.transcript_bai - ch_bt_log = RNA_ALIGN.out.bt_log - ch_star_log = RNA_ALIGN.out.star_log_final + ch_versions = ch_versions.mix(RNA_ALIGN.out.versions) + ch_genome_unique_bam = 
RNA_ALIGN.out.genome_unique_bam + ch_genome_unique_bai = RNA_ALIGN.out.genome_unique_bai + ch_genome_multi_bam = RNA_ALIGN.out.genome_multi_bam + ch_genome_multi_bai = RNA_ALIGN.out.genome_multi_bai + ch_smrna_bam = RNA_ALIGN.out.smrna_bam + ch_smrna_bai = RNA_ALIGN.out.smrna_bai + ch_transcript_bam = RNA_ALIGN.out.transcript_bam + ch_transcript_bai = RNA_ALIGN.out.transcript_bai + ch_bt_log = RNA_ALIGN.out.bt_log + ch_star_log = RNA_ALIGN.out.star_log_final } if(params.run_read_filter) { @@ -360,9 +366,19 @@ workflow CLIPSEQ { /* * CHANNEL: Combine bam and bai files on id */ - ch_genome_bam_bai = ch_genome_bam + ch_genome_unique_bam_bai = ch_genome_unique_bam .map { row -> [row[0].id, row ].flatten()} - .join ( ch_genome_bai.map { row -> [row[0].id, row ].flatten()} ) + .join ( ch_genome_unique_bai.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + + ch_genome_multi_bam_bai = ch_genome_multi_bam + .map { row -> [row[0].id, row ].flatten()} + .join ( ch_genome_multi_bai.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + + ch_smrna_bam_bai = ch_smrna_bam + .map { row -> [row[0].id, row ].flatten()} + .join ( ch_smrna_bai.map { row -> [row[0].id, row ].flatten()} ) .map { row -> [row[1], row[2], row[4]] } ch_transcript_bam_bai = ch_transcript_bam @@ -373,13 +389,23 @@ workflow CLIPSEQ { /* * SUBWORKFLOW: Run umi deduplication on genome-level alignments */ - GENOME_DEDUP ( - ch_genome_bam_bai + GENOME_UNIQUE_DEDUP ( + ch_genome_unique_bam_bai + ) + ch_versions = ch_versions.mix(GENOME_UNIQUE_DEDUP.out.versions) + ch_genome_bam = GENOME_UNIQUE_DEDUP.out.bam + ch_genome_bai = GENOME_UNIQUE_DEDUP.out.bai + ch_umi_log = GENOME_UNIQUE_DEDUP.out.umi_log + + GENOME_MULTI_DEDUP ( + ch_genome_multi_bam_bai + ) + ch_versions = ch_versions.mix(GENOME_MULTI_DEDUP.out.versions) + + SMRNA_DEDUP ( + ch_smrna_bam_bai ) - ch_versions = ch_versions.mix(GENOME_DEDUP.out.versions) - ch_genome_bam = 
GENOME_DEDUP.out.bam - ch_genome_bai = GENOME_DEDUP.out.bai - ch_umi_log = GENOME_DEDUP.out.umi_log + ch_versions = ch_versions.mix(SMRNA_DEDUP.out.versions) /* * SUBWORKFLOW: Run umi deduplication on transcript-level alignments diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf index 5f3f854..ec03724 100644 --- a/subworkflows/goodwright/rna_align/main.nf +++ b/subworkflows/goodwright/rna_align/main.nf @@ -122,9 +122,13 @@ workflow RNA_ALIGN { bt_log = BOWTIE_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log = STAR_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log_final = STAR_ALIGN.out.log_final // channel: [ val(meta), [ txt ] ] - genome_bam = SAMTOOLS_VIEW_GENOME.out.bam // channel: [ val(meta), [ bam ] ] - genome_bai = SAMTOOLS_VIEW_GENOME.out.bai // channel: [ val(meta), [ bai ] ] + genome_unique_bam = SAMTOOLS_VIEW_GENOME.out.bam // channel: [ val(meta), [ bam ] ] + genome_unique_bai = SAMTOOLS_VIEW_GENOME.out.bai // channel: [ val(meta), [ bai ] ] + genome_multi_bam = STAR_ALIGN.out.bam_sorted // channel: [ val(meta), [ bam ] ] + genome_multi_bai = SAMTOOLS_INDEX_GENOME.out.bai // channel: [ val(meta), [ bai ] ] transcript_bam = SAMTOOLS_VIEW_TRANSCRIPT.out.bam // channel: [ val(meta), [ bam ] ] transcript_bai = SAMTOOLS_VIEW_TRANSCRIPT.out.bai // channel: [ val(meta), [ bai ] ] + smrna_bam = SAMTOOLS_SORT_SMRNA.out.bam // channel: [ val(meta), [ bam ] ] + smrna_bai = SAMTOOLS_INDEX_SMRNA.out.bai // channel: [ val(meta), [ bai ] ] versions = ch_versions // channel: [ versions.yml ] } From a99fc3c6fc604080c7fe8898c5f31fc056fa25d0 Mon Sep 17 00:00:00 2001 From: Chris Cheshire Date: Thu, 16 Nov 2023 11:33:21 +0000 Subject: [PATCH 04/23] Update schema --- schema/clipseq.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/clipseq.json b/schema/clipseq.json index 1fe89cc..ba9b1e1 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -302,7 +302,7 @@ }, "cli_options": 
{ "name": "Command Line Options", - "description": "Proide customised command line options to specific processes", + "description": "Provide customised command line options to specific processes", "advanced": true, "properties": { "bowtie_params": { From c357a3a2158acbe052b62a41f06a72bb368d8f23 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 28 Nov 2023 16:42:02 +0000 Subject: [PATCH 05/23] add trna/rrna premap count to icount_summary output --- conf/modules.config | 86 ++++++++++++++++++- main.nf | 30 +++++++ .../goodwright/clipseq/merge_summary/main.nf | 29 +++++++ .../goodwright/clipseq/merge_summary/meta.yml | 52 +++++++++++ .../merge_summary/templates/merge_summary.py | 66 ++++++++++++++ schema/clipseq.json | 2 +- .../goodwright/icount_analyse/main.nf | 15 +++- subworkflows/goodwright/rna_align/main.nf | 23 ++++- 8 files changed, 298 insertions(+), 5 deletions(-) create mode 100644 modules/goodwright/clipseq/merge_summary/main.nf create mode 100644 modules/goodwright/clipseq/merge_summary/meta.yml create mode 100644 modules/goodwright/clipseq/merge_summary/templates/merge_summary.py diff --git a/conf/modules.config b/conf/modules.config index 5cc1a77..8d33bbd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -279,6 +279,31 @@ if(params.run_alignment) { ] } + withName: 'CLIPSEQ:RNA_ALIGN:BOWTIE_ALIGN_K1' { + ext.args = { "-v 2 -m 100 --norc --best --strata -k 1" } + ext.prefix = "_withK1" + publishDir = [ + [ + path: { "${params.outdir}/02_alignment/smrna" }, + mode: "${params.publish_dir_mode}", + pattern: '*.out', + enabled: true + ], + [ + path: { "${params.outdir}/02_alignment/smrna" }, + mode: "${params.publish_dir_mode}", + pattern: '*.bam', + enabled: false + ], + [ + path: { "${params.outdir}/02_alignment/smrna/unmapped" }, + mode: "${params.publish_dir_mode}", + pattern: '*.fastq.gz', + enabled: false + ] + ] + } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_SMRNA' { ext.prefix = { "${meta.id}_sorted" } publishDir = [ @@ 
-435,6 +460,16 @@ if(params.run_umi_dedup) { ] } + withName: 'CLIPSEQ:SMRNA_K1_DEDUP:UMICOLLAPSE' { + ext.args = { "--umi-sep ${params.umi_separator}" } + ext.prefix = { "${meta.id}.smrna_withk1.dedup" } + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:SAMTOOLS_INDEX' { publishDir = [ path: { "${params.outdir}/03_filt_dedup" }, @@ -459,6 +494,14 @@ if(params.run_umi_dedup) { ] } + withName: 'CLIPSEQ:SMRNA_K1_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CLIPSEQ:TRANSCRIPT_DEDUP:UMICOLLAPSE' { ext.args = { "--umi-sep ${params.umi_separator}" } ext.prefix = { "${meta.id}.transcript.dedup" } @@ -608,6 +651,37 @@ if(params.run_calc_crosslinks) { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:MERGE_AND_SORT' { + ext.cmd1 = 'sort -k1,1 -k2,2n' + ext.suffix = '.smrna_withk1' + ext.ext = 'bed' + publishDir = [ + path: { "${params.outdir}/04_crosslinks" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_COVERAGE' { + ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' + ext.suffix = '.smrna_withk1' + ext.ext = 'bedgraph' + publishDir = [ + path: { "${params.outdir}/04_crosslinks" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { + ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' + ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' + ext.suffix = '.norm.smrna_withk1' + ext.ext = 'bedgraph' + publishDir = [ + enabled: false + ] + } } } @@ -709,8 +783,13 @@ if(params.run_peak_calling) { ] } - withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SUMMARY' { + publishDir = [ + enabled: false + ] + } + + withName: 'CLIPSEQ:ICOUNT_ANALYSE:MERGE_SUMMARY' { publishDir = [ path: { "${params.outdir}/05_peak_calling/icount" }, mode: "${params.publish_dir_mode}", @@ -728,7 +807,10 @@ if(params.run_peak_calling) { withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SIGXLS' { publishDir = [ - enabled: false + path: { "${params.outdir}/05_peak_calling/icount" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + pattern: "*.scores.tsv" ] } diff --git a/main.nf b/main.nf index d60e54b..7d32840 100644 --- a/main.nf +++ b/main.nf @@ -128,7 +128,9 @@ include { RNA_ALIGN } from './subwor include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_UNIQUE_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_K1_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { CLIP_CALC_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { PARACLU_ANALYSE as PARACLU_ANALYSE_GENOME } from './subworkflows/goodwright/paraclu_analyse/main' @@ -324,6 +326,8 @@ workflow CLIPSEQ { ch_genome_multi_bai = RNA_ALIGN.out.genome_multi_bai ch_smrna_bam = RNA_ALIGN.out.smrna_bam ch_smrna_bai = RNA_ALIGN.out.smrna_bai + ch_smrna_k1_bam = RNA_ALIGN.out.smrna_k1_bam + ch_smrna_k1_bai = RNA_ALIGN.out.smrna_k1_bai ch_transcript_bam = RNA_ALIGN.out.transcript_bam ch_transcript_bai = RNA_ALIGN.out.transcript_bai ch_bt_log = RNA_ALIGN.out.bt_log @@ -381,6 +385,11 @@ workflow CLIPSEQ { .join ( ch_smrna_bai.map { row -> [row[0].id, row ].flatten()} ) .map { row -> [row[1], row[2], row[4]] } + ch_smrna_k1_bam_bai = ch_smrna_k1_bam + .map { row -> [row[0].id, row ].flatten()} + .join ( ch_smrna_k1_bai.map { row -> [row[0].id, row ].flatten()} ) + .map { 
row -> [row[1], row[2], row[4]] } + ch_transcript_bam_bai = ch_transcript_bam .map { row -> [row[0].id, row ].flatten()} .join ( ch_transcript_bai.map { row -> [row[0].id, row ].flatten()} ) @@ -407,6 +416,14 @@ workflow CLIPSEQ { ) ch_versions = ch_versions.mix(SMRNA_DEDUP.out.versions) + SMRNA_K1_DEDUP ( + ch_smrna_k1_bam_bai + ) + ch_versions = ch_versions.mix(SMRNA_K1_DEDUP.out.versions) + ch_smrna_k1_bam = SMRNA_K1_DEDUP.out.bam + ch_smrna_k1_bai = SMRNA_K1_DEDUP.out.bai + ch_umi_log = SMRNA_K1_DEDUP.out.umi_log + /* * SUBWORKFLOW: Run umi deduplication on transcript-level alignments */ @@ -425,6 +442,18 @@ workflow CLIPSEQ { ch_trans_crosslink_coverage = Channel.empty() ch_trans_crosslink_coverage_norm = Channel.empty() if(params.run_calc_crosslinks) { + /* + * SUBWORKFLOW: Run crosslink calculation for smRNA with -k 1 + */ + CALC_SMRNA_K1_CROSSLINKS ( + ch_smrna_k1_bam, + ch_smrna_fasta_fai.collect{ it[1] } + ) + ch_versions = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions) + ch_smrna_crosslink_bed = CALC_SMRNA_K1_CROSSLINKS.out.bed + ch_smrna_crosslink_coverage = CALC_SMRNA_K1_CROSSLINKS.out.coverage + ch_smrna_crosslink_coverage_norm = CALC_SMRNA_K1_CROSSLINKS.out.coverage_norm + /* * SUBWORKFLOW: Run crosslink calculation for genome */ @@ -489,6 +518,7 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run iCount on genome-level crosslinks */ ICOUNT_ANALYSE ( + ch_smrna_crosslink_bed, ch_genome_crosslink_bed, ch_regions_resolved_gtf.collect{ it[1] }, ch_seg_resolved_gtf.collect{ it[1] }, diff --git a/modules/goodwright/clipseq/merge_summary/main.nf b/modules/goodwright/clipseq/merge_summary/main.nf new file mode 100644 index 0000000..966e244 --- /dev/null +++ b/modules/goodwright/clipseq/merge_summary/main.nf @@ -0,0 +1,29 @@ +process MERGE_SUMMARY { + tag "$gtf" + label "process_single" + + conda "conda-forge::pandas=1.4.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.4.3': + 'biocontainers/pandas:1.4.3' }" + + input: + tuple val(meta), path(summary_type) + tuple val(meta), path(summary_subtype) + tuple val(meta), path(summary_gene) + tuple val(meta), path(smrna_premapped_k1_cDNA) + + output: + tuple val(meta), path("*summary_type_premapadjusted.tsv") , emit: summary_type_adjusted + tuple val(meta), path("*summary_subtype_premapadjusted.tsv"), emit: summary_subtype_adjusted + tuple val(meta), path("*summary_gene_premapadjusted.tsv") , emit: summary_gene_adjusted + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + process_name = task.process + template 'merge_summary.py' +} + diff --git a/modules/goodwright/clipseq/merge_summary/meta.yml b/modules/goodwright/clipseq/merge_summary/meta.yml new file mode 100644 index 0000000..cdbb89e --- /dev/null +++ b/modules/goodwright/clipseq/merge_summary/meta.yml @@ -0,0 +1,52 @@ +name: merge_summary +description: Merge results of pre-mapping with results of iCount summary +tools: + - pandas: + description: | + Flexible and powerful data analysis / manipulation library for Python, + providing labeled data structures similar to R data.frame objects, + statistical functions, and much more. 
+ homepage: https://pandas.pydata.org/ + documentation: https://pandas.pydata.org/docs/ + licence: ["BSD-3"] +input: + - summary_type: + type: file + description: Output from iCount Summary + pattern: "*.tsv" + - summary_subtype: + type: file + description: Output from iCount Summary + pattern: "*.tsv" + - summary_gene: + type: file + description: Output from iCount Summary + pattern: "*.tsv" + - smrna_premapped_k1_cDNA: + type: file + description: smRNA premapped k1 cDNA (deduplicated) bed file + pattern: "*.bed" + - smrna_premapped_k1_reads_log: + type: file + description: smRNA premapped k1 reads bowtie log to get read number before deduplication + pattern: "*.out" + +output: + - summary_type_adjusted: + type: file + description: Output from iCount Summary adjusted with pre-mapping + pattern: "*.tsv" + - summary_subtype_adjusted: + type: file + description: Output from iCount Summary adjusted with pre-mapping + pattern: "*.tsv" + - summary_gene_adjusted: + type: file + description: Output from iCount Summary adjusted with pre-mapping + pattern: "*.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@charlotteanne" diff --git a/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py b/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py new file mode 100644 index 0000000..2cc032a --- /dev/null +++ b/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +"""Merge pre-mapped results with genome-computed iCount Summary""" + +import platform +import argparse +from sys import exit +import pandas as pd +import os + + +def dump_versions(process_name): + with open("versions.yml", "w") as out_f: + out_f.write(process_name + ":\n") + out_f.write(" python: " + platform.python_version() + "\n") + out_f.write(" pandas: " + pd.__version__ + "\n") + +def extract_cdna_from_bed(file_path): + # Read the file into a DataFrame + df 
= pd.read_csv(file_path, sep='\t', header=None) + # Sum the values in the 5th column (index 4) + total_sum = df[4].sum() + return total_sum + +def adjust_summary_file(file_path, number_cdnas_premapped): + # Read the file into a DataFrame + df = pd.read_csv(file_path, sep='\t', header=0) + # Add the new values + new_row = ["premapped rRNA_tRNA", "NA", number_cdnas_premapped, 0] + # Append the new row using loc indexer + df.loc[len(df)] = new_row + # Correct the percentages + # Calculate the total cDNA # + total_cDNA = df['cDNA #'].sum() + # Update the cDNA % column + df['cDNA %'] = (df['cDNA #'] / total_cDNA) * 100 + # Create the output file name + base_name, extension = os.path.splitext(os.path.basename(file_path)) + output_file = base_name + "_premapadjusted" + extension + print(f"Saving to: {output_file}") # Debugging + # Write the updated DataFrame to the new file + df.to_csv(output_file, sep='\t', index=False) + +def main(processname, subtype, type, gene, cdna): + # Dump version file + dump_versions(processname) + + # Get number of cDNAs + number_cdnas_premapped = extract_cdna_from_bed(cdna) + print("Number of cDNAs premapped: " + str(number_cdnas_premapped)) + + adjust_summary_file(type, number_cdnas_premapped) + adjust_summary_file(subtype, number_cdnas_premapped) + adjust_summary_file(gene, number_cdnas_premapped) + +if __name__ == "__main__": + # Allows switching between nextflow templating and standalone python running using arguments + parser = argparse.ArgumentParser() + parser.add_argument("--processname", default="!{process_name}") + parser.add_argument("--subtype", default="!{summary_subtype}") + parser.add_argument("--type", default="!{summary_type}") + parser.add_argument("--gene", default="!{summary_gene}") + parser.add_argument("--cdna", default="!{smrna_premapped_k1_cDNA}") + args = parser.parse_args() + + main(args.processname, args.subtype, args.type, args.gene, args.cdna) diff --git a/schema/clipseq.json b/schema/clipseq.json index 
b9666d0..c56f845 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -353,7 +353,7 @@ "name": "Crosslink summary", "description": "Crosslinks summarised by gene, type (eg. CDS, intron) and subtype (eg. lncRNA, mRNA).", "filetype": "tsv", - "process": "ICOUNT_SUMMARY" + "process": "MERGE_SUMMARY" }, { "name": "K-mer enrichment", diff --git a/subworkflows/goodwright/icount_analyse/main.nf b/subworkflows/goodwright/icount_analyse/main.nf index b6e5ab5..2ec3ff6 100644 --- a/subworkflows/goodwright/icount_analyse/main.nf +++ b/subworkflows/goodwright/icount_analyse/main.nf @@ -5,15 +5,18 @@ /* * MODULES */ + include { ICOUNT_SUMMARY } from '../../../modules/goodwright/icount/summary/main.nf' include { ICOUNT_RNAMAPS } from '../../../modules/goodwright/icount/rnamaps/main.nf' include { ICOUNT_SIGXLS } from '../../../modules/goodwright/icount/sigxls/main.nf' include { ICOUNT_PEAKS } from '../../../modules/goodwright/icount/peaks/main.nf' include { GUNZIP as GUNZIP_SIGXLS } from '../../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_PEAKS } from '../../../modules/nf-core/gunzip/main.nf' +include { MERGE_SUMMARY } from '../../../modules/goodwright/clipseq/merge_summary/main.nf' workflow ICOUNT_ANALYSE { take: + smrna_bed // channel: [ val(meta), [ bed ] ] bed // channel: [ val(meta), [ bed ] ] gtf_regions // channel: [ [ gtf ] ] gtf_resolved // channel: [ [ gtf.gz ] ] @@ -23,14 +26,24 @@ workflow ICOUNT_ANALYSE { ch_versions = Channel.empty() /* - * MODULE: Run iCount summary + * MODULE: Run iCount summary */ + + ICOUNT_SUMMARY ( bed, gtf_regions ) ch_versions = ch_versions.mix(ICOUNT_SUMMARY.out.versions) + MERGE_SUMMARY ( + ICOUNT_SUMMARY.out.summary_type, + ICOUNT_SUMMARY.out.summary_subtype, + ICOUNT_SUMMARY.out.summary_gene, + smrna_bed + ) + ch_versions = ch_versions.mix(MERGE_SUMMARY.out.versions) + /* * MODULE: Run iCount rnamaps */ diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf index 
ec03724..ae1fc17 100644 --- a/subworkflows/goodwright/rna_align/main.nf +++ b/subworkflows/goodwright/rna_align/main.nf @@ -7,10 +7,13 @@ * MODULES */ include { BOWTIE_ALIGN } from '../../../modules/nf-core/bowtie/align/main.nf' +include { BOWTIE_ALIGN as BOWTIE_ALIGN_K1 } from '../../../modules/nf-core/bowtie/align/main.nf' include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main.nf' include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT } from '../../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA_K1 } from '../../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA_K1 } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_GENOME } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_GENOME } from '../../../modules/nf-core/samtools/view/main' @@ -42,6 +45,23 @@ workflow RNA_ALIGN { SAMTOOLS_INDEX_SMRNA ( SAMTOOLS_SORT_SMRNA.out.bam ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA.out.versions) + /* + * MODULE: Align reads to smrna genome, here allowing 100 multimappers but only reporting one alignment per multimapped read + * so that we can accurately count it in the crosslink summary later + */ + + BOWTIE_ALIGN_K1 ( + fastq, + bt_index.collect{it[1]} + ) + ch_versions = ch_versions.mix(BOWTIE_ALIGN_K1.out.versions) + + SAMTOOLS_SORT_SMRNA_K1 ( BOWTIE_ALIGN_K1.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_SMRNA_K1.out.versions) + + SAMTOOLS_INDEX_SMRNA_K1 ( SAMTOOLS_SORT_SMRNA_K1.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA_K1.out.versions) + /* * MODULE: Align 
reads that did not align to the smrna genome to the primary genome */ @@ -118,7 +138,6 @@ workflow RNA_ALIGN { ) emit: - bt_bam = BOWTIE_ALIGN.out.bam // channel: [ val(meta), [ bam ] ] bt_log = BOWTIE_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log = STAR_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log_final = STAR_ALIGN.out.log_final // channel: [ val(meta), [ txt ] ] @@ -130,5 +149,7 @@ workflow RNA_ALIGN { transcript_bai = SAMTOOLS_VIEW_TRANSCRIPT.out.bai // channel: [ val(meta), [ bai ] ] smrna_bam = SAMTOOLS_SORT_SMRNA.out.bam // channel: [ val(meta), [ bam ] ] smrna_bai = SAMTOOLS_INDEX_SMRNA.out.bai // channel: [ val(meta), [ bai ] ] + smrna_k1_bam = SAMTOOLS_SORT_SMRNA_K1.out.bam // channel: [ val(meta), [ bam ] ] + smrna_k1_bai = SAMTOOLS_INDEX_SMRNA_K1.out.bai // channel: [ val(meta), [ bai ] ] versions = ch_versions // channel: [ versions.yml ] } From 614e7251b9c19ebc3f214755496ef38d47c4c3b8 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 28 Nov 2023 16:44:40 +0000 Subject: [PATCH 06/23] expose all trim galore args --- conf/modules.config | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8d33bbd..0a876e8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -225,7 +225,7 @@ if(params.run_trim_galore_fastqc && !params.skip_fastqc) { if(params.run_trim_galore_fastqc && !params.skip_trimming) { process { withName: 'CLIPSEQ:FASTQC_TRIMGALORE:TRIMGALORE' { - ext.args = "--fastqc --length ${params.trim_length} -q 20" + ext.args = "${params.trimgalore_params}" publishDir = [ [ path: { "${params.outdir}/01_prealign/post_trim_fastqc" }, diff --git a/nextflow.config b/nextflow.config index f516083..6bbe635 100644 --- a/nextflow.config +++ b/nextflow.config @@ -72,9 +72,9 @@ params { move_umi_to_header = false umi_header_format = null save_unaligned = true // DO NOT CHANGE - trim_length = 10 umi_separator = "rbc:" paraclu_min_value 
= 10 + trimgalore_params = "--fastqc --length 10 -q 20" bowtie_params = "-v 2 -m 100 --norc --best --strata" star_params = "--outFilterMultimapNmax 100 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic" clippy_params = "" From fcb224e37979fdb9cae1aeb93f2f985d47db5b2b Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 28 Nov 2023 17:05:26 +0000 Subject: [PATCH 07/23] tidy up the output folders --- conf/modules.config | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0a876e8..86b6467 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -268,7 +268,7 @@ if(params.run_alignment) { path: { "${params.outdir}/02_alignment/smrna" }, mode: "${params.publish_dir_mode}", pattern: '*.bam', - enabled: params.save_align_intermed + enabled: false ], [ path: { "${params.outdir}/02_alignment/smrna/unmapped" }, @@ -281,7 +281,7 @@ if(params.run_alignment) { withName: 'CLIPSEQ:RNA_ALIGN:BOWTIE_ALIGN_K1' { ext.args = { "-v 2 -m 100 --norc --best --strata -k 1" } - ext.prefix = "_withK1" + ext.prefix = { "${meta.id}_withK1" } publishDir = [ [ path: { "${params.outdir}/02_alignment/smrna" }, @@ -322,6 +322,18 @@ if(params.run_alignment) { ] } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_SMRNA_K1' { + publishDir = [ + enabled: false + ] + } + + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_SMRNA_K1' { + publishDir = [ + enabled: false + ] + } + withName: 'CLIPSEQ:RNA_ALIGN:STAR_ALIGN' { ext.args = { "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM ${params.star_params}" } @@ -355,6 +367,14 @@ if(params.run_alignment) { ] } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' { + publishDir = [ + path: { "${params.outdir}/02_alignment/genome" }, 
+ mode: "${params.publish_dir_mode}", + enabled: params.save_align_intermed + ] + } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_TRANSCRIPT' { ext.prefix = { "${meta.id}_multi.Aligned.toTranscriptome_sorted.out" } publishDir = [ From f49ba741ba890017d02733349600e366b9251a03 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 28 Nov 2023 17:07:07 +0000 Subject: [PATCH 08/23] update flow schema --- schema/clipseq.json | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/schema/clipseq.json b/schema/clipseq.json index b79b832..1068df9 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -308,14 +308,19 @@ "description": "Provide customised command line options to specific processes", "advanced": true, "properties": { + "trimgalore_params": { + "name": "Trim Galore! parameters", + "description": "Parameters for Trim Galore! trimming. Defaults are -q 20 and minimum length 10 to keep reads.", + "type": "string" + }, "bowtie_params": { "name": "Bowtie parameters", - "description": "Mapping parameters for Bowtie pre-mapping", + "description": "Mapping parameters for Bowtie pre-mapping. Do not touch unless you know what you are doing!", "type": "string" }, "star_params": { "name": "STAR parameters", - "description": "Mapping parameters for STAR mapping", + "description": "Mapping parameters for STAR mapping. 
Do not touch unless you know what you are doing!", "type": "string" }, "clippy_params": { From de15a01bf1056a37b108e99ae494fda484b733f9 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 28 Nov 2023 17:12:32 +0000 Subject: [PATCH 09/23] get rid of min trim from schema, doesnt exist anymore --- schema/clipseq.json | 5 ----- 1 file changed, 5 deletions(-) diff --git a/schema/clipseq.json b/schema/clipseq.json index 1068df9..2ca3536 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -270,11 +270,6 @@ "description": "Additional pipeline configuration options.", "advanced": true, "properties": { - "trim_length": { - "name": "Minimum trim length.", - "description": "Minimum length of read to keep after Trim Galore! trimming.", - "type": "number" - }, "move_umi_to_header": { "name": "Extract UMI to header", "description": "Runs UMI to header extraction based on the head format provided in UMI header format.", From 6d3cd6734344f221d96fc09f89cf934aa27e668a Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 28 Nov 2023 19:33:37 +0000 Subject: [PATCH 10/23] test for icount summary publishing --- conf/modules.config | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 86b6467..258e24a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -805,7 +805,10 @@ if(params.run_peak_calling) { withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SUMMARY' { publishDir = [ - enabled: false + enabled: false, + path: null, + mode: null, + saveAs: null ] } From bf7db756eb22095fcadc1e9e51fffa5bc52ae954 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Thu, 30 Nov 2023 17:57:13 +0000 Subject: [PATCH 11/23] use samtools to sort multimapped genome bam instead of star to fix memory issues --- conf/modules.config | 16 +++++++++++----- subworkflows/goodwright/rna_align/main.nf | 11 +++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/conf/modules.config 
b/conf/modules.config index 258e24a..9301b85 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -336,7 +336,7 @@ if(params.run_alignment) { withName: 'CLIPSEQ:RNA_ALIGN:STAR_ALIGN' { - ext.args = { "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM ${params.star_params}" } + ext.args = { "--readFilesCommand zcat --outSAMtype BAM Unsorted --quantMode TranscriptomeSAM ${params.star_params}" } ext.prefix = { "${meta.id}_multi" } publishDir = [ [ @@ -367,6 +367,15 @@ if(params.run_alignment) { ] } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_GENOME' { + ext.prefix = { "${meta.id}_multi.Aligned.toGenome_sorted.out" } + publishDir = [ + path: { "${params.outdir}/02_alignment/genome" }, + mode: "${params.publish_dir_mode}", + enabled: params.save_align_intermed + ] + } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' { publishDir = [ path: { "${params.outdir}/02_alignment/genome" }, @@ -805,10 +814,7 @@ if(params.run_peak_calling) { withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SUMMARY' { publishDir = [ - enabled: false, - path: null, - mode: null, - saveAs: null + enabled: false ] } diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf index ae1fc17..f2c67a9 100644 --- a/subworkflows/goodwright/rna_align/main.nf +++ b/subworkflows/goodwright/rna_align/main.nf @@ -9,9 +9,10 @@ include { BOWTIE_ALIGN } from '../../../modules/nf-core/bowtie/align/main.nf' include { BOWTIE_ALIGN as BOWTIE_ALIGN_K1 } from '../../../modules/nf-core/bowtie/align/main.nf' include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main.nf' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_GENOME } from '../../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT } from '../../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA } from '../../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA_K1 } from 
'../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA_K1 } from '../../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA_K1 } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main' @@ -75,10 +76,12 @@ workflow RNA_ALIGN { ) ch_versions = ch_versions.mix(STAR_ALIGN.out.versions) + SAMTOOLS_SORT_GENOME ( STAR_ALIGN.out.bam ) + /* * MODULE: Index genome-level BAM file */ - SAMTOOLS_INDEX_GENOME ( STAR_ALIGN.out.bam_sorted ) + SAMTOOLS_INDEX_GENOME ( SAMTOOLS_SORT_GENOME.out.bam ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX_GENOME.out.versions.first()) /* @@ -94,7 +97,7 @@ workflow RNA_ALIGN { /* * CHANNEL: Join bam and bai files */ - ch_bam_bai = STAR_ALIGN.out.bam_sorted + ch_bam_bai = SAMTOOLS_SORT_GENOME.out.bam .join(SAMTOOLS_INDEX_GENOME.out.bai, by: [0], remainder: true) .join(SAMTOOLS_INDEX_GENOME.out.csi, by: [0], remainder: true) .map { @@ -143,7 +146,7 @@ workflow RNA_ALIGN { star_log_final = STAR_ALIGN.out.log_final // channel: [ val(meta), [ txt ] ] genome_unique_bam = SAMTOOLS_VIEW_GENOME.out.bam // channel: [ val(meta), [ bam ] ] genome_unique_bai = SAMTOOLS_VIEW_GENOME.out.bai // channel: [ val(meta), [ bai ] ] - genome_multi_bam = STAR_ALIGN.out.bam_sorted // channel: [ val(meta), [ bam ] ] + genome_multi_bam = SAMTOOLS_SORT_GENOME.out.bam // channel: [ val(meta), [ bam ] ] genome_multi_bai = SAMTOOLS_INDEX_GENOME.out.bai // channel: [ val(meta), [ bai ] ] transcript_bam = SAMTOOLS_VIEW_TRANSCRIPT.out.bam // channel: [ val(meta), [ bam ] ] transcript_bai = SAMTOOLS_VIEW_TRANSCRIPT.out.bai // channel: [ val(meta), [ bai ] ] From 1d0aa3a113c27aeeec549dec8c5002c2b576f2e8 Mon Sep 17 00:00:00 2001 From: slbai01 Date: Thu, 29 Feb 2024 22:16:06 +0800 Subject: [PATCH 12/23] 
avoid UMICOLLAPSE Error: java.lang.StackOverflowError --- modules/goodwright/umicollapse/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf index 355dcb8..8c2e7df 100644 --- a/modules/goodwright/umicollapse/main.nf +++ b/modules/goodwright/umicollapse/main.nf @@ -20,7 +20,7 @@ process UMICOLLAPSE { def prefix = task.ext.prefix ?: "${meta.id}" """ - java -jar /UMICollapse/umicollapse.jar \\ + java -Xss1G -jar /UMICollapse/umicollapse.jar \\ bam \\ -i $bam \\ -o ${prefix}.bam \\ From 591e8fb23cce2c6b1207f660ed2b9c408b90880c Mon Sep 17 00:00:00 2001 From: slbai01 Date: Thu, 29 Feb 2024 22:18:21 +0800 Subject: [PATCH 13/23] avoid samtools Error: Argument list too long --- modules/goodwright/samtools/simple_view/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/goodwright/samtools/simple_view/main.nf b/modules/goodwright/samtools/simple_view/main.nf index 945661a..f55b1b9 100644 --- a/modules/goodwright/samtools/simple_view/main.nf +++ b/modules/goodwright/samtools/simple_view/main.nf @@ -5,7 +5,8 @@ process SAMTOOLS_SIMPLE_VIEW { conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'biocontainers/samtools:1.16.1--h6899075_1' }" + 'mgibio/samtools-cwl:1.16.1' }" +// 'biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(input), path(index) @@ -35,14 +36,13 @@ process SAMTOOLS_SIMPLE_VIEW { input.getExtension() if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
""" - samtools \\ + xargs --arg-file=longest_transcript.txt samtools \\ view \\ --threads ${task.cpus-1} \\ ${reference} \\ $args \\ -o ${prefix}.${file_type} \\ $input \\ - `cat ${filter_file}` \\ $args2 cat <<-END_VERSIONS > versions.yml From 82203b4f5185fab7357c2b33f321c5d6d91cd58e Mon Sep 17 00:00:00 2001 From: slbai01 Date: Thu, 29 Feb 2024 22:27:57 +0800 Subject: [PATCH 14/23] Fix filename absolute references --- modules/goodwright/samtools/simple_view/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/goodwright/samtools/simple_view/main.nf b/modules/goodwright/samtools/simple_view/main.nf index f55b1b9..bb94cfb 100644 --- a/modules/goodwright/samtools/simple_view/main.nf +++ b/modules/goodwright/samtools/simple_view/main.nf @@ -36,7 +36,7 @@ process SAMTOOLS_SIMPLE_VIEW { input.getExtension() if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" """ - xargs --arg-file=longest_transcript.txt samtools \\ + xargs --arg-file=${filter_file} samtools \\ view \\ --threads ${task.cpus-1} \\ ${reference} \\ From 720a90a4fd30e35ef6a57a304ab5a150f868d986 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Wed, 10 Apr 2024 11:06:16 +0100 Subject: [PATCH 15/23] update peka outputs --- modules/goodwright/peka/main.nf | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/goodwright/peka/main.nf b/modules/goodwright/peka/main.nf index 82c833f..127274d 100644 --- a/modules/goodwright/peka/main.nf +++ b/modules/goodwright/peka/main.nf @@ -16,11 +16,14 @@ process PEKA { path gtf output: - tuple val(meta), path("*mer_cluster_distribution*"), emit: cluster, optional: true - tuple val(meta), path("*mer_distribution*") , emit: distribution, optional: true - tuple val(meta), path("*rtxn*") , emit: rtxn, optional: true - tuple val(meta), path("*.pdf") , emit: pdf, optional: true - path "versions.yml" , emit: versions + tuple val(meta), 
path("*mer_cluster_distribution*") , emit: cluster, optional: true + tuple val(meta), path("*mer_distribution*") , emit: distribution, optional: true + tuple val(meta), path("*rtxn*") , emit: rtxn, optional: true + tuple val(meta), path("*.pdf") , emit: pdf, optional: true + tuple val(meta), path("*thresholded_sites*.bed.gz") , emit: tsites, optional: true + tuple val(meta), path("*oxn*.bed.gz") , emit: oxn, optional: true + tuple val(meta), path("*_clusters.csv") , emit: clust, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when From 653b3453e360db2eed6594f461eae61b9f4fff00 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Wed, 24 Apr 2024 16:19:17 +0100 Subject: [PATCH 16/23] revert changes on samtools simple view --- modules/goodwright/samtools/simple_view/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/goodwright/samtools/simple_view/main.nf b/modules/goodwright/samtools/simple_view/main.nf index bb94cfb..945661a 100644 --- a/modules/goodwright/samtools/simple_view/main.nf +++ b/modules/goodwright/samtools/simple_view/main.nf @@ -5,8 +5,7 @@ process SAMTOOLS_SIMPLE_VIEW { conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'mgibio/samtools-cwl:1.16.1' }" -// 'biocontainers/samtools:1.16.1--h6899075_1' }" + 'biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(input), path(index) @@ -36,13 +35,14 @@ process SAMTOOLS_SIMPLE_VIEW { input.getExtension() if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
""" - xargs --arg-file=${filter_file} samtools \\ + samtools \\ view \\ --threads ${task.cpus-1} \\ ${reference} \\ $args \\ -o ${prefix}.${file_type} \\ $input \\ + `cat ${filter_file}` \\ $args2 cat <<-END_VERSIONS > versions.yml From 461acb770b8457fca02cf91b139e93f9e67654a4 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 26 Apr 2024 14:41:11 +0100 Subject: [PATCH 17/23] update umicollapse --- modules/goodwright/umicollapse/main.nf | 61 ++++++++++++++++++++------ 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf index 8c2e7df..bdd2a5b 100644 --- a/modules/goodwright/umicollapse/main.nf +++ b/modules/goodwright/umicollapse/main.nf @@ -1,16 +1,21 @@ process UMICOLLAPSE { tag "$meta.id" label "process_high" + label "process_high_memory" - container 'docker.io/elly1502/umicollapse:latest' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/umicollapse:1.0.0--hdfd78af_1' : + 'biocontainers/umicollapse:1.0.0--hdfd78af_1' }" input: - tuple val(meta), path(bam), path(bai) + tuple val(meta), path(input), path(bai) output: - tuple val(meta), path("*.bam"), emit: bam - tuple val(meta), path("*.log"), emit: log - path "versions.yml" , emit: versions + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*dedup*fastq.gz"), emit: fastq, optional: true + tuple val(meta), path("*_UMICollapse.log"), emit: log + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -18,19 +23,49 @@ process UMICOLLAPSE { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - + def VERSION = '1.0.0-1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
+ // Memory allocation: We need to make sure that both heap and stack size is sufficiently large for + // umicollapse. We set the stack size to 5% of the available memory, the heap size to 90% + // which leaves 5% for stuff happening outside of java without the scheduler killing the process. + def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue() + def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max + if ( mode !in [ 'fastq', 'bam' ] ) { + error "Mode must be one of 'fastq' or 'bam'." + } + extension = mode.contains("fastq") ? "fastq.gz" : "bam" """ - java -Xss1G -jar /UMICollapse/umicollapse.jar \\ - bam \\ - -i $bam \\ - -o ${prefix}.bam \\ - $args + # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated + # by conda that allows to set the heap size (Xmx), but not the stack size (Xss). + # `which` allows us to get the directory that contains `umicollapse`, independent of whether we + # are in a container or conda environment. + UMICOLLAPSE_JAR=\$(dirname \$(which umicollapse))/../share/umicollapse-${VERSION}/umicollapse.jar + java \\ + -Xmx${max_heap_size_mega}M \\ + -Xss${max_stack_size_mega}M \\ + -jar \$UMICOLLAPSE_JAR \\ + -i ${input} \\ + -o ${prefix}.${extension} \\ + $args | tee ${prefix}_UMICollapse.log - mv .command.log ${prefix}_UMICollapse.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umicollapse: $VERSION + END_VERSIONS + """ + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.0-1' + if ( mode !in [ 'fastq', 'bam' ] ) { + error "Mode must be one of 'fastq' or 'bam'." + } + extension = mode.contains("fastq") ? 
"fastq.gz" : "bam" + """ + touch ${prefix}.dedup.${extension} + touch ${prefix}_UMICollapse.log cat <<-END_VERSIONS > versions.yml "${task.process}": - umicollapse: NA + umicollapse: $VERSION END_VERSIONS """ } From 84e81e3f6adffc3fe9930ba53f8cd345e493d184 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 26 Apr 2024 16:41:32 +0100 Subject: [PATCH 18/23] Update main.nf --- modules/goodwright/umicollapse/main.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf index bdd2a5b..ad26a21 100644 --- a/modules/goodwright/umicollapse/main.nf +++ b/modules/goodwright/umicollapse/main.nf @@ -29,10 +29,6 @@ process UMICOLLAPSE { // which leaves 5% for stuff happening outside of java without the scheduler killing the process. def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue() def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max - if ( mode !in [ 'fastq', 'bam' ] ) { - error "Mode must be one of 'fastq' or 'bam'." - } - extension = mode.contains("fastq") ? "fastq.gz" : "bam" """ # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated # by conda that allows to set the heap size (Xmx), but not the stack size (Xss). 
From 8837d1ea65bf89f1ef0fc3e98f3a452ad5b77250 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 26 Apr 2024 18:25:41 +0100 Subject: [PATCH 19/23] Update main.nf --- modules/goodwright/umicollapse/main.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf index ad26a21..85d8202 100644 --- a/modules/goodwright/umicollapse/main.nf +++ b/modules/goodwright/umicollapse/main.nf @@ -40,7 +40,7 @@ process UMICOLLAPSE { -Xss${max_stack_size_mega}M \\ -jar \$UMICOLLAPSE_JAR \\ -i ${input} \\ - -o ${prefix}.${extension} \\ + -o ${prefix}.bam \\ $args | tee ${prefix}_UMICollapse.log cat <<-END_VERSIONS > versions.yml @@ -55,9 +55,8 @@ process UMICOLLAPSE { if ( mode !in [ 'fastq', 'bam' ] ) { error "Mode must be one of 'fastq' or 'bam'." } - extension = mode.contains("fastq") ? "fastq.gz" : "bam" """ - touch ${prefix}.dedup.${extension} + touch ${prefix}.dedup.bam touch ${prefix}_UMICollapse.log cat <<-END_VERSIONS > versions.yml "${task.process}": From 3f9b9fc40679536a3003096e1352a0c34755dd68 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 26 Apr 2024 20:17:13 +0100 Subject: [PATCH 20/23] Update main.nf --- modules/goodwright/umicollapse/main.nf | 54 ++++++-------------------- 1 file changed, 12 insertions(+), 42 deletions(-) diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf index 85d8202..1e5f171 100644 --- a/modules/goodwright/umicollapse/main.nf +++ b/modules/goodwright/umicollapse/main.nf @@ -1,21 +1,16 @@ process UMICOLLAPSE { tag "$meta.id" label "process_high" - label "process_high_memory" - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/umicollapse:1.0.0--hdfd78af_1' : - 'biocontainers/umicollapse:1.0.0--hdfd78af_1' }" + container 'docker.io/elly1502/umicollapse:latest' input: - tuple val(meta), path(input), path(bai) + tuple val(meta), path(bam), path(bai) output: - tuple val(meta), path("*.bam"), emit: bam, optional: true - tuple val(meta), path("*dedup*fastq.gz"), emit: fastq, optional: true - tuple val(meta), path("*_UMICollapse.log"), emit: log - path "versions.yml" , emit: versions + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.log"), emit: log + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -23,44 +18,19 @@ process UMICOLLAPSE { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = '1.0.0-1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - // Memory allocation: We need to make sure that both heap and stack size is sufficiently large for - // umicollapse. We set the stack size to 5% of the available memory, the heap size to 90% - // which leaves 5% for stuff happening outside of java without the scheduler killing the process. - def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue() - def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max + """ - # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated - # by conda that allows to set the heap size (Xmx), but not the stack size (Xss). - # `which` allows us to get the directory that contains `umicollapse`, independent of whether we - # are in a container or conda environment. 
- UMICOLLAPSE_JAR=\$(dirname \$(which umicollapse))/../share/umicollapse-${VERSION}/umicollapse.jar - java \\ - -Xmx${max_heap_size_mega}M \\ - -Xss${max_stack_size_mega}M \\ - -jar \$UMICOLLAPSE_JAR \\ - -i ${input} \\ + java -Xmx184320M -jar /UMICollapse/umicollapse.jar \\ + bam \\ + -i $bam \\ -o ${prefix}.bam \\ - $args | tee ${prefix}_UMICollapse.log + $args - cat <<-END_VERSIONS > versions.yml - "${task.process}": - umicollapse: $VERSION - END_VERSIONS - """ + mv .command.log ${prefix}_UMICollapse.log - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = '1.0.0-1' - if ( mode !in [ 'fastq', 'bam' ] ) { - error "Mode must be one of 'fastq' or 'bam'." - } - """ - touch ${prefix}.dedup.bam - touch ${prefix}_UMICollapse.log cat <<-END_VERSIONS > versions.yml "${task.process}": - umicollapse: $VERSION + umicollapse: NA END_VERSIONS """ } From 35f883a56c59e547a992731984ab2a4da54b1ffd Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 29 Apr 2024 11:35:12 +0100 Subject: [PATCH 21/23] update to using nf-core umicollapse module --- modules.json | 5 + modules/goodwright/umicollapse/main.nf | 4 +- .../clipseq/merge_summary/main.nf | 0 .../clipseq/merge_summary/meta.yml | 0 .../merge_summary/templates/merge_summary.py | 0 modules/nf-core/umicollapse/environment.yml | 7 + modules/nf-core/umicollapse/main.nf | 73 +++++ modules/nf-core/umicollapse/meta.yml | 63 +++++ .../nf-core/umicollapse/tests/main.nf.test | 249 ++++++++++++++++++ .../umicollapse/tests/main.nf.test.snap | 124 +++++++++ .../nf-core/umicollapse/tests/nextflow.config | 8 + .../umicollapse/tests/nextflow_PE.config | 10 + .../umicollapse/tests/nextflow_SE.config | 10 + modules/nf-core/umicollapse/tests/tags.yml | 2 + .../bam_dedup_samtools_umitools/main.nf | 5 +- .../goodwright/icount_analyse/main.nf | 2 +- 16 files changed, 557 insertions(+), 5 deletions(-) rename modules/{goodwright => local}/clipseq/merge_summary/main.nf (100%) rename modules/{goodwright => 
local}/clipseq/merge_summary/meta.yml (100%) rename modules/{goodwright => local}/clipseq/merge_summary/templates/merge_summary.py (100%) create mode 100644 modules/nf-core/umicollapse/environment.yml create mode 100644 modules/nf-core/umicollapse/main.nf create mode 100644 modules/nf-core/umicollapse/meta.yml create mode 100644 modules/nf-core/umicollapse/tests/main.nf.test create mode 100644 modules/nf-core/umicollapse/tests/main.nf.test.snap create mode 100644 modules/nf-core/umicollapse/tests/nextflow.config create mode 100644 modules/nf-core/umicollapse/tests/nextflow_PE.config create mode 100644 modules/nf-core/umicollapse/tests/nextflow_SE.config create mode 100644 modules/nf-core/umicollapse/tests/tags.yml diff --git a/modules.json b/modules.json index e450015..3519b29 100644 --- a/modules.json +++ b/modules.json @@ -196,6 +196,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "umicollapse": { + "branch": "master", + "git_sha": "b97197968ac12dde2463fa54541f6350c46f2035", + "installed_by": ["modules"] + }, "umitools/extract": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf index 1e5f171..1090815 100644 --- a/modules/goodwright/umicollapse/main.nf +++ b/modules/goodwright/umicollapse/main.nf @@ -1,6 +1,6 @@ process UMICOLLAPSE { tag "$meta.id" - label "process_high" + label "process_medium" container 'docker.io/elly1502/umicollapse:latest' @@ -20,7 +20,7 @@ process UMICOLLAPSE { def prefix = task.ext.prefix ?: "${meta.id}" """ - java -Xmx184320M -jar /UMICollapse/umicollapse.jar \\ + java -jar /UMICollapse/umicollapse.jar \\ bam \\ -i $bam \\ -o ${prefix}.bam \\ diff --git a/modules/goodwright/clipseq/merge_summary/main.nf b/modules/local/clipseq/merge_summary/main.nf similarity index 100% rename from modules/goodwright/clipseq/merge_summary/main.nf rename to 
modules/local/clipseq/merge_summary/main.nf diff --git a/modules/goodwright/clipseq/merge_summary/meta.yml b/modules/local/clipseq/merge_summary/meta.yml similarity index 100% rename from modules/goodwright/clipseq/merge_summary/meta.yml rename to modules/local/clipseq/merge_summary/meta.yml diff --git a/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py b/modules/local/clipseq/merge_summary/templates/merge_summary.py similarity index 100% rename from modules/goodwright/clipseq/merge_summary/templates/merge_summary.py rename to modules/local/clipseq/merge_summary/templates/merge_summary.py diff --git a/modules/nf-core/umicollapse/environment.yml b/modules/nf-core/umicollapse/environment.yml new file mode 100644 index 0000000..8dbc65d --- /dev/null +++ b/modules/nf-core/umicollapse/environment.yml @@ -0,0 +1,7 @@ +name: umicollapse +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::umicollapse=1.0.0 diff --git a/modules/nf-core/umicollapse/main.nf b/modules/nf-core/umicollapse/main.nf new file mode 100644 index 0000000..dae290e --- /dev/null +++ b/modules/nf-core/umicollapse/main.nf @@ -0,0 +1,73 @@ +process UMICOLLAPSE { + tag "$meta.id" + label "process_high" + label "process_high_memory" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/umicollapse:1.0.0--hdfd78af_1' : + 'biocontainers/umicollapse:1.0.0--hdfd78af_1' }" + + input: + tuple val(meta), path(input), path(bai) + val(mode) + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*dedup*fastq.gz"), emit: fastq, optional: true + tuple val(meta), path("*_UMICollapse.log"), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.0-1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + // Memory allocation: We need to make sure that both heap and stack size is sufficiently large for + // umicollapse. We set the stack size to 5% of the available memory, the heap size to 90% + // which leaves 5% for stuff happening outside of java without the scheduler killing the process. + def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue() + def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max + if ( mode !in [ 'fastq', 'bam' ] ) { + error "Mode must be one of 'fastq' or 'bam'." + } + extension = mode.contains("fastq") ? "fastq.gz" : "bam" + """ + # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated + # by conda that allows to set the heap size (Xmx), but not the stack size (Xss). + # `which` allows us to get the directory that contains `umicollapse`, independent of whether we + # are in a container or conda environment. 
+ UMICOLLAPSE_JAR=\$(dirname \$(which umicollapse))/../share/umicollapse-${VERSION}/umicollapse.jar + java \\ + -Xmx${max_heap_size_mega}M \\ + -Xss${max_stack_size_mega}M \\ + -jar \$UMICOLLAPSE_JAR \\ + $mode \\ + -i ${input} \\ + -o ${prefix}.${extension} \\ + $args | tee ${prefix}_UMICollapse.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umicollapse: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.0-1' + if ( mode !in [ 'fastq', 'bam' ] ) { + error "Mode must be one of 'fastq' or 'bam'." + } + extension = mode.contains("fastq") ? "fastq.gz" : "bam" + """ + touch ${prefix}.dedup.${extension} + touch ${prefix}_UMICollapse.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umicollapse: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/umicollapse/meta.yml b/modules/nf-core/umicollapse/meta.yml new file mode 100644 index 0000000..c1361f9 --- /dev/null +++ b/modules/nf-core/umicollapse/meta.yml @@ -0,0 +1,63 @@ +--- +name: "umicollapse" +description: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: + - umicollapse + - deduplication + - genomics +tools: + - "umicollapse": + description: "UMICollapse contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs)." + homepage: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse" + documentation: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse" + tool_dev_url: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse" + doi: "10.7717/peerj.8275" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + BAM file containing reads to be deduplicated via UMIs. + pattern: "*.{bam}" + - bai: + type: file + description: | + BAM index files corresponding to the input BAM file. 
Optionally can be skipped using [] when using FastQ input. + pattern: "*.{bai}" + - mode: + type: string + description: | + Selects the mode of Umicollapse - either fastq or bam need to be provided. + pattern: "{fastq,bam}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with deduplicated UMIs. + pattern: "*.{bam}" + - log: + type: file + description: A log file with the deduplication statistics. + pattern: "*_{UMICollapse.log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@CharlotteAnne" + - "@chris-cheshire" +maintainers: + - "@CharlotteAnne" + - "@chris-cheshire" + - "@apeltzer" + - "@MatthiasZepper" diff --git a/modules/nf-core/umicollapse/tests/main.nf.test b/modules/nf-core/umicollapse/tests/main.nf.test new file mode 100644 index 0000000..2dec45b --- /dev/null +++ b/modules/nf-core/umicollapse/tests/main.nf.test @@ -0,0 +1,249 @@ +nextflow_process { + + name "Test Process UMICOLLAPSE" + script "../main.nf" + process "UMICOLLAPSE" + + tag "modules" + tag "modules_nfcore" + tag "umicollapse" + tag "umitools/extract" + tag "samtools/index" + tag "bwa/index" + tag "bwa/mem" + + test("umicollapse single end test") { + setup{ + run("UMITOOLS_EXTRACT"){ + script "../../umitools/extract/main.nf" + config "./nextflow_SE.config" + process{ + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + run("BWA_INDEX"){ + script "../../bwa/index/main.nf" + process{ + """ + input[0] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + """ + } + } + run("BWA_MEM"){ + script "../../bwa/mem/main.nf" + process{ + 
""" + input[0] = UMITOOLS_EXTRACT.out.reads + input[1] = BWA_INDEX.out.index + input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = true + """ + } + } + run("SAMTOOLS_INDEX"){ + script "../../samtools/index/main.nf" + process{ + """ + input[0] = BWA_MEM.out.bam + """ + } + } + } + + when { + config "./nextflow_SE.config" + process { + """ + input[0] = BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0]) + input[1] = 'bam' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.bam, + process.out.versions).match() } + ) + } + + } + + test("umicollapse paired tests") { + setup{ + run("UMITOOLS_EXTRACT"){ + script "../../umitools/extract/main.nf" + config "./nextflow_PE.config" + process{ + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + run("BWA_INDEX"){ + script "../../bwa/index/main.nf" + process{ + """ + input[0] = [ + [ id:'sarscov2'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + run("BWA_MEM"){ + script "../../bwa/mem/main.nf" + process{ + """ + input[0] = UMITOOLS_EXTRACT.out.reads + input[1] = BWA_INDEX.out.index + input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = true + """ + } + } + run("SAMTOOLS_INDEX"){ + script "../../samtools/index/main.nf" + process{ + """ + input[0] = BWA_MEM.out.bam + """ + } + } + } + + when { + config "./nextflow_PE.config" + process { + """ + input[0] = BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0]) + input[1] = 'bam' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.bam, + 
process.out.versions).match() } + ) + } + + } + + test("umicollapse fastq tests") { + + when { + config "./nextflow_SE.config" + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + [] + ] + input[1] = 'fastq' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.fastq, + process.out.versions).match() } + ) + } + } + + test("umicollapse stub tests") { + options "-stub-run" + setup{ + run("UMITOOLS_EXTRACT"){ + script "../../umitools/extract/main.nf" + config "./nextflow_PE.config" + process{ + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + run("BWA_INDEX"){ + script "../../bwa/index/main.nf" + process{ + """ + input[0] = [ + [ id:'sarscov2'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + run("BWA_MEM"){ + script "../../bwa/mem/main.nf" + process{ + """ + input[0] = UMITOOLS_EXTRACT.out.reads + input[1] = BWA_INDEX.out.index + input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = true + """ + } + } + run("SAMTOOLS_INDEX"){ + script "../../samtools/index/main.nf" + process{ + """ + input[0] = BWA_MEM.out.bam + """ + } + } + } + when { + config "./nextflow_PE.config" + process { + """ + input[0] = BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0]) + input[1] = 'bam' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/umicollapse/tests/main.nf.test.snap b/modules/nf-core/umicollapse/tests/main.nf.test.snap new file mode 
100644 index 0000000..861e9ca --- /dev/null +++ b/modules/nf-core/umicollapse/tests/main.nf.test.snap @@ -0,0 +1,124 @@ +{ + "umicollapse single end test": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.dedup.bam:md5,05c5331185263cbee6f508c0669be864" + ] + ], + [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-14T13:41:23.869211282" + }, + "umicollapse fastq tests": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.dedup.fastq.gz:md5,c9bac08c7fd8df3e0203e3eeafc73155" + ] + ], + [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-30T10:45:56.053352008" + }, + "umicollapse stub tests": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup.dedup.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup_UMICollapse.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup.dedup.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastq": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup_UMICollapse.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-30T10:46:12.482697713" + }, + "umicollapse paired tests": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup.bam:md5,f4f05467cb456309fe22851d8b4d4387" + ] + ], + [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": 
"23.10.1" + }, + "timestamp": "2024-03-14T13:41:54.486079388" + } +} \ No newline at end of file diff --git a/modules/nf-core/umicollapse/tests/nextflow.config b/modules/nf-core/umicollapse/tests/nextflow.config new file mode 100644 index 0000000..844edbd --- /dev/null +++ b/modules/nf-core/umicollapse/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN"' + } + withName: UMICOLLAPSE { + ext.prefix = { "${meta.id}.dedup" } + } +} \ No newline at end of file diff --git a/modules/nf-core/umicollapse/tests/nextflow_PE.config b/modules/nf-core/umicollapse/tests/nextflow_PE.config new file mode 100644 index 0000000..ae4c963 --- /dev/null +++ b/modules/nf-core/umicollapse/tests/nextflow_PE.config @@ -0,0 +1,10 @@ +process { + + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN" --bc-pattern2="NNNN"' + } + + withName: UMICOLLAPSE { + ext.prefix = { "${meta.id}.dedup" } + } +} diff --git a/modules/nf-core/umicollapse/tests/nextflow_SE.config b/modules/nf-core/umicollapse/tests/nextflow_SE.config new file mode 100644 index 0000000..d4b9443 --- /dev/null +++ b/modules/nf-core/umicollapse/tests/nextflow_SE.config @@ -0,0 +1,10 @@ +process { + + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN"' + } + + withName: UMICOLLAPSE { + ext.prefix = { "${meta.id}.dedup" } + } +} diff --git a/modules/nf-core/umicollapse/tests/tags.yml b/modules/nf-core/umicollapse/tests/tags.yml new file mode 100644 index 0000000..912879c --- /dev/null +++ b/modules/nf-core/umicollapse/tests/tags.yml @@ -0,0 +1,2 @@ +umicollapse: + - "modules/nf-core/umicollapse/**" diff --git a/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf b/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf index 0cb71f3..58ad0b6 100644 --- a/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf +++ b/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf @@ -2,7 +2,7 @@ * UMIcollapse, index BAM file and run samtools 
stats, flagstat and idxstats */ -include { UMICOLLAPSE } from '../../../modules/goodwright/umicollapse/main' +include { UMICOLLAPSE } from '../../../modules/nf-core/umicollapse/main' include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' workflow BAM_DEDUP_SAMTOOLS_UMITOOLS { @@ -16,7 +16,8 @@ workflow BAM_DEDUP_SAMTOOLS_UMITOOLS { * MODULE: UMI-tools collapse */ UMICOLLAPSE ( - bam_bai + bam_bai, + 'bam' ) ch_versions = ch_versions.mix(UMICOLLAPSE.out.versions) diff --git a/subworkflows/goodwright/icount_analyse/main.nf b/subworkflows/goodwright/icount_analyse/main.nf index 2ec3ff6..d2fabd6 100644 --- a/subworkflows/goodwright/icount_analyse/main.nf +++ b/subworkflows/goodwright/icount_analyse/main.nf @@ -12,7 +12,7 @@ include { ICOUNT_SIGXLS } from '../../../modules/goodwright/icount/sig include { ICOUNT_PEAKS } from '../../../modules/goodwright/icount/peaks/main.nf' include { GUNZIP as GUNZIP_SIGXLS } from '../../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_PEAKS } from '../../../modules/nf-core/gunzip/main.nf' -include { MERGE_SUMMARY } from '../../../modules/goodwright/clipseq/merge_summary/main.nf' +include { MERGE_SUMMARY } from '../../../modules/local/clipseq/merge_summary/main.nf' workflow ICOUNT_ANALYSE { take: From 1148dc67e3a387ae228650fdd57bcaf1dd50a856 Mon Sep 17 00:00:00 2001 From: Sam Ireland Date: Mon, 2 Dec 2024 02:50:40 +0000 Subject: [PATCH 22/23] 1.2 clipseq.json upgrade --- schema/clipseq.json | 218 +++++++++++++++++++++++--------------------- 1 file changed, 115 insertions(+), 103 deletions(-) diff --git a/schema/clipseq.json b/schema/clipseq.json index 2ca3536..0fe828e 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -1,275 +1,292 @@ { - "inputs": { - "sample_options": { + "inputs": [ + { "name": "Sample options", "description": "Parameters relating to the sample being analysed.", "advanced": false, - "properties": { + "params": { "samplesheet": { "name": "Samples", - "type": "sample", - 
"pattern": "csv|xlsx", - "required": true, - "categories": ["CLIP"], "description": "The samples to process.", - "csv": { - "group": { - "property": "", - "user_override": true, - "required": true + "type": "csv", + "required": true, + "takes_samples": true, + "sample_types": ["CLIP"], + "columns": [ + { + "name": "group", + "type": "string", + "required": true, + "render": true }, - "replicate": { - "property": "", - "user_override": true, - "required": true + { + "name": "replicate", + "type": "string", + "required": true, + "render": true }, - "fastq_1": { - "property": "input.1", - "user_override": false + { + "name": "fastq_1", + "type": "data", + "from_sample": 1, + "required": true, + "render": false }, - "fastq_2": { - "property": "input.2", - "user_override": false + { + "name": "fastq_2", + "type": "data", + "from_sample": 2, + "required": false, + "render": false } - } + ] } } }, - "genome_options": { + { "name": "Genome options", "description": "The genome being aligned to.", "advanced": false, - "takes_genome": true, - "properties": { + "from_execution": true, + "params": { "fasta": { - "name": "Genome FASTA", - "type": "file", - "pattern": "fasta|fa$", + "name": "FASTA", + "type": "data", + "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "required": true, - "genome_file": "fasta", + "execution_output": { + "process": null, + "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$" + }, "description": "A raw genome FASTA file." }, "gtf": { "name": "GTF", - "type": "file", - "pattern": "gtf", + "type": "data", + "pattern": "^\\S+\\.gtf(\\.gz)?$", "required": true, - "genome_file": "gtf", + "execution_output": { + "process": null, + "pattern": "^\\S+\\.gtf(\\.gz)?$" + }, "description": "An annotation for the genome." 
}, "smrna_fasta": { "name": "smRNA FASTA", - "type": "file", + "type": "data", "pattern": "fasta|fa$", "required": true, - "genome_file": "fasta|fa$", + "execution_output": { + "process": null, + "pattern": "(smrna|trna)\\.(fasta|fa)$" + }, "description": "FASTA file to be mapped to before the genome file, typically containing rRNA and tRNA sequences." }, "fasta_fai": { "name": "Genome FASTA index", - "type": "file", + "type": "data", "pattern": "fai$", "required": false, - "genome_output": { + "execution_output": { "process": "PREPARE_PRIMARY_GENOME:CUSTOM_GETCHROMSIZES", - "filetype": "fai" + "pattern": "\\.fai$" }, "description": "A genome FASTA file index generated by Samtools faidx." }, "chrom_sizes": { "name": "Genome chromosome lengths", - "type": "file", + "type": "data", "pattern": "sizes$", "required": false, - "genome_output": { + "execution_output": { "process": "PREPARE_PRIMARY_GENOME:CUSTOM_GETCHROMSIZES", - "filetype": "sizes" + "pattern": "\\.sizes$" }, "description": "A tabulated file of chromosome names and lengths." }, "target_genome_index": { "name": "Genome STAR index", - "type": "file", + "type": "data", "pattern": "", "required": false, - "genome_output": { + "execution_output": { "process": "STAR_GENOMEGENERATE", - "filetype": "" + "pattern": "" }, "description": "A genome index generated by STAR." }, "smrna_genome_index": { "name": "Small RNA Bowtie index", - "type": "file", + "type": "data", "pattern": "", "required": false, - "genome_output": { + "execution_output": { "process": "BOWTIE_BUILD", - "filetype": "" + "pattern": "" }, "description": "A small RNA index for pre-mapping generated by Bowtie." 
}, "smrna_fasta_fai": { "name": "Small RNA FASTA index", - "type": "file", + "type": "data", "pattern": "fai$", "required": false, - "genome_output": { + "execution_output": { "process": "PREPARE_SMRNA_GENOME:CUSTOM_GETCHROMSIZES", - "filetype": "fai" + "pattern": "\\.fai$" }, "description": "A small RNA FASTA file index generated by Samtools faidx." }, "smrna_chrom_sizes": { "name": "Small RNA lengths", - "type": "file", + "type": "data", "pattern": "sizes$", "required": false, - "genome_output": { + "execution_output": { "process": "PREPARE_SMRNA_GENOME:CUSTOM_GETCHROMSIZES", - "filetype": "sizes" + "pattern": "\\.sizes$" }, "description": "A tabulated file of small RNA names and lengths." }, "longest_transcript": { "name": "Longest transcript IDs", - "type": "file", + "type": "data", "pattern": "txt$", "required": false, - "genome_output": { + "execution_output": { "process": "CLIPSEQ_FIND_LONGEST_TRANSCRIPT", - "filetype": "txt" + "pattern": "\\.txt$" }, "description": "A list of transcript IDs for the longest transcript for each gene in provided GTF annotation." }, "longest_transcript_fai": { "name": "Longest transcript IDs and lengths", - "type": "file", + "type": "data", "pattern": "fai$", "required": false, - "genome_output": { + "execution_output": { "process": "CLIPSEQ_FIND_LONGEST_TRANSCRIPT", - "filetype": "fai" + "pattern": "\\.fai$" }, "description": "A tabulated file of transcript IDs and lengths for the longest transcript for each gene in provided GTF annotation." }, "longest_transcript_gtf": { "name": "Longest transcript IDs", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "CLIPSEQ_FIND_LONGEST_TRANSCRIPT", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "A list of transcript IDs for the longest transcript for each gene in provided GTF annotation." 
}, "filtered_gtf": { "name": "Filtered GTF", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "CLIPSEQ_FILTER_GTF", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "GTF filtered for 'basic' transcript tag and support levels TSL1 and TSL2 to improve performance of downstream tools." }, "seg_gtf": { "name": "Segmented GTF", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "ICOUNT_SEG_GTF", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "GTF segmented for use in iCount peak calling using iCount segment command." }, "seg_filt_gtf": { "name": "Segmented filtered GTF", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "ICOUNT_SEG_FILTGTF", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "Filtered GTF segmented for use in iCount peak calling using iCount segment command." }, "seg_resolved_gtf": { "name": "Segmented resolved filtered GTF", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "RESOLVE_UNANNOTATED", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "Filtered GTF segmented using iCount segment command and then resolve unannotated regions by overlapping transcript segments." }, "seg_resolved_gtf_genic": { "name": "Segmented resolved genic filtered GTF", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "RESOLVE_UNANNOTATED_GENIC_OTHER", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "Filtered GTF segmented using iCount segment command and then resolve unannotated regions by annotating as 'genic other'." 
}, "regions_gtf": { "name": "Regions GTF", - "type": "file", + "type": "data", "pattern": "gtf.gz$", "required": false, - "genome_output": { + "execution_output": { "process": "ICOUNT_SEG_GTF", - "filetype": "gtf.gz" + "pattern": "\\.gtf.gz$" }, "description": "GTF regions for use in PEKA using iCount segment command." }, "regions_filt_gtf": { "name": "Filtered regions GTF", - "type": "file", + "type": "data", "pattern": "gtf.gz$", "required": false, - "genome_output": { + "execution_output": { "process": "ICOUNT_SEG_FILTGTF", - "filetype": "gtf.gz" + "pattern": "\\.gtf.gz$" }, "description": "Filtered GTF regions for use in PEKA using iCount segment command." }, "regions_resolved_gtf": { "name": "Filtered resolved regions GTF", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "RESOLVE_UNANNOTATED_REGIONS", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "Filtered GTF regions using iCount segment command and then resolve unannotated regions by overlapping transcript segments." }, "regions_resolved_gtf_genic": { "name": "Filtered resolved regions genic GTF", - "type": "file", + "type": "data", "pattern": "gtf$", "required": false, - "genome_output": { + "execution_output": { "process": "RESOLVE_UNANNOTATED_GENIC_OTHER_REGIONS", - "filetype": "gtf" + "pattern": "\\.gtf$" }, "description": "Filtered GTF regions using iCount segment command and then resolve unannotated regions by annotating as 'genic other'." 
 } } }, - "pipeline_options": { + { "name": "Settings", "description": "Additional pipeline configuration options.", "advanced": true, - "properties": { + "params": { "move_umi_to_header": { "name": "Extract UMI to header", "description": "Runs UMI to header extraction based on the head format provided in UMI header format.", @@ -298,24 +315,19 @@ } } }, - "cli_options": { + { "name": "Command Line Options", - "description": "Provide customised command line options to specific processes", + "description": "Provide customised command line options to specific processes", "advanced": true, - "properties": { - "trimgalore_params": { - "name": "Trim Galore! parameters", - "description": "Parameters for Trim Galore! trimming. Defaults are -q 20 and minimum length 10 to keep reads.", - "type": "string" - }, + "params": { "bowtie_params": { "name": "Bowtie parameters", - "description": "Mapping parameters for Bowtie pre-mapping. Do not touch unless you know what you are doing!", + "description": "Mapping parameters for Bowtie pre-mapping", "type": "string" }, "star_params": { "name": "STAR parameters", - "description": "Mapping parameters for STAR mapping. Do not touch unless you know what you are doing!", + "description": "Mapping parameters for STAR mapping", "type": "string" }, "clippy_params": { @@ -335,7 +347,7 @@ } } } - }, + ], "outputs": [ { "name": "Normalised genome crosslink bedgraph", @@ -353,7 +365,7 @@ "name": "Crosslink summary", "description": "Crosslinks summarised by gene, type (eg. CDS, intron) and subtype (eg. 
lncRNA, mRNA).", "filetype": "tsv", - "process": "MERGE_SUMMARY" + "process": "ICOUNT_SUMMARY" }, { "name": "K-mer enrichment", From 94cf9a062d1d3c71af3578168af23dc8abe340a4 Mon Sep 17 00:00:00 2001 From: Sam Ireland Date: Mon, 2 Dec 2024 02:51:15 +0000 Subject: [PATCH 23/23] 1.2 prepare_genome.json upgrade --- schema/prepare_genome.json | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/schema/prepare_genome.json b/schema/prepare_genome.json index a861593..c197887 100644 --- a/schema/prepare_genome.json +++ b/schema/prepare_genome.json @@ -1,37 +1,38 @@ { - "inputs": { - "genome_options": { + "inputs": [ + { "name": "Genome options", "description": "The genome being aligned to.", - "takes_genome": true, - "properties": { + "from_fileset": true, + "fileset_requires_organism": true, + "params": { "fasta": { "name": "Genome FASTA", - "type": "file", + "type": "data", "pattern": "fasta|fa$", "required": true, - "genome_file": "fasta", + "fileset_pattern": "fasta", "description": "A raw genome FASTA file." }, "gtf": { "name": "GTF", - "type": "file", - "pattern": "gtf", + "type": "data", + "pattern": "gtf$", "required": true, - "genome_file": "gtf", + "fileset_pattern": "gtf$", "description": "An annotation for the genome." }, "smrna_fasta": { "name": "smRNA FASTA", - "type": "file", + "type": "data", "pattern": "fasta|fa$", "required": true, - "genome_file": "fasta|fa$", + "fileset_pattern": "(smrna|trna)\\.(fasta|fa)$", "description": "FASTA file to be mapped to before the genome file, typically containing rRNA and tRNA sequences." } } } - }, + ], "outputs": [ { "name": "Genome STAR index",