From 0ac42c25fc7673859e527aed6a0dbeff6b2b4119 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 13 Nov 2023 17:57:58 +0000
Subject: [PATCH 01/23] allow multimapping genome output bams

---
 conf/modules.config                           | 24 +++--
 modules.json                                  |  5 +
 modules/nf-core/samtools/view/environment.yml |  6 ++
 modules/nf-core/samtools/view/main.nf         | 69 ++++++++++++++
 modules/nf-core/samtools/view/meta.yml        | 89 ++++++++++++++++++
 nextflow.config                               |  2 +-
 subworkflows/goodwright/rna_align/main.nf     | 92 +++++++++++--------
 7 files changed, 241 insertions(+), 46 deletions(-)
 create mode 100644 modules/nf-core/samtools/view/environment.yml
 create mode 100644 modules/nf-core/samtools/view/main.nf
 create mode 100644 modules/nf-core/samtools/view/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 0ec7ef2..53b5cfb 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -300,6 +300,7 @@ if(params.run_alignment) {
 
         withName: 'CLIPSEQ:RNA_ALIGN:STAR_ALIGN' {
             ext.args   = {  "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM ${params.star_params}"  }
+            ext.prefix = { "${meta.id}_multi" }
             publishDir = [
                 [
                     path: { "${params.outdir}/02_alignment/genome/log" },
@@ -321,7 +322,7 @@ if(params.run_alignment) {
             ]
         }
 
-        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' {
+        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_TRANSCRIPT' {
             publishDir = [
                 path: { "${params.outdir}/02_alignment/genome" },
                 mode: "${params.publish_dir_mode}",
@@ -329,8 +330,8 @@ if(params.run_alignment) {
             ]
         }
 
-        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_TRANSCRIPT' {
-            ext.prefix = { "${meta.id}_Aligned.toTranscriptome_sorted.out" }
+        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_TRANSCRIPT' {
+            ext.prefix = { "${meta.id}_multi.Aligned.toTranscriptome_sorted.out" }
             publishDir = [
                 path: { "${params.outdir}/02_alignment/genome" },
                 mode: "${params.publish_dir_mode}",
@@ -338,8 +339,10 @@ if(params.run_alignment) {
             ]
         }
 
-        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_TRANSCRIPT' {
-            ext.prefix = { "${meta.id}_Aligned.toTranscriptome_sorted.out" }
+        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_VIEW_GENOME' {
+            ext.prefix = { "${meta.id}_unique_genome" }
+            ext.args   = "-q 5 --output-fmt bam --write-index"
+            ext.index_type = "bai"
             publishDir = [
                 path: { "${params.outdir}/02_alignment/genome" },
                 mode: "${params.publish_dir_mode}",
@@ -347,7 +350,16 @@ if(params.run_alignment) {
             ]
         }
 
-
+        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_VIEW_TRANSCRIPT' {
+            ext.prefix = { "${meta.id}_unique_transcriptome" }
+            ext.args   = "-q 5 --output-fmt bam --write-index"
+            ext.index_type = "bai"
+            publishDir = [
+                path: { "${params.outdir}/02_alignment/genome" },
+                mode: "${params.publish_dir_mode}",
+                enabled: params.save_align_intermed
+            ]
+        } 
     }
 }
 
diff --git a/modules.json b/modules.json
index 54d14a8..e450015 100644
--- a/modules.json
+++ b/modules.json
@@ -176,6 +176,11 @@
                         "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1",
                         "installed_by": ["bam_stats_samtools"]
                     },
+                    "samtools/view": {
+                        "branch": "master",
+                        "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
+                        "installed_by": ["modules"]
+                    },
                     "star/align": {
                         "branch": "master",
                         "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3",
diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml
new file mode 100644
index 0000000..04c82f1
--- /dev/null
+++ b/modules/nf-core/samtools/view/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samtools=1.17
diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf
new file mode 100644
index 0000000..a41b876
--- /dev/null
+++ b/modules/nf-core/samtools/view/main.nf
@@ -0,0 +1,69 @@
+process SAMTOOLS_VIEW {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(input), path(index)
+    tuple val(meta2), path(fasta)
+    path qname
+
+    output:
+    tuple val(meta), path("*.bam"),  emit: bam,     optional: true
+    tuple val(meta), path("*.cram"), emit: cram,    optional: true
+    tuple val(meta), path("*.sam"),  emit: sam,     optional: true
+    tuple val(meta), path("*.bai"),  emit: bai,     optional: true
+    tuple val(meta), path("*.csi"),  emit: csi,     optional: true
+    tuple val(meta), path("*.crai"), emit: crai,    optional: true
+    path  "versions.yml",            emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def reference = fasta ? "--reference ${fasta}" : ""
+    def readnames = qname ? "--qname-file ${qname}": ""
+    def file_type = args.contains("--output-fmt sam") ? "sam" :
+                    args.contains("--output-fmt bam") ? "bam" :
+                    args.contains("--output-fmt cram") ? "cram" :
+                    input.getExtension()
+    def index_type = task.ext.index_type ?: ''
+    def output_name = args.contains("--write-index") ?  "${prefix}.${file_type}##idx##${prefix}.${file_type}.${index_type}" :
+                    "${prefix}.${file_type}"
+    if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools \\
+        view \\
+        --threads ${task.cpus-1} \\
+        ${reference} \\
+        ${readnames} \\
+        $args \\
+        -o ${output_name}\\
+        $input \\
+        $args2
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bam
+    touch ${prefix}.cram
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml
new file mode 100644
index 0000000..3dadafa
--- /dev/null
+++ b/modules/nf-core/samtools/view/meta.yml
@@ -0,0 +1,89 @@
+name: samtools_view
+description: filter/convert SAM/BAM/CRAM file
+keywords:
+  - view
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - index:
+      type: file
+      description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional)
+      pattern: "*.{.bai,.csi,.crai}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'test' ]
+  - fasta:
+      type: file
+      description: Reference file the CRAM was created with (optional)
+      pattern: "*.{fasta,fa}"
+  - qname:
+      type: file
+      description: Optional file with read names to output only select alignments
+      pattern: "*.{txt,list}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: optional filtered/converted BAM file
+      pattern: "*.{bam}"
+  - cram:
+      type: file
+      description: optional filtered/converted CRAM file
+      pattern: "*.{cram}"
+  - sam:
+      type: file
+      description: optional filtered/converted SAM file
+      pattern: "*.{sam}"
+  # bai, csi, and crai are created with `--write-index`
+  - bai:
+      type: file
+      description: optional BAM file index
+      pattern: "*.{bai}"
+  - csi:
+      type: file
+      description: optional tabix BAM file index
+      pattern: "*.{csi}"
+  - crai:
+      type: file
+      description: optional CRAM file index
+      pattern: "*.{crai}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@joseespinosa"
+  - "@FriederikeHanssen"
+  - "@priyanka-surana"
+maintainers:
+  - "@drpatelh"
+  - "@joseespinosa"
+  - "@FriederikeHanssen"
+  - "@priyanka-surana"
diff --git a/nextflow.config b/nextflow.config
index 3dc2f22..f516083 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -76,7 +76,7 @@ params {
     umi_separator       = "rbc:"
     paraclu_min_value   = 10
 	bowtie_params       = "-v 2 -m 100 --norc --best --strata"
-    star_params         = "--outFilterMultimapNmax 1 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic"
+    star_params         = "--outFilterMultimapNmax 100 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic"
     clippy_params       = ""
     icount_peaks_params = ""
     peka_params         = ""
diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf
index 0e85370..7786d4e 100644
--- a/subworkflows/goodwright/rna_align/main.nf
+++ b/subworkflows/goodwright/rna_align/main.nf
@@ -5,16 +5,14 @@
 
 /*
 * MODULES
-*/
-include { BOWTIE_ALIGN   } from '../../../modules/nf-core/bowtie/align/main.nf'
-include { STAR_ALIGN     } from '../../../modules/nf-core/star/align/main.nf'
-include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main'
-
-/*
-* SUBWORKFLOWS
-*/
-include { BAM_STATS_SAMTOOLS as BAM_STATS_SAMTOOLS_GENOME               } from '../../nf-core/bam_stats_samtools/main.nf'
-include { BAM_SORT_STATS_SAMTOOLS as BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT } from '../../nf-core/bam_sort_stats_samtools/main.nf'
+*/ 
+include { BOWTIE_ALIGN                                } from '../../../modules/nf-core/bowtie/align/main.nf'
+include { STAR_ALIGN                                  } from '../../../modules/nf-core/star/align/main.nf'
+include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT   } from '../../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main'
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_GENOME     } from '../../../modules/nf-core/samtools/index/main'
+include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_GENOME       } from '../../../modules/nf-core/samtools/view/main'
+include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_TRANSCRIPT   } from '../../../modules/nf-core/samtools/view/main'  
 
 workflow RNA_ALIGN {
     take:
@@ -50,17 +48,27 @@ workflow RNA_ALIGN {
     ch_versions = ch_versions.mix(STAR_ALIGN.out.versions)
 
     /*
-    * MODULE: Index genome-level BAM file
+    * MODULE: Index genome-level BAM file 
     */
-    SAMTOOLS_INDEX ( STAR_ALIGN.out.bam_sorted )
-    ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first())
+    SAMTOOLS_INDEX_GENOME ( STAR_ALIGN.out.bam_sorted )
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_GENOME.out.versions.first())
+
+    /*
+    * MODULE: Sort and index transcript-level BAM file
+    */
+    SAMTOOLS_SORT_TRANSCRIPT ( STAR_ALIGN.out.bam_transcript )
+    ch_versions = ch_versions.mix(SAMTOOLS_SORT_TRANSCRIPT.out.versions.first())
+
+    SAMTOOLS_INDEX_TRANSCRIPT ( SAMTOOLS_SORT_TRANSCRIPT.out.bam )
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_TRANSCRIPT.out.versions.first())
+
 
     /*
     * CHANNEL: Join bam and bai files
     */
     ch_bam_bai = STAR_ALIGN.out.bam_sorted
-        .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true)
-        .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true)
+        .join(SAMTOOLS_INDEX_GENOME.out.bai, by: [0], remainder: true)
+        .join(SAMTOOLS_INDEX_GENOME.out.csi, by: [0], remainder: true)
         .map {
             meta, bam, bai, csi ->
                 if (bai) {
@@ -71,38 +79,44 @@ workflow RNA_ALIGN {
         }
 
     /*
-    * SUBWORKFLOW: Stats on genome-level bam
+    * CHANNEL: Join transcript-level bam and bai files
     */
-    BAM_STATS_SAMTOOLS_GENOME (
-        ch_bam_bai,
-        fasta
-    )
-    ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS_GENOME.out.versions)
+    ch_transcript_bam_bai = SAMTOOLS_SORT_TRANSCRIPT.out.bam
+        .join(SAMTOOLS_INDEX_TRANSCRIPT.out.bai, by: [0], remainder: true)
+        .join(SAMTOOLS_INDEX_TRANSCRIPT.out.csi, by: [0], remainder: true)
+        .map {
+            meta, bam, bai, csi ->
+                if (bai) {
+                    [ meta, bam, bai ]
+                } else {
+                    [ meta, bam, csi ]
+                }
+        }
+
 
     /*
-    * SUBWORKFLOW: Sort, index and stats on transcript-level bam
+    * MODULE: Filter for uniquely mapping reads (MAPQ >= 5) for downstream analysis; equivalent to: samtools view -b -q 5 -o output.bam alignments.bam
     */
-    BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT (
-        STAR_ALIGN.out.bam_transcript,
-        fasta
+    SAMTOOLS_VIEW_GENOME (
+        ch_bam_bai,
+        [[],[]],
+        []
+    )
+
+    SAMTOOLS_VIEW_TRANSCRIPT (
+        ch_transcript_bam_bai,
+        [[],[]],
+        []
     )
 
     emit:
-    bt_bam              = BOWTIE_ALIGN.out.bam                           // channel: [ val(meta), [ bam ] ]
-    bt_log              = BOWTIE_ALIGN.out.log                           // channel: [ val(meta), [ txt ] ]
-    star_bam            = STAR_ALIGN.out.bam_sorted                       // channel: [ val(meta), [ bam ] ]
-    star_bam_transcript = STAR_ALIGN.out.bam_transcript                   // channel: [ val(meta), [ bam ] ]
+    bt_bam              = BOWTIE_ALIGN.out.bam                            // channel: [ val(meta), [ bam ] ]
+    bt_log              = BOWTIE_ALIGN.out.log                            // channel: [ val(meta), [ txt ] ]
     star_log            = STAR_ALIGN.out.log                              // channel: [ val(meta), [ txt ] ]
     star_log_final      = STAR_ALIGN.out.log_final                        // channel: [ val(meta), [ txt ] ]
-    genome_bam          = STAR_ALIGN.out.bam_sorted                       // channel: [ val(meta), [ bam ] ]
-    genome_bai          = SAMTOOLS_INDEX.out.bai                          // channel: [ val(meta), [ bai ] ]
-    genome_stats        = BAM_STATS_SAMTOOLS_GENOME.out.stats             // channel: [ val(meta), [ stats ] ]
-    genome_flagstat     = BAM_STATS_SAMTOOLS_GENOME.out.flagstat          // channel: [ val(meta), [ flagstat ] ]
-    genome_idxstats     = BAM_STATS_SAMTOOLS_GENOME.out.idxstats          // channel: [ val(meta), [ idxstats ] ]
-    transcript_bam      = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.bam      // channel: [ val(meta), [ bam ] ]
-    transcript_bai      = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.bai      // channel: [ val(meta), [ bai ] ]
-    transcript_stats    = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.stats    // channel: [ val(meta), [ stats ] ]
-    transcript_flagstat = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.flagstat // channel: [ val(meta), [ flagstat ] ]
-    transcript_idxstats = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.idxstats // channel: [ val(meta), [ idxstats ] ]
+    genome_bam          = SAMTOOLS_VIEW_GENOME.out.bam                    // channel: [ val(meta), [ bam ] ]
+    genome_bai          = SAMTOOLS_VIEW_GENOME.out.bai                    // channel: [ val(meta), [ bai ] ]
+    transcript_bam      = SAMTOOLS_VIEW_TRANSCRIPT.out.bam                // channel: [ val(meta), [ bam ] ]
+    transcript_bai      = SAMTOOLS_VIEW_TRANSCRIPT.out.bai                // channel: [ val(meta), [ bai ] ]
     versions            = ch_versions                                     // channel: [ versions.yml ]
 }

From 76545449ce6f1383b4349f66c46ffbecdd62dbc6 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 14 Nov 2023 11:32:47 +0000
Subject: [PATCH 02/23] publish bowtie premapping bam

---
 subworkflows/goodwright/rna_align/main.nf | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf
index 7786d4e..5f3f854 100644
--- a/subworkflows/goodwright/rna_align/main.nf
+++ b/subworkflows/goodwright/rna_align/main.nf
@@ -9,6 +9,8 @@
 include { BOWTIE_ALIGN                                } from '../../../modules/nf-core/bowtie/align/main.nf'
 include { STAR_ALIGN                                  } from '../../../modules/nf-core/star/align/main.nf'
 include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT   } from '../../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA        } from '../../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA      } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_GENOME     } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_GENOME       } from '../../../modules/nf-core/samtools/view/main'
@@ -34,6 +36,12 @@ workflow RNA_ALIGN {
     )
     ch_versions = ch_versions.mix(BOWTIE_ALIGN.out.versions)
 
+    SAMTOOLS_SORT_SMRNA ( BOWTIE_ALIGN.out.bam )
+    ch_versions = ch_versions.mix(SAMTOOLS_SORT_SMRNA.out.versions)
+
+    SAMTOOLS_INDEX_SMRNA ( SAMTOOLS_SORT_SMRNA.out.bam )
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA.out.versions)
+
     /*
     * MODULE: Align reads that did not align to the smrna genome to the primary genome
     */

From 46a1203cc7e2c8c9d998d3444623a9e6811e7b74 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 14 Nov 2023 12:22:23 +0000
Subject: [PATCH 03/23] dedup multi bam and smrna bam

---
 conf/modules.config                       | 41 +++++++++++++---
 main.nf                                   | 58 ++++++++++++++++-------
 subworkflows/goodwright/rna_align/main.nf |  8 +++-
 3 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 53b5cfb..5cc1a77 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -268,7 +268,7 @@ if(params.run_alignment) {
                     path: { "${params.outdir}/02_alignment/smrna" },
                     mode: "${params.publish_dir_mode}",
                     pattern: '*.bam',
-                    enabled: false
+                    enabled: params.save_align_intermed
                 ],
                 [
                     path: { "${params.outdir}/02_alignment/smrna/unmapped" },
@@ -405,9 +405,9 @@ if(params.run_read_filter) {
 
 if(params.run_umi_dedup) {
     process {
-        withName: 'CLIPSEQ:GENOME_DEDUP:UMICOLLAPSE' {
+        withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:UMICOLLAPSE' {
             ext.args = { "--umi-sep ${params.umi_separator}" }
-            ext.prefix = { "${meta.id}.genome.dedup" }
+            ext.prefix = { "${meta.id}.unique_genome.dedup" }
             publishDir = [
                 path: { "${params.outdir}/03_filt_dedup" },
                 mode: "${params.publish_dir_mode}",
@@ -415,7 +415,9 @@ if(params.run_umi_dedup) {
             ]
         }
 
-        withName: 'CLIPSEQ:GENOME_DEDUP:SAMTOOLS_INDEX' {
+        withName: 'CLIPSEQ:GENOME_MULTI_DEDUP:UMICOLLAPSE' {
+            ext.args = { "--umi-sep ${params.umi_separator}" }
+            ext.prefix = { "${meta.id}.multi_genome.dedup" }
             publishDir = [
                 path: { "${params.outdir}/03_filt_dedup" },
                 mode: "${params.publish_dir_mode}",
@@ -423,12 +425,37 @@ if(params.run_umi_dedup) {
             ]
         }
 
-        withName: 'CLIPSEQ:GENOME_DEDUP:BAM_STATS_SAMTOOLS:.*' {
-            ext.prefix = { "${meta.id}.genome.dedup" }
+        withName: 'CLIPSEQ:SMRNA_DEDUP:UMICOLLAPSE' {
+            ext.args = { "--umi-sep ${params.umi_separator}" }
+            ext.prefix = { "${meta.id}.smrna.dedup" }
             publishDir = [
                 path: { "${params.outdir}/03_filt_dedup" },
                 mode: "${params.publish_dir_mode}",
-                pattern: "*.{stats,flagstat,idxstats}"
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+
+        withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:SAMTOOLS_INDEX' {
+            publishDir = [
+                path: { "${params.outdir}/03_filt_dedup" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+
+        withName: 'CLIPSEQ:GENOME_MULTI_DEDUP:SAMTOOLS_INDEX' {
+            publishDir = [
+                path: { "${params.outdir}/03_filt_dedup" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+
+        withName: 'CLIPSEQ:SMRNA_DEDUP:SAMTOOLS_INDEX' {
+            publishDir = [
+                path: { "${params.outdir}/03_filt_dedup" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
 
diff --git a/main.nf b/main.nf
index df58fce..d60e54b 100644
--- a/main.nf
+++ b/main.nf
@@ -125,7 +125,9 @@ include { PREPARE_CLIPSEQ                                       } from './subwor
 include { PARSE_FASTQ_INPUT                                     } from './subworkflows/goodwright/parse_fastq_input/main'
 include { FASTQC_TRIMGALORE                                     } from './subworkflows/goodwright/fastqc_trimgalore/main'
 include { RNA_ALIGN                                             } from './subworkflows/goodwright/rna_align/main'
-include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_DEDUP           } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
+include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_UNIQUE_DEDUP    } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
+include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP     } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
+include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP            } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP       } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS        } from './subworkflows/goodwright/clip_calc_crosslinks/main'
 include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS    } from './subworkflows/goodwright/clip_calc_crosslinks/main'
@@ -315,13 +317,17 @@ workflow CLIPSEQ {
             ch_filtered_gtf,
             ch_fasta
         )
-        ch_versions       = ch_versions.mix(RNA_ALIGN.out.versions)
-        ch_genome_bam     = RNA_ALIGN.out.genome_bam
-        ch_genome_bai     = RNA_ALIGN.out.genome_bai
-        ch_transcript_bam = RNA_ALIGN.out.transcript_bam
-        ch_transcript_bai = RNA_ALIGN.out.transcript_bai
-        ch_bt_log         = RNA_ALIGN.out.bt_log
-        ch_star_log       = RNA_ALIGN.out.star_log_final
+        ch_versions           = ch_versions.mix(RNA_ALIGN.out.versions)
+        ch_genome_unique_bam  = RNA_ALIGN.out.genome_unique_bam
+        ch_genome_unique_bai  = RNA_ALIGN.out.genome_unique_bai
+        ch_genome_multi_bam   = RNA_ALIGN.out.genome_multi_bam
+        ch_genome_multi_bai   = RNA_ALIGN.out.genome_multi_bai
+        ch_smrna_bam          = RNA_ALIGN.out.smrna_bam
+        ch_smrna_bai          = RNA_ALIGN.out.smrna_bai
+        ch_transcript_bam     = RNA_ALIGN.out.transcript_bam
+        ch_transcript_bai     = RNA_ALIGN.out.transcript_bai
+        ch_bt_log             = RNA_ALIGN.out.bt_log
+        ch_star_log           = RNA_ALIGN.out.star_log_final
     }
 
     if(params.run_read_filter) {
@@ -360,9 +366,19 @@ workflow CLIPSEQ {
         /*
         * CHANNEL: Combine bam and bai files on id
         */
-        ch_genome_bam_bai = ch_genome_bam
+        ch_genome_unique_bam_bai = ch_genome_unique_bam
             .map { row -> [row[0].id, row ].flatten()}
-            .join ( ch_genome_bai.map { row -> [row[0].id, row ].flatten()} )
+            .join ( ch_genome_unique_bai.map { row -> [row[0].id, row ].flatten()} )
+            .map { row -> [row[1], row[2], row[4]] }
+
+        ch_genome_multi_bam_bai = ch_genome_multi_bam
+            .map { row -> [row[0].id, row ].flatten()}
+            .join ( ch_genome_multi_bai.map { row -> [row[0].id, row ].flatten()} )
+            .map { row -> [row[1], row[2], row[4]] }
+
+        ch_smrna_bam_bai = ch_smrna_bam
+            .map { row -> [row[0].id, row ].flatten()}
+            .join ( ch_smrna_bai.map { row -> [row[0].id, row ].flatten()} )
             .map { row -> [row[1], row[2], row[4]] }
 
         ch_transcript_bam_bai = ch_transcript_bam
@@ -373,13 +389,23 @@ workflow CLIPSEQ {
         /*
         * SUBWORKFLOW: Run umi deduplication on genome-level alignments
         */
-        GENOME_DEDUP (
-            ch_genome_bam_bai
+        GENOME_UNIQUE_DEDUP (
+            ch_genome_unique_bam_bai
+        )
+        ch_versions   = ch_versions.mix(GENOME_UNIQUE_DEDUP.out.versions)
+        ch_genome_bam = GENOME_UNIQUE_DEDUP.out.bam
+        ch_genome_bai = GENOME_UNIQUE_DEDUP.out.bai
+        ch_umi_log    = GENOME_UNIQUE_DEDUP.out.umi_log
+
+        GENOME_MULTI_DEDUP (
+            ch_genome_multi_bam_bai
+        )
+        ch_versions   = ch_versions.mix(GENOME_MULTI_DEDUP.out.versions)
+
+        SMRNA_DEDUP (
+            ch_smrna_bam_bai
         )
-        ch_versions   = ch_versions.mix(GENOME_DEDUP.out.versions)
-        ch_genome_bam = GENOME_DEDUP.out.bam
-        ch_genome_bai = GENOME_DEDUP.out.bai
-        ch_umi_log    = GENOME_DEDUP.out.umi_log
+        ch_versions   = ch_versions.mix(SMRNA_DEDUP.out.versions)
 
         /*
         * SUBWORKFLOW: Run umi deduplication on transcript-level alignments
diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf
index 5f3f854..ec03724 100644
--- a/subworkflows/goodwright/rna_align/main.nf
+++ b/subworkflows/goodwright/rna_align/main.nf
@@ -122,9 +122,13 @@ workflow RNA_ALIGN {
     bt_log              = BOWTIE_ALIGN.out.log                            // channel: [ val(meta), [ txt ] ]
     star_log            = STAR_ALIGN.out.log                              // channel: [ val(meta), [ txt ] ]
     star_log_final      = STAR_ALIGN.out.log_final                        // channel: [ val(meta), [ txt ] ]
-    genome_bam          = SAMTOOLS_VIEW_GENOME.out.bam                    // channel: [ val(meta), [ bam ] ]
-    genome_bai          = SAMTOOLS_VIEW_GENOME.out.bai                    // channel: [ val(meta), [ bai ] ]
+    genome_unique_bam   = SAMTOOLS_VIEW_GENOME.out.bam                    // channel: [ val(meta), [ bam ] ]
+    genome_unique_bai   = SAMTOOLS_VIEW_GENOME.out.bai                    // channel: [ val(meta), [ bai ] ]
+    genome_multi_bam    = STAR_ALIGN.out.bam_sorted                       // channel: [ val(meta), [ bam ] ]
+    genome_multi_bai    = SAMTOOLS_INDEX_GENOME.out.bai                   // channel: [ val(meta), [ bai ] ]
     transcript_bam      = SAMTOOLS_VIEW_TRANSCRIPT.out.bam                // channel: [ val(meta), [ bam ] ]
     transcript_bai      = SAMTOOLS_VIEW_TRANSCRIPT.out.bai                // channel: [ val(meta), [ bai ] ]
+    smrna_bam           = SAMTOOLS_SORT_SMRNA.out.bam                     // channel: [ val(meta), [ bam ] ]
+    smrna_bai           = SAMTOOLS_INDEX_SMRNA.out.bai                    // channel: [ val(meta), [ bai ] ]
     versions            = ch_versions                                     // channel: [ versions.yml ]
 }

From a99fc3c6fc604080c7fe8898c5f31fc056fa25d0 Mon Sep 17 00:00:00 2001
From: Chris Cheshire <chris.j.cheshire@gmail.com>
Date: Thu, 16 Nov 2023 11:33:21 +0000
Subject: [PATCH 04/23] Update schema

---
 schema/clipseq.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/schema/clipseq.json b/schema/clipseq.json
index 1fe89cc..ba9b1e1 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -302,7 +302,7 @@
         },
         "cli_options": {
             "name": "Command Line Options",
-            "description": "Proide customised command line options to specific processes",
+            "description": "Provide customised command line options to specific processes",
             "advanced": true,
             "properties": {
                 "bowtie_params": {

From c357a3a2158acbe052b62a41f06a72bb368d8f23 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 28 Nov 2023 16:42:02 +0000
Subject: [PATCH 05/23] add trna/rrna premap count to icount_summary output

---
 conf/modules.config                           | 86 ++++++++++++++++++-
 main.nf                                       | 30 +++++++
 .../goodwright/clipseq/merge_summary/main.nf  | 29 +++++++
 .../goodwright/clipseq/merge_summary/meta.yml | 52 +++++++++++
 .../merge_summary/templates/merge_summary.py  | 66 ++++++++++++++
 schema/clipseq.json                           |  2 +-
 .../goodwright/icount_analyse/main.nf         | 15 +++-
 subworkflows/goodwright/rna_align/main.nf     | 23 ++++-
 8 files changed, 298 insertions(+), 5 deletions(-)
 create mode 100644 modules/goodwright/clipseq/merge_summary/main.nf
 create mode 100644 modules/goodwright/clipseq/merge_summary/meta.yml
 create mode 100644 modules/goodwright/clipseq/merge_summary/templates/merge_summary.py

diff --git a/conf/modules.config b/conf/modules.config
index 5cc1a77..8d33bbd 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -279,6 +279,31 @@ if(params.run_alignment) {
             ]
         }
 
+        withName: 'CLIPSEQ:RNA_ALIGN:BOWTIE_ALIGN_K1' {
+            ext.args = { "-v 2 -m 100 --norc --best --strata -k 1" }
+            ext.prefix = "_withK1"
+            publishDir = [
+                [
+                    path: { "${params.outdir}/02_alignment/smrna" },
+                    mode: "${params.publish_dir_mode}",
+                    pattern: '*.out',
+                    enabled: true
+                ],
+                [
+                    path: { "${params.outdir}/02_alignment/smrna" },
+                    mode: "${params.publish_dir_mode}",
+                    pattern: '*.bam',
+                    enabled: false
+                ],
+                [
+                    path: { "${params.outdir}/02_alignment/smrna/unmapped" },
+                    mode: "${params.publish_dir_mode}",
+                    pattern: '*.fastq.gz',
+                    enabled: false
+                ]
+            ]
+        }
+
         withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_SMRNA' {
             ext.prefix = { "${meta.id}_sorted" }
             publishDir = [
@@ -435,6 +460,16 @@ if(params.run_umi_dedup) {
             ]
         }
 
+        withName: 'CLIPSEQ:SMRNA_K1_DEDUP:UMICOLLAPSE' {
+            ext.args = { "--umi-sep ${params.umi_separator}" }
+            ext.prefix = { "${meta.id}.smrna_withk1.dedup" }
+            publishDir = [
+                path: { "${params.outdir}/03_filt_dedup" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+
         withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:SAMTOOLS_INDEX' {
             publishDir = [
                 path: { "${params.outdir}/03_filt_dedup" },
@@ -459,6 +494,14 @@ if(params.run_umi_dedup) {
             ]
         }
 
+        withName: 'CLIPSEQ:SMRNA_K1_DEDUP:SAMTOOLS_INDEX' {
+            publishDir = [
+                path: { "${params.outdir}/03_filt_dedup" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+
         withName: 'CLIPSEQ:TRANSCRIPT_DEDUP:UMICOLLAPSE' {
             ext.args = { "--umi-sep ${params.umi_separator}" }
             ext.prefix = { "${meta.id}.transcript.dedup" }
@@ -608,6 +651,37 @@ if(params.run_calc_crosslinks) {
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
+        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:MERGE_AND_SORT' {
+            ext.cmd1 = 'sort -k1,1 -k2,2n'
+            ext.suffix = '.smrna_withk1'
+            ext.ext = 'bed'
+            publishDir = [
+                path: { "${params.outdir}/04_crosslinks" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+
+        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_COVERAGE' {
+            ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
+            ext.suffix = '.smrna_withk1'
+            ext.ext = 'bedgraph'
+            publishDir = [
+                path: { "${params.outdir}/04_crosslinks" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+
+        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_NORMCOVERAGE' {
+            ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
+            ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
+            ext.suffix = '.norm.smrna_withk1'
+            ext.ext = 'bedgraph'
+            publishDir = [
+                enabled: false
+            ]
+        }
     }
 }
 
@@ -709,8 +783,13 @@ if(params.run_peak_calling) {
             ]
         }
 
-
         withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SUMMARY' {
+            publishDir = [
+                enabled: false
+            ]
+        }
+
+        withName: 'CLIPSEQ:ICOUNT_ANALYSE:MERGE_SUMMARY' {
             publishDir = [
                 path: { "${params.outdir}/05_peak_calling/icount" },
                 mode: "${params.publish_dir_mode}",
@@ -728,7 +807,10 @@ if(params.run_peak_calling) {
 
         withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SIGXLS' {
             publishDir = [
-                enabled: false
+                path: { "${params.outdir}/05_peak_calling/icount" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                pattern: "*.scores.tsv"
             ]
         }
 
diff --git a/main.nf b/main.nf
index d60e54b..7d32840 100644
--- a/main.nf
+++ b/main.nf
@@ -128,7 +128,9 @@ include { RNA_ALIGN                                             } from './subwor
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_UNIQUE_DEDUP    } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP     } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP            } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
+include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_K1_DEDUP         } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP       } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
+include { CLIP_CALC_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS      } from './subworkflows/goodwright/clip_calc_crosslinks/main'
 include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS        } from './subworkflows/goodwright/clip_calc_crosslinks/main'
 include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS    } from './subworkflows/goodwright/clip_calc_crosslinks/main'
 include { PARACLU_ANALYSE as PARACLU_ANALYSE_GENOME             } from './subworkflows/goodwright/paraclu_analyse/main'
@@ -324,6 +326,8 @@ workflow CLIPSEQ {
         ch_genome_multi_bai   = RNA_ALIGN.out.genome_multi_bai
         ch_smrna_bam          = RNA_ALIGN.out.smrna_bam
         ch_smrna_bai          = RNA_ALIGN.out.smrna_bai
+        ch_smrna_k1_bam       = RNA_ALIGN.out.smrna_k1_bam
+        ch_smrna_k1_bai       = RNA_ALIGN.out.smrna_k1_bai
         ch_transcript_bam     = RNA_ALIGN.out.transcript_bam
         ch_transcript_bai     = RNA_ALIGN.out.transcript_bai
         ch_bt_log             = RNA_ALIGN.out.bt_log
@@ -381,6 +385,11 @@ workflow CLIPSEQ {
             .join ( ch_smrna_bai.map { row -> [row[0].id, row ].flatten()} )
             .map { row -> [row[1], row[2], row[4]] }
 
+        ch_smrna_k1_bam_bai = ch_smrna_k1_bam
+            .map { row -> [row[0].id, row ].flatten()}
+            .join ( ch_smrna_k1_bai.map { row -> [row[0].id, row ].flatten()} )
+            .map { row -> [row[1], row[2], row[4]] }
+
         ch_transcript_bam_bai = ch_transcript_bam
             .map { row -> [row[0].id, row ].flatten()}
             .join ( ch_transcript_bai.map { row -> [row[0].id, row ].flatten()} )
@@ -407,6 +416,14 @@ workflow CLIPSEQ {
         )
         ch_versions   = ch_versions.mix(SMRNA_DEDUP.out.versions)
 
+        SMRNA_K1_DEDUP (
+            ch_smrna_k1_bam_bai
+        )
+        ch_versions     = ch_versions.mix(SMRNA_K1_DEDUP.out.versions)
+        ch_smrna_k1_bam = SMRNA_K1_DEDUP.out.bam
+        ch_smrna_k1_bai = SMRNA_K1_DEDUP.out.bai
+        ch_umi_log      = SMRNA_K1_DEDUP.out.umi_log
+
         /*
         * SUBWORKFLOW: Run umi deduplication on transcript-level alignments
         */
@@ -425,6 +442,18 @@ workflow CLIPSEQ {
     ch_trans_crosslink_coverage       = Channel.empty()
     ch_trans_crosslink_coverage_norm  = Channel.empty()
     if(params.run_calc_crosslinks) {
+        /*
+        * SUBWORKFLOW: Run crosslink calculation for smRNA with -k 1
+        */
+        CALC_SMRNA_K1_CROSSLINKS (
+            ch_smrna_k1_bam,
+            ch_smrna_fasta_fai.collect{ it[1] }
+        )
+        ch_versions                      = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions)
+        ch_smrna_crosslink_bed           = CALC_SMRNA_K1_CROSSLINKS.out.bed
+        ch_smrna_crosslink_coverage      = CALC_SMRNA_K1_CROSSLINKS.out.coverage
+        ch_smrna_crosslink_coverage_norm = CALC_SMRNA_K1_CROSSLINKS.out.coverage_norm
+
         /*
         * SUBWORKFLOW: Run crosslink calculation for genome
         */
@@ -489,6 +518,7 @@ workflow CLIPSEQ {
         * SUBWORKFLOW: Run iCount on genome-level crosslinks
         */
         ICOUNT_ANALYSE (
+            ch_smrna_crosslink_bed,
             ch_genome_crosslink_bed,
             ch_regions_resolved_gtf.collect{ it[1] },
             ch_seg_resolved_gtf.collect{ it[1] },
diff --git a/modules/goodwright/clipseq/merge_summary/main.nf b/modules/goodwright/clipseq/merge_summary/main.nf
new file mode 100644
index 0000000..966e244
--- /dev/null
+++ b/modules/goodwright/clipseq/merge_summary/main.nf
@@ -0,0 +1,29 @@
+process MERGE_SUMMARY {
+    tag "$meta.id" // tag by sample id; this process has no gtf input
+    label "process_single"
+
+    conda "conda-forge::pandas=1.4.3"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pandas:1.4.3':
+        'biocontainers/pandas:1.4.3' }"
+
+    input:
+    tuple val(meta), path(summary_type)             // iCount summary by type
+    tuple val(meta), path(summary_subtype)          // iCount summary by subtype
+    tuple val(meta), path(summary_gene)             // iCount summary by gene
+    tuple val(meta), path(smrna_premapped_k1_cDNA)  // smRNA premapped k1 crosslink BED
+
+    output:
+    tuple val(meta), path("*summary_type_premapadjusted.tsv")   , emit: summary_type_adjusted
+    tuple val(meta), path("*summary_subtype_premapadjusted.tsv"), emit: summary_subtype_adjusted
+    tuple val(meta), path("*summary_gene_premapadjusted.tsv")   , emit: summary_gene_adjusted
+    path "versions.yml"                                         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    shell:
+    process_name = task.process // passed to the template, which writes it into versions.yml
+    template 'merge_summary.py'
+}
+
diff --git a/modules/goodwright/clipseq/merge_summary/meta.yml b/modules/goodwright/clipseq/merge_summary/meta.yml
new file mode 100644
index 0000000..cdbb89e
--- /dev/null
+++ b/modules/goodwright/clipseq/merge_summary/meta.yml
@@ -0,0 +1,52 @@
+name: merge_summary
+description: Merge results of pre-mapping with results of iCount summary
+tools:
+  - pandas:
+      description: |
+        Flexible and powerful data analysis / manipulation library for Python,
+        providing labeled data structures similar to R data.frame objects,
+        statistical functions, and much more.
+      homepage: https://pandas.pydata.org/
+      documentation: https://pandas.pydata.org/docs/
+      licence: ["BSD-3"]
+input:
+  - summary_type:
+      type: file
+      description: Output from iCount Summary
+      pattern: "*.tsv"
+  - summary_subtype:
+      type: file
+      description: Output from iCount Summary
+      pattern: "*.tsv"
+  - summary_gene:
+      type: file
+      description: Output from iCount Summary
+      pattern: "*.tsv"
+  - smrna_premapped_k1_cDNA:
+      type: file
+      description: smRNA premapped k1 cDNA (deduplicated) bed file
+      pattern: "*.bed"
+  - smrna_premapped_k1_reads_log:
+      type: file
+      description: smRNA premapped k1 reads bowtie log to get read number before deduplication
+      pattern: "*.out"
+
+output:
+  - summary_type_adjusted:
+      type: file
+      description: Output from iCount Summary adjusted with pre-mapping
+      pattern: "*.tsv"
+  - summary_subtype_adjusted:
+      type: file
+      description: Output from iCount Summary adjusted with pre-mapping
+      pattern: "*.tsv"
+  - summary_gene_adjusted:
+      type: file
+      description: Output from iCount Summary adjusted with pre-mapping
+      pattern: "*.tsv"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@charlotteanne"
diff --git a/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py b/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py
new file mode 100644
index 0000000..2cc032a
--- /dev/null
+++ b/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+"""Merge pre-mapped results with genome-computed iCount Summary"""
+
+import platform
+import argparse
+from sys import exit
+import pandas as pd
+import os
+
+
+def dump_versions(process_name):
+    # versions.yml: the process name as key, python and pandas versions beneath it
+    versions = {"python": platform.python_version(), "pandas": pd.__version__}
+    with open("versions.yml", "w") as handle:
+        handle.write(process_name + ":\n" + "".join(f"    {k}: {v}\n" for k, v in versions.items()))
+
+def extract_cdna_from_bed(file_path):
+    # Sum the score column (5th, index 4); an empty BED means no premapped cDNAs
+    try:
+        return pd.read_csv(file_path, sep='\t', header=None)[4].sum()
+    except pd.errors.EmptyDataError:
+        return 0
+
+def adjust_summary_file(file_path, number_cdnas_premapped):
+    """Append a premapped rRNA/tRNA row to an iCount summary and rescale 'cDNA %'."""
+    summary = pd.read_csv(file_path, sep='\t', header=0)
+    # The extra row carries four values, one per summary column, and is
+    # added after the last existing row
+    premapped_row = ["premapped rRNA_tRNA", "NA", number_cdnas_premapped, 0]
+    summary.loc[len(summary)] = premapped_row
+    # Recompute every percentage against the new total, which now includes
+    # the premapped cDNAs
+    cdna_counts = summary['cDNA #']
+    summary['cDNA %'] = (cdna_counts / cdna_counts.sum()) * 100
+    # Output name is <input stem>_premapadjusted<input extension>, written
+    # relative to the current working directory
+    stem, suffix = os.path.splitext(os.path.basename(file_path))
+    output_file = stem + "_premapadjusted" + suffix
+    print(f"Saving to: {output_file}")  # Debugging
+    # Tab-separated, without the index column
+    summary.to_csv(output_file, sep='\t', index=False)
+
+def main(processname, subtype, type, gene, cdna):
+    """Write versions.yml, then adjust the type, subtype and gene summaries."""
+    dump_versions(processname)
+
+    # Total premapped cDNAs, taken from the score column of the BED file
+    premapped_total = extract_cdna_from_bed(cdna)
+    print("Number of cDNAs premapped: " + str(premapped_total))
+
+    # Same processing order as before: type, subtype, gene
+    for summary_path in (type, subtype, gene):
+        adjust_summary_file(summary_path, premapped_total)
+
+if __name__ == "__main__":
+    # Defaults are filled in by Nextflow templating; pass the arguments explicitly to run the script standalone
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--processname", default="!{process_name}")
+    parser.add_argument("--subtype", default="!{summary_subtype}")
+    parser.add_argument("--type", default="!{summary_type}")
+    parser.add_argument("--gene", default="!{summary_gene}")
+    parser.add_argument("--cdna", default="!{smrna_premapped_k1_cDNA}")
+    args = parser.parse_args()
+
+    main(args.processname, args.subtype, args.type, args.gene, args.cdna)
diff --git a/schema/clipseq.json b/schema/clipseq.json
index b9666d0..c56f845 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -353,7 +353,7 @@
             "name": "Crosslink summary",
             "description": "Crosslinks summarised by gene, type (eg. CDS, intron) and subtype (eg. lncRNA, mRNA).",
             "filetype": "tsv",
-            "process": "ICOUNT_SUMMARY"
+            "process": "MERGE_SUMMARY"
         },
         {
             "name": "K-mer enrichment",
diff --git a/subworkflows/goodwright/icount_analyse/main.nf b/subworkflows/goodwright/icount_analyse/main.nf
index b6e5ab5..2ec3ff6 100644
--- a/subworkflows/goodwright/icount_analyse/main.nf
+++ b/subworkflows/goodwright/icount_analyse/main.nf
@@ -5,15 +5,18 @@
 /*
 * MODULES
 */
+
 include { ICOUNT_SUMMARY          } from '../../../modules/goodwright/icount/summary/main.nf'
 include { ICOUNT_RNAMAPS          } from '../../../modules/goodwright/icount/rnamaps/main.nf'
 include { ICOUNT_SIGXLS           } from '../../../modules/goodwright/icount/sigxls/main.nf'
 include { ICOUNT_PEAKS            } from '../../../modules/goodwright/icount/peaks/main.nf'
 include { GUNZIP as GUNZIP_SIGXLS } from '../../../modules/nf-core/gunzip/main.nf'
 include { GUNZIP as GUNZIP_PEAKS  } from '../../../modules/nf-core/gunzip/main.nf'
+include { MERGE_SUMMARY           } from '../../../modules/goodwright/clipseq/merge_summary/main.nf'
 
 workflow ICOUNT_ANALYSE {
     take:
+    smrna_bed       // channel: [ val(meta), [ bed ] ]
     bed             // channel: [ val(meta), [ bed ] ]
     gtf_regions     // channel: [ [ gtf ] ]
     gtf_resolved    // channel: [ [ gtf.gz ] ]
@@ -23,14 +26,24 @@ workflow ICOUNT_ANALYSE {
     ch_versions = Channel.empty()
 
     /*
-    * MODULE: Run iCount summary
+    * MODULE: Run iCount summary 
     */
+ 
+
     ICOUNT_SUMMARY (
         bed,
         gtf_regions
     )
     ch_versions = ch_versions.mix(ICOUNT_SUMMARY.out.versions)
 
+    MERGE_SUMMARY (
+        ICOUNT_SUMMARY.out.summary_type,
+        ICOUNT_SUMMARY.out.summary_subtype,
+        ICOUNT_SUMMARY.out.summary_gene,
+        smrna_bed
+    )
+    ch_versions = ch_versions.mix(MERGE_SUMMARY.out.versions)
+
     /*
     * MODULE: Run iCount rnamaps
     */
diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf
index ec03724..ae1fc17 100644
--- a/subworkflows/goodwright/rna_align/main.nf
+++ b/subworkflows/goodwright/rna_align/main.nf
@@ -7,10 +7,13 @@
 * MODULES
 */ 
 include { BOWTIE_ALIGN                                } from '../../../modules/nf-core/bowtie/align/main.nf'
+include { BOWTIE_ALIGN as BOWTIE_ALIGN_K1             } from '../../../modules/nf-core/bowtie/align/main.nf'
 include { STAR_ALIGN                                  } from '../../../modules/nf-core/star/align/main.nf'
 include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT   } from '../../../modules/nf-core/samtools/sort/main'
 include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA        } from '../../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA_K1    } from '../../../modules/nf-core/samtools/sort/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA      } from '../../../modules/nf-core/samtools/index/main'
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA_K1   } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_GENOME     } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_GENOME       } from '../../../modules/nf-core/samtools/view/main'
@@ -42,6 +45,23 @@ workflow RNA_ALIGN {
     SAMTOOLS_INDEX_SMRNA ( SAMTOOLS_SORT_SMRNA.out.bam )
     ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA.out.versions)
 
+    /*
+    * MODULE: Align reads to smrna genome, here allowing 100 multimappers but only reporting one alignment per multimapped read
+    * so that we can accurately count it in the crosslink summary later
+    */
+
+    BOWTIE_ALIGN_K1 (
+        fastq,
+        bt_index.collect{it[1]}
+    )
+    ch_versions = ch_versions.mix(BOWTIE_ALIGN_K1.out.versions)
+
+    SAMTOOLS_SORT_SMRNA_K1 ( BOWTIE_ALIGN_K1.out.bam )
+    ch_versions = ch_versions.mix(SAMTOOLS_SORT_SMRNA_K1.out.versions)
+
+    SAMTOOLS_INDEX_SMRNA_K1 ( SAMTOOLS_SORT_SMRNA_K1.out.bam )
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA_K1.out.versions)
+
     /*
     * MODULE: Align reads that did not align to the smrna genome to the primary genome
     */
@@ -118,7 +138,6 @@ workflow RNA_ALIGN {
     )
 
     emit:
-    bt_bam              = BOWTIE_ALIGN.out.bam                            // channel: [ val(meta), [ bam ] ]
     bt_log              = BOWTIE_ALIGN.out.log                            // channel: [ val(meta), [ txt ] ]
     star_log            = STAR_ALIGN.out.log                              // channel: [ val(meta), [ txt ] ]
     star_log_final      = STAR_ALIGN.out.log_final                        // channel: [ val(meta), [ txt ] ]
@@ -130,5 +149,7 @@ workflow RNA_ALIGN {
     transcript_bai      = SAMTOOLS_VIEW_TRANSCRIPT.out.bai                // channel: [ val(meta), [ bai ] ]
     smrna_bam           = SAMTOOLS_SORT_SMRNA.out.bam                     // channel: [ val(meta), [ bam ] ]
     smrna_bai           = SAMTOOLS_INDEX_SMRNA.out.bai                    // channel: [ val(meta), [ bai ] ]
+    smrna_k1_bam        = SAMTOOLS_SORT_SMRNA_K1.out.bam                  // channel: [ val(meta), [ bam ] ]
+    smrna_k1_bai        = SAMTOOLS_INDEX_SMRNA_K1.out.bai                 // channel: [ val(meta), [ bai ] ]
     versions            = ch_versions                                     // channel: [ versions.yml ]
 }

From 614e7251b9c19ebc3f214755496ef38d47c4c3b8 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 28 Nov 2023 16:44:40 +0000
Subject: [PATCH 06/23] expose all trim galore args

---
 conf/modules.config | 2 +-
 nextflow.config     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 8d33bbd..0a876e8 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -225,7 +225,7 @@ if(params.run_trim_galore_fastqc && !params.skip_fastqc) {
 if(params.run_trim_galore_fastqc && !params.skip_trimming) {
     process {
         withName: 'CLIPSEQ:FASTQC_TRIMGALORE:TRIMGALORE' {
-            ext.args   = "--fastqc --length ${params.trim_length} -q 20"
+            ext.args   = "${params.trimgalore_params}"
             publishDir = [
                 [
                     path: { "${params.outdir}/01_prealign/post_trim_fastqc" },
diff --git a/nextflow.config b/nextflow.config
index f516083..6bbe635 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -72,9 +72,9 @@ params {
     move_umi_to_header  = false
     umi_header_format   = null
     save_unaligned      = true // DO NOT CHANGE
-    trim_length         = 10
     umi_separator       = "rbc:"
     paraclu_min_value   = 10
+    trimgalore_params   = "--fastqc --length 10 -q 20"
 	bowtie_params       = "-v 2 -m 100 --norc --best --strata"
     star_params         = "--outFilterMultimapNmax 100 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic"
     clippy_params       = ""

From fcb224e37979fdb9cae1aeb93f2f985d47db5b2b Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 28 Nov 2023 17:05:26 +0000
Subject: [PATCH 07/23] tidy up the output folders

---
 conf/modules.config | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 0a876e8..86b6467 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -268,7 +268,7 @@ if(params.run_alignment) {
                     path: { "${params.outdir}/02_alignment/smrna" },
                     mode: "${params.publish_dir_mode}",
                     pattern: '*.bam',
-                    enabled: params.save_align_intermed
+                    enabled: false
                 ],
                 [
                     path: { "${params.outdir}/02_alignment/smrna/unmapped" },
@@ -281,7 +281,7 @@ if(params.run_alignment) {
 
         withName: 'CLIPSEQ:RNA_ALIGN:BOWTIE_ALIGN_K1' {
             ext.args = { "-v 2 -m 100 --norc --best --strata -k 1" }
-            ext.prefix = "_withK1"
+            ext.prefix = { "${meta.id}_withK1" }
             publishDir = [
                 [
                     path: { "${params.outdir}/02_alignment/smrna" },
@@ -322,6 +322,18 @@ if(params.run_alignment) {
             ]
         }
 
+            withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_SMRNA_K1' {
+            publishDir = [
+                enabled: false
+            ]
+        }
+
+        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_SMRNA_K1' {
+            publishDir = [
+                enabled: false
+            ]
+        }
+
 
         withName: 'CLIPSEQ:RNA_ALIGN:STAR_ALIGN' {
             ext.args   = {  "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM ${params.star_params}"  }
@@ -355,6 +367,14 @@ if(params.run_alignment) {
             ]
         }
 
+        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' {
+            publishDir = [
+                path: { "${params.outdir}/02_alignment/genome" },
+                mode: "${params.publish_dir_mode}",
+                enabled: params.save_align_intermed
+            ]
+        }
+
         withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_TRANSCRIPT' {
             ext.prefix = { "${meta.id}_multi.Aligned.toTranscriptome_sorted.out" }
             publishDir = [

From f49ba741ba890017d02733349600e366b9251a03 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 28 Nov 2023 17:07:07 +0000
Subject: [PATCH 08/23] update flow schema

---
 schema/clipseq.json | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/schema/clipseq.json b/schema/clipseq.json
index b79b832..1068df9 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -308,14 +308,19 @@
             "description": "Provide customised command line options to specific processes",
             "advanced": true,
             "properties": {
+                "trimgalore_params": {
+                    "name": "Trim Galore! parameters",
+                    "description": "Parameters for Trim Galore! trimming. Defaults are -q 20 and minimum length 10 to keep reads.",
+                    "type": "string"
+                },
                 "bowtie_params": {
                     "name": "Bowtie parameters",
-                    "description": "Mapping parameters for Bowtie pre-mapping",
+                    "description": "Mapping parameters for Bowtie pre-mapping. Do not touch unless you know what you are doing!",
                     "type": "string"
                 },
                 "star_params": {
                     "name": "STAR parameters",
-                    "description": "Mapping parameters for STAR mapping",
+                    "description": "Mapping parameters for STAR mapping. Do not touch unless you know what you are doing!",
                     "type": "string"
                 },
                 "clippy_params": {

From de15a01bf1056a37b108e99ae494fda484b733f9 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 28 Nov 2023 17:12:32 +0000
Subject: [PATCH 09/23] get rid of min trim from schema, doesn't exist anymore

---
 schema/clipseq.json | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/schema/clipseq.json b/schema/clipseq.json
index 1068df9..2ca3536 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -270,11 +270,6 @@
             "description": "Additional pipeline configuration options.",
             "advanced": true,
             "properties": {
-                "trim_length": {
-                    "name": "Minimum trim length.",
-                    "description": "Minimum length of read to keep after Trim Galore! trimming.",
-                    "type": "number"
-                },
                 "move_umi_to_header": {
                     "name": "Extract UMI to header",
                     "description": "Runs UMI to header extraction based on the head format provided in UMI header format.",

From 6d3cd6734344f221d96fc09f89cf934aa27e668a Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 28 Nov 2023 19:33:37 +0000
Subject: [PATCH 10/23] test for icount summary publishing

---
 conf/modules.config | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index 86b6467..258e24a 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -805,7 +805,10 @@ if(params.run_peak_calling) {
 
         withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SUMMARY' {
             publishDir = [
-                enabled: false
+                enabled: false,
+                path: null,
+                mode: null,
+                saveAs: null
             ]
         }
 

From bf7db756eb22095fcadc1e9e51fffa5bc52ae954 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Thu, 30 Nov 2023 17:57:13 +0000
Subject: [PATCH 11/23] use samtools to sort multimapped genome bam instead of
 star

to fix memory issues
---
 conf/modules.config                       | 16 +++++++++++-----
 subworkflows/goodwright/rna_align/main.nf | 11 +++++++----
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 258e24a..9301b85 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -336,7 +336,7 @@ if(params.run_alignment) {
 
 
         withName: 'CLIPSEQ:RNA_ALIGN:STAR_ALIGN' {
-            ext.args   = {  "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM ${params.star_params}"  }
+            ext.args   = {  "--readFilesCommand zcat --outSAMtype BAM Unsorted --quantMode TranscriptomeSAM ${params.star_params}"  }
             ext.prefix = { "${meta.id}_multi" }
             publishDir = [
                 [
@@ -367,6 +367,15 @@ if(params.run_alignment) {
             ]
         }
 
+        withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_GENOME' {
+            ext.prefix = { "${meta.id}_multi.Aligned.toGenome_sorted.out" }
+            publishDir = [
+                path: { "${params.outdir}/02_alignment/genome" },
+                mode: "${params.publish_dir_mode}",
+                enabled: params.save_align_intermed
+            ]
+        }
+
         withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' {
             publishDir = [
                 path: { "${params.outdir}/02_alignment/genome" },
@@ -805,10 +814,7 @@ if(params.run_peak_calling) {
 
         withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SUMMARY' {
             publishDir = [
-                enabled: false,
-                path: null,
-                mode: null,
-                saveAs: null
+                enabled: false
             ]
         }
 
diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf
index ae1fc17..f2c67a9 100644
--- a/subworkflows/goodwright/rna_align/main.nf
+++ b/subworkflows/goodwright/rna_align/main.nf
@@ -9,9 +9,10 @@
 include { BOWTIE_ALIGN                                } from '../../../modules/nf-core/bowtie/align/main.nf'
 include { BOWTIE_ALIGN as BOWTIE_ALIGN_K1             } from '../../../modules/nf-core/bowtie/align/main.nf'
 include { STAR_ALIGN                                  } from '../../../modules/nf-core/star/align/main.nf'
+include { SAMTOOLS_SORT as SAMTOOLS_SORT_GENOME       } from '../../../modules/nf-core/samtools/sort/main'
 include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT   } from '../../../modules/nf-core/samtools/sort/main'
 include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA        } from '../../../modules/nf-core/samtools/sort/main'
-include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA_K1    } from '../../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA_K1     } from '../../../modules/nf-core/samtools/sort/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA      } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA_K1   } from '../../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main'
@@ -75,10 +76,12 @@ workflow RNA_ALIGN {
     )
     ch_versions = ch_versions.mix(STAR_ALIGN.out.versions)
 
+    SAMTOOLS_SORT_GENOME ( STAR_ALIGN.out.bam )
+
     /*
     * MODULE: Index genome-level BAM file 
     */
-    SAMTOOLS_INDEX_GENOME ( STAR_ALIGN.out.bam_sorted )
+    SAMTOOLS_INDEX_GENOME ( SAMTOOLS_SORT_GENOME.out.bam )
     ch_versions = ch_versions.mix(SAMTOOLS_INDEX_GENOME.out.versions.first())
 
     /*
@@ -94,7 +97,7 @@ workflow RNA_ALIGN {
     /*
     * CHANNEL: Join bam and bai files
     */
-    ch_bam_bai = STAR_ALIGN.out.bam_sorted
+    ch_bam_bai = SAMTOOLS_SORT_GENOME.out.bam
         .join(SAMTOOLS_INDEX_GENOME.out.bai, by: [0], remainder: true)
         .join(SAMTOOLS_INDEX_GENOME.out.csi, by: [0], remainder: true)
         .map {
@@ -143,7 +146,7 @@ workflow RNA_ALIGN {
     star_log_final      = STAR_ALIGN.out.log_final                        // channel: [ val(meta), [ txt ] ]
     genome_unique_bam   = SAMTOOLS_VIEW_GENOME.out.bam                    // channel: [ val(meta), [ bam ] ]
     genome_unique_bai   = SAMTOOLS_VIEW_GENOME.out.bai                    // channel: [ val(meta), [ bai ] ]
-    genome_multi_bam    = STAR_ALIGN.out.bam_sorted                       // channel: [ val(meta), [ bam ] ]
+    genome_multi_bam    = SAMTOOLS_SORT_GENOME.out.bam                    // channel: [ val(meta), [ bam ] ]
     genome_multi_bai    = SAMTOOLS_INDEX_GENOME.out.bai                   // channel: [ val(meta), [ bai ] ]
     transcript_bam      = SAMTOOLS_VIEW_TRANSCRIPT.out.bam                // channel: [ val(meta), [ bam ] ]
     transcript_bai      = SAMTOOLS_VIEW_TRANSCRIPT.out.bai                // channel: [ val(meta), [ bai ] ]

From 1d0aa3a113c27aeeec549dec8c5002c2b576f2e8 Mon Sep 17 00:00:00 2001
From: slbai01 <slbai01@foxmail.com>
Date: Thu, 29 Feb 2024 22:16:06 +0800
Subject: [PATCH 12/23] avoid UMICOLLAPSE Error: java.lang.StackOverflowError

---
 modules/goodwright/umicollapse/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf
index 355dcb8..8c2e7df 100644
--- a/modules/goodwright/umicollapse/main.nf
+++ b/modules/goodwright/umicollapse/main.nf
@@ -20,7 +20,7 @@ process UMICOLLAPSE {
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """
-    java -jar /UMICollapse/umicollapse.jar \\
+    java -Xss1G -jar /UMICollapse/umicollapse.jar \\
         bam \\
         -i $bam \\
         -o ${prefix}.bam \\

From 591e8fb23cce2c6b1207f660ed2b9c408b90880c Mon Sep 17 00:00:00 2001
From: slbai01 <slbai01@foxmail.com>
Date: Thu, 29 Feb 2024 22:18:21 +0800
Subject: [PATCH 13/23] avoid samtools Error: Argument list too long

---
 modules/goodwright/samtools/simple_view/main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/goodwright/samtools/simple_view/main.nf b/modules/goodwright/samtools/simple_view/main.nf
index 945661a..f55b1b9 100644
--- a/modules/goodwright/samtools/simple_view/main.nf
+++ b/modules/goodwright/samtools/simple_view/main.nf
@@ -5,7 +5,8 @@ process SAMTOOLS_SIMPLE_VIEW {
     conda "bioconda::samtools=1.16.1"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' :
-        'biocontainers/samtools:1.16.1--h6899075_1' }"
+        'mgibio/samtools-cwl:1.16.1' }"
+//         'biocontainers/samtools:1.16.1--h6899075_1' }"
 
     input:
     tuple val(meta), path(input), path(index)
@@ -35,14 +36,13 @@ process SAMTOOLS_SIMPLE_VIEW {
                     input.getExtension()
     if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
     """
-    samtools \\
+    xargs --arg-file=longest_transcript.txt samtools \\
         view \\
         --threads ${task.cpus-1} \\
         ${reference} \\
         $args \\
         -o ${prefix}.${file_type} \\
         $input \\
-        `cat ${filter_file}` \\
         $args2
 
     cat <<-END_VERSIONS > versions.yml

From 82203b4f5185fab7357c2b33f321c5d6d91cd58e Mon Sep 17 00:00:00 2001
From: slbai01 <slbai01@foxmail.com>
Date: Thu, 29 Feb 2024 22:27:57 +0800
Subject: [PATCH 14/23] Fix filename absolute references

---
 modules/goodwright/samtools/simple_view/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/goodwright/samtools/simple_view/main.nf b/modules/goodwright/samtools/simple_view/main.nf
index f55b1b9..bb94cfb 100644
--- a/modules/goodwright/samtools/simple_view/main.nf
+++ b/modules/goodwright/samtools/simple_view/main.nf
@@ -36,7 +36,7 @@ process SAMTOOLS_SIMPLE_VIEW {
                     input.getExtension()
     if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
     """
-    xargs --arg-file=longest_transcript.txt samtools \\
+    xargs --arg-file=${filter_file} samtools \\
         view \\
         --threads ${task.cpus-1} \\
         ${reference} \\

From 720a90a4fd30e35ef6a57a304ab5a150f868d986 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Wed, 10 Apr 2024 11:06:16 +0100
Subject: [PATCH 15/23] update peka outputs

---
 modules/goodwright/peka/main.nf | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/modules/goodwright/peka/main.nf b/modules/goodwright/peka/main.nf
index 82c833f..127274d 100644
--- a/modules/goodwright/peka/main.nf
+++ b/modules/goodwright/peka/main.nf
@@ -16,11 +16,14 @@ process PEKA {
     path gtf
 
     output:
-    tuple val(meta), path("*mer_cluster_distribution*"), emit: cluster,      optional: true
-    tuple val(meta), path("*mer_distribution*")        , emit: distribution, optional: true
-    tuple val(meta), path("*rtxn*")                    , emit: rtxn,         optional: true
-    tuple val(meta), path("*.pdf")                     , emit: pdf,          optional: true
-    path "versions.yml"                                , emit: versions
+    tuple val(meta), path("*mer_cluster_distribution*")    , emit: cluster,      optional: true
+    tuple val(meta), path("*mer_distribution*")            , emit: distribution, optional: true
+    tuple val(meta), path("*rtxn*")                        , emit: rtxn,         optional: true
+    tuple val(meta), path("*.pdf")                         , emit: pdf,          optional: true
+    tuple val(meta), path("*thresholded_sites*.bed.gz")    , emit: tsites,       optional: true
+    tuple val(meta), path("*oxn*.bed.gz")                  , emit: oxn,          optional: true
+    tuple val(meta), path("*_clusters.csv")                , emit: clust,        optional: true
+    path "versions.yml"                                    , emit: versions
 
     when:
     task.ext.when == null || task.ext.when

From 653b3453e360db2eed6594f461eae61b9f4fff00 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <CharlotteAnne@users.noreply.github.com>
Date: Wed, 24 Apr 2024 16:19:17 +0100
Subject: [PATCH 16/23] revert changes on samtools simple view

---
 modules/goodwright/samtools/simple_view/main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/goodwright/samtools/simple_view/main.nf b/modules/goodwright/samtools/simple_view/main.nf
index bb94cfb..945661a 100644
--- a/modules/goodwright/samtools/simple_view/main.nf
+++ b/modules/goodwright/samtools/simple_view/main.nf
@@ -5,8 +5,7 @@ process SAMTOOLS_SIMPLE_VIEW {
     conda "bioconda::samtools=1.16.1"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' :
-        'mgibio/samtools-cwl:1.16.1' }"
-//         'biocontainers/samtools:1.16.1--h6899075_1' }"
+        'biocontainers/samtools:1.16.1--h6899075_1' }"
 
     input:
     tuple val(meta), path(input), path(index)
@@ -36,13 +35,14 @@ process SAMTOOLS_SIMPLE_VIEW {
                     input.getExtension()
     if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
     """
-    xargs --arg-file=${filter_file} samtools \\
+    samtools \\
         view \\
         --threads ${task.cpus-1} \\
         ${reference} \\
         $args \\
         -o ${prefix}.${file_type} \\
         $input \\
+        `cat ${filter_file}` \\
         $args2
 
     cat <<-END_VERSIONS > versions.yml

From 461acb770b8457fca02cf91b139e93f9e67654a4 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <CharlotteAnne@users.noreply.github.com>
Date: Fri, 26 Apr 2024 14:41:11 +0100
Subject: [PATCH 17/23] update umicollapse

---
 modules/goodwright/umicollapse/main.nf | 61 ++++++++++++++++++++------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf
index 8c2e7df..bdd2a5b 100644
--- a/modules/goodwright/umicollapse/main.nf
+++ b/modules/goodwright/umicollapse/main.nf
@@ -1,16 +1,21 @@
 process UMICOLLAPSE {
     tag "$meta.id"
     label "process_high"
+    label "process_high_memory"
 
-    container 'docker.io/elly1502/umicollapse:latest'
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/umicollapse:1.0.0--hdfd78af_1' :
+        'biocontainers/umicollapse:1.0.0--hdfd78af_1' }"
 
     input:
-    tuple val(meta), path(bam), path(bai)
+    tuple val(meta), path(input), path(bai)
 
     output:
-    tuple val(meta), path("*.bam"), emit: bam
-    tuple val(meta), path("*.log"), emit: log
-    path  "versions.yml"          , emit: versions
+    tuple val(meta), path("*.bam"),                   emit: bam,        optional: true
+    tuple val(meta), path("*dedup*fastq.gz"),         emit: fastq,      optional: true
+    tuple val(meta), path("*_UMICollapse.log"),       emit: log
+    path  "versions.yml" ,                            emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -18,19 +23,49 @@ process UMICOLLAPSE {
     script:
     def args   = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-
+    def VERSION = '1.0.0-1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    // Memory allocation: We need to make sure that both heap and stack size is sufficiently large for
+    // umicollapse. We set the stack size to 5% of the available memory, the heap size to 90%
+    // which leaves 5% for stuff happening outside of java without the scheduler killing the process.
+    def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue()
+    def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max
+    if ( mode !in [ 'fastq', 'bam' ] ) {
+        error "Mode must be one of 'fastq' or 'bam'."
+    }
+    extension = mode.contains("fastq") ? "fastq.gz" : "bam"
     """
-    java -Xss1G -jar /UMICollapse/umicollapse.jar \\
-        bam \\
-        -i $bam \\
-        -o ${prefix}.bam \\
-        $args
+    # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated
+    # by conda that allows to set the heap size (Xmx), but not the stack size (Xss).
+    # `which` allows us to get the directory that contains `umicollapse`, independent of whether we
+    # are in a container or conda environment.
+    UMICOLLAPSE_JAR=\$(dirname \$(which umicollapse))/../share/umicollapse-${VERSION}/umicollapse.jar
+    java \\
+        -Xmx${max_heap_size_mega}M \\
+        -Xss${max_stack_size_mega}M \\
+        -jar \$UMICOLLAPSE_JAR \\
+        -i ${input} \\
+        -o ${prefix}.${extension} \\
+        $args | tee ${prefix}_UMICollapse.log
 
-    mv .command.log ${prefix}_UMICollapse.log
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        umicollapse: $VERSION
+    END_VERSIONS
+    """
 
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '1.0.0-1'
+    if ( mode !in [ 'fastq', 'bam' ] ) {
+        error "Mode must be one of 'fastq' or 'bam'."
+    }
+    extension = mode.contains("fastq") ? "fastq.gz" : "bam"
+    """
+    touch ${prefix}.dedup.${extension}
+    touch ${prefix}_UMICollapse.log
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        umicollapse: NA
+        umicollapse: $VERSION
     END_VERSIONS
     """
 }

From 84e81e3f6adffc3fe9930ba53f8cd345e493d184 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <CharlotteAnne@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:41:32 +0100
Subject: [PATCH 18/23] Update main.nf

---
 modules/goodwright/umicollapse/main.nf | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf
index bdd2a5b..ad26a21 100644
--- a/modules/goodwright/umicollapse/main.nf
+++ b/modules/goodwright/umicollapse/main.nf
@@ -29,10 +29,6 @@ process UMICOLLAPSE {
     // which leaves 5% for stuff happening outside of java without the scheduler killing the process.
     def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue()
     def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max
-    if ( mode !in [ 'fastq', 'bam' ] ) {
-        error "Mode must be one of 'fastq' or 'bam'."
-    }
-    extension = mode.contains("fastq") ? "fastq.gz" : "bam"
     """
     # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated
     # by conda that allows to set the heap size (Xmx), but not the stack size (Xss).

From 8837d1ea65bf89f1ef0fc3e98f3a452ad5b77250 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <CharlotteAnne@users.noreply.github.com>
Date: Fri, 26 Apr 2024 18:25:41 +0100
Subject: [PATCH 19/23] Update main.nf

---
 modules/goodwright/umicollapse/main.nf | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf
index ad26a21..85d8202 100644
--- a/modules/goodwright/umicollapse/main.nf
+++ b/modules/goodwright/umicollapse/main.nf
@@ -40,7 +40,7 @@ process UMICOLLAPSE {
         -Xss${max_stack_size_mega}M \\
         -jar \$UMICOLLAPSE_JAR \\
         -i ${input} \\
-        -o ${prefix}.${extension} \\
+        -o ${prefix}.bam \\
         $args | tee ${prefix}_UMICollapse.log
 
     cat <<-END_VERSIONS > versions.yml
@@ -55,9 +55,8 @@ process UMICOLLAPSE {
     if ( mode !in [ 'fastq', 'bam' ] ) {
         error "Mode must be one of 'fastq' or 'bam'."
     }
-    extension = mode.contains("fastq") ? "fastq.gz" : "bam"
     """
-    touch ${prefix}.dedup.${extension}
+    touch ${prefix}.dedup.bam
     touch ${prefix}_UMICollapse.log
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From 3f9b9fc40679536a3003096e1352a0c34755dd68 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <CharlotteAnne@users.noreply.github.com>
Date: Fri, 26 Apr 2024 20:17:13 +0100
Subject: [PATCH 20/23] Update main.nf

---
 modules/goodwright/umicollapse/main.nf | 54 ++++++--------------------
 1 file changed, 12 insertions(+), 42 deletions(-)

diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf
index 85d8202..1e5f171 100644
--- a/modules/goodwright/umicollapse/main.nf
+++ b/modules/goodwright/umicollapse/main.nf
@@ -1,21 +1,16 @@
 process UMICOLLAPSE {
     tag "$meta.id"
     label "process_high"
-    label "process_high_memory"
 
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/umicollapse:1.0.0--hdfd78af_1' :
-        'biocontainers/umicollapse:1.0.0--hdfd78af_1' }"
+    container 'docker.io/elly1502/umicollapse:latest'
 
     input:
-    tuple val(meta), path(input), path(bai)
+    tuple val(meta), path(bam), path(bai)
 
     output:
-    tuple val(meta), path("*.bam"),                   emit: bam,        optional: true
-    tuple val(meta), path("*dedup*fastq.gz"),         emit: fastq,      optional: true
-    tuple val(meta), path("*_UMICollapse.log"),       emit: log
-    path  "versions.yml" ,                            emit: versions
+    tuple val(meta), path("*.bam"), emit: bam
+    tuple val(meta), path("*.log"), emit: log
+    path  "versions.yml"          , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -23,44 +18,19 @@ process UMICOLLAPSE {
     script:
     def args   = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def VERSION = '1.0.0-1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
-    // Memory allocation: We need to make sure that both heap and stack size is sufficiently large for
-    // umicollapse. We set the stack size to 5% of the available memory, the heap size to 90%
-    // which leaves 5% for stuff happening outside of java without the scheduler killing the process.
-    def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue()
-    def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max
+
     """
-    # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated
-    # by conda that allows to set the heap size (Xmx), but not the stack size (Xss).
-    # `which` allows us to get the directory that contains `umicollapse`, independent of whether we
-    # are in a container or conda environment.
-    UMICOLLAPSE_JAR=\$(dirname \$(which umicollapse))/../share/umicollapse-${VERSION}/umicollapse.jar
-    java \\
-        -Xmx${max_heap_size_mega}M \\
-        -Xss${max_stack_size_mega}M \\
-        -jar \$UMICOLLAPSE_JAR \\
-        -i ${input} \\
+    java -Xmx184320M -jar /UMICollapse/umicollapse.jar \\
+        bam \\
+        -i $bam \\
         -o ${prefix}.bam \\
-        $args | tee ${prefix}_UMICollapse.log
+        $args
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        umicollapse: $VERSION
-    END_VERSIONS
-    """
+    mv .command.log ${prefix}_UMICollapse.log
 
-    stub:
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    def VERSION = '1.0.0-1'
-    if ( mode !in [ 'fastq', 'bam' ] ) {
-        error "Mode must be one of 'fastq' or 'bam'."
-    }
-    """
-    touch ${prefix}.dedup.bam
-    touch ${prefix}_UMICollapse.log
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        umicollapse: $VERSION
+        umicollapse: NA
     END_VERSIONS
     """
 }

From 35f883a56c59e547a992731984ab2a4da54b1ffd Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 29 Apr 2024 11:35:12 +0100
Subject: [PATCH 21/23] update to using nf-core umicollapse module

---
 modules.json                                  |   5 +
 modules/goodwright/umicollapse/main.nf        |   4 +-
 .../clipseq/merge_summary/main.nf             |   0
 .../clipseq/merge_summary/meta.yml            |   0
 .../merge_summary/templates/merge_summary.py  |   0
 modules/nf-core/umicollapse/environment.yml   |   7 +
 modules/nf-core/umicollapse/main.nf           |  73 +++++
 modules/nf-core/umicollapse/meta.yml          |  63 +++++
 .../nf-core/umicollapse/tests/main.nf.test    | 249 ++++++++++++++++++
 .../umicollapse/tests/main.nf.test.snap       | 124 +++++++++
 .../nf-core/umicollapse/tests/nextflow.config |   8 +
 .../umicollapse/tests/nextflow_PE.config      |  10 +
 .../umicollapse/tests/nextflow_SE.config      |  10 +
 modules/nf-core/umicollapse/tests/tags.yml    |   2 +
 .../bam_dedup_samtools_umitools/main.nf       |   5 +-
 .../goodwright/icount_analyse/main.nf         |   2 +-
 16 files changed, 557 insertions(+), 5 deletions(-)
 rename modules/{goodwright => local}/clipseq/merge_summary/main.nf (100%)
 rename modules/{goodwright => local}/clipseq/merge_summary/meta.yml (100%)
 rename modules/{goodwright => local}/clipseq/merge_summary/templates/merge_summary.py (100%)
 create mode 100644 modules/nf-core/umicollapse/environment.yml
 create mode 100644 modules/nf-core/umicollapse/main.nf
 create mode 100644 modules/nf-core/umicollapse/meta.yml
 create mode 100644 modules/nf-core/umicollapse/tests/main.nf.test
 create mode 100644 modules/nf-core/umicollapse/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/umicollapse/tests/nextflow.config
 create mode 100644 modules/nf-core/umicollapse/tests/nextflow_PE.config
 create mode 100644 modules/nf-core/umicollapse/tests/nextflow_SE.config
 create mode 100644 modules/nf-core/umicollapse/tests/tags.yml

diff --git a/modules.json b/modules.json
index e450015..3519b29 100644
--- a/modules.json
+++ b/modules.json
@@ -196,6 +196,11 @@
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
                     },
+                    "umicollapse": {
+                        "branch": "master",
+                        "git_sha": "b97197968ac12dde2463fa54541f6350c46f2035",
+                        "installed_by": ["modules"]
+                    },
                     "umitools/extract": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf
index 1e5f171..1090815 100644
--- a/modules/goodwright/umicollapse/main.nf
+++ b/modules/goodwright/umicollapse/main.nf
@@ -1,6 +1,6 @@
 process UMICOLLAPSE {
     tag "$meta.id"
-    label "process_high"
+    label "process_medium"
 
     container 'docker.io/elly1502/umicollapse:latest'
 
@@ -20,7 +20,7 @@ process UMICOLLAPSE {
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """
-    java -Xmx184320M -jar /UMICollapse/umicollapse.jar \\
+    java -jar /UMICollapse/umicollapse.jar \\
         bam \\
         -i $bam \\
         -o ${prefix}.bam \\
diff --git a/modules/goodwright/clipseq/merge_summary/main.nf b/modules/local/clipseq/merge_summary/main.nf
similarity index 100%
rename from modules/goodwright/clipseq/merge_summary/main.nf
rename to modules/local/clipseq/merge_summary/main.nf
diff --git a/modules/goodwright/clipseq/merge_summary/meta.yml b/modules/local/clipseq/merge_summary/meta.yml
similarity index 100%
rename from modules/goodwright/clipseq/merge_summary/meta.yml
rename to modules/local/clipseq/merge_summary/meta.yml
diff --git a/modules/goodwright/clipseq/merge_summary/templates/merge_summary.py b/modules/local/clipseq/merge_summary/templates/merge_summary.py
similarity index 100%
rename from modules/goodwright/clipseq/merge_summary/templates/merge_summary.py
rename to modules/local/clipseq/merge_summary/templates/merge_summary.py
diff --git a/modules/nf-core/umicollapse/environment.yml b/modules/nf-core/umicollapse/environment.yml
new file mode 100644
index 0000000..8dbc65d
--- /dev/null
+++ b/modules/nf-core/umicollapse/environment.yml
@@ -0,0 +1,7 @@
+name: umicollapse
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::umicollapse=1.0.0
diff --git a/modules/nf-core/umicollapse/main.nf b/modules/nf-core/umicollapse/main.nf
new file mode 100644
index 0000000..dae290e
--- /dev/null
+++ b/modules/nf-core/umicollapse/main.nf
@@ -0,0 +1,73 @@
+process UMICOLLAPSE {
+    tag "$meta.id"
+    label "process_high"
+    label "process_high_memory"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/umicollapse:1.0.0--hdfd78af_1' :
+        'biocontainers/umicollapse:1.0.0--hdfd78af_1' }"
+
+    input:
+    tuple val(meta), path(input), path(bai)
+    val(mode)
+
+    output:
+    tuple val(meta), path("*.bam"),                   emit: bam,        optional: true
+    tuple val(meta), path("*dedup*fastq.gz"),         emit: fastq,      optional: true
+    tuple val(meta), path("*_UMICollapse.log"),       emit: log
+    path  "versions.yml" ,                            emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '1.0.0-1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    // Memory allocation: both the Java heap and the thread stack must be large enough for umicollapse.
+    // The heap (-Xmx) is set to 90% of task.memory, leaving 10% outside the heap so the scheduler
+    // does not kill the process; the stack (-Xss) is a fixed 999 MB, not a share of task.memory.
+    def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue()
+    def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max
+    if ( mode !in [ 'fastq', 'bam' ] ) {
+        error "Mode must be one of 'fastq' or 'bam'."
+    }
+    extension = mode.contains("fastq") ? "fastq.gz" : "bam"
+    """
+    # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated
+    # by conda that allows to set the heap size (Xmx), but not the stack size (Xss).
+    # `which` allows us to get the directory that contains `umicollapse`, independent of whether we
+    # are in a container or conda environment.
+    UMICOLLAPSE_JAR=\$(dirname \$(which umicollapse))/../share/umicollapse-${VERSION}/umicollapse.jar
+    java \\
+        -Xmx${max_heap_size_mega}M \\
+        -Xss${max_stack_size_mega}M \\
+        -jar \$UMICOLLAPSE_JAR \\
+        $mode \\
+        -i ${input} \\
+        -o ${prefix}.${extension} \\
+        $args | tee ${prefix}_UMICollapse.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        umicollapse: $VERSION
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '1.0.0-1'
+    if ( mode !in [ 'fastq', 'bam' ] ) {
+        error "Mode must be one of 'fastq' or 'bam'."
+    }
+    extension = mode.contains("fastq") ? "fastq.gz" : "bam"
+    """
+    touch ${prefix}.dedup.${extension}
+    touch ${prefix}_UMICollapse.log
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        umicollapse: $VERSION
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/umicollapse/meta.yml b/modules/nf-core/umicollapse/meta.yml
new file mode 100644
index 0000000..c1361f9
--- /dev/null
+++ b/modules/nf-core/umicollapse/meta.yml
@@ -0,0 +1,63 @@
+---
+name: "umicollapse"
+description: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
+keywords:
+  - umicollapse
+  - deduplication
+  - genomics
+tools:
+  - "umicollapse":
+      description: "UMICollapse contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs)."
+      homepage: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse"
+      documentation: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse"
+      tool_dev_url: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse"
+      doi: "10.7717/peerj.8275"
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: |
+        BAM file containing reads to be deduplicated via UMIs (or a FastQ file when mode is fastq).
+      pattern: "*.{bam}"
+  - bai:
+      type: file
+      description: |
+        BAM index files corresponding to the input BAM file. Optionally can be skipped using [] when using FastQ input.
+      pattern: "*.{bai}"
+  - mode:
+      type: string
+      description: |
+        Selects the mode of UMICollapse; must be either fastq or bam.
+      pattern: "{fastq,bam}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM file with deduplicated UMIs.
+      pattern: "*.{bam}"
+  - log:
+      type: file
+      description: A log file with the deduplication statistics.
+      pattern: "*_{UMICollapse.log}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@CharlotteAnne"
+  - "@chris-cheshire"
+maintainers:
+  - "@CharlotteAnne"
+  - "@chris-cheshire"
+  - "@apeltzer"
+  - "@MatthiasZepper"
diff --git a/modules/nf-core/umicollapse/tests/main.nf.test b/modules/nf-core/umicollapse/tests/main.nf.test
new file mode 100644
index 0000000..2dec45b
--- /dev/null
+++ b/modules/nf-core/umicollapse/tests/main.nf.test
@@ -0,0 +1,249 @@
+nextflow_process {
+
+    name "Test Process UMICOLLAPSE"
+    script "../main.nf"
+    process "UMICOLLAPSE"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "umicollapse"
+    tag "umitools/extract"
+    tag "samtools/index"
+    tag "bwa/index"
+    tag "bwa/mem"
+
+    test("umicollapse single end test") {
+        setup{
+            run("UMITOOLS_EXTRACT"){
+                script "../../umitools/extract/main.nf"
+                config "./nextflow_SE.config"
+                process{
+                    """
+                    input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    [
+                        file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
+                        file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
+                    ]
+                    ]
+                    """
+                }
+            }
+
+            run("BWA_INDEX"){
+                script "../../bwa/index/main.nf"
+                process{
+                    """
+                    input[0] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]
+                    """
+                }
+            }
+            run("BWA_MEM"){
+                script "../../bwa/mem/main.nf"
+                process{
+                    """
+                    input[0] = UMITOOLS_EXTRACT.out.reads
+                    input[1] = BWA_INDEX.out.index
+                    input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]
+                    input[3] = true
+                    """
+                }
+            }
+            run("SAMTOOLS_INDEX"){
+                script "../../samtools/index/main.nf"
+                process{
+                    """
+                    input[0] = BWA_MEM.out.bam
+                    """
+                }
+            }
+        }
+
+        when {
+            config "./nextflow_SE.config"
+            process {
+                """
+                input[0] =  BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0])
+                input[1] = 'bam'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.bam,
+                    process.out.versions).match() }
+            )
+        }
+
+    }
+
+    test("umicollapse paired tests") {
+            setup{
+            run("UMITOOLS_EXTRACT"){
+                script "../../umitools/extract/main.nf"
+                config "./nextflow_PE.config"
+                process{
+                    """
+                    input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    [
+                        file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
+                        file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
+                    ]
+                    ]
+                    """
+                }
+            }
+
+            run("BWA_INDEX"){
+                script "../../bwa/index/main.nf"
+                process{
+                    """
+                    input[0] = [
+                            [ id:'sarscov2'],
+                            file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                            ]
+                    """
+                }
+            }
+            run("BWA_MEM"){
+                script "../../bwa/mem/main.nf"
+                process{
+                    """
+                    input[0] = UMITOOLS_EXTRACT.out.reads
+                    input[1] = BWA_INDEX.out.index
+                    input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]
+                    input[3] = true
+                    """
+                }
+            }
+            run("SAMTOOLS_INDEX"){
+                script "../../samtools/index/main.nf"
+                process{
+                    """
+                    input[0] = BWA_MEM.out.bam
+                    """
+                }
+            }
+        }
+
+        when {
+            config "./nextflow_PE.config"
+            process {
+                """
+                input[0] =  BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0])
+                input[1] = 'bam'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.bam,
+                    process.out.versions).match() }
+            )
+        }
+
+    }
+
+    test("umicollapse fastq tests") {
+
+            when {
+                config "./nextflow_SE.config"
+                process {
+                    """
+                    input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
+                    []
+                    ]
+                    input[1] = 'fastq'
+                    """
+                }
+            }
+
+            then {
+                assertAll(
+                    { assert process.success },
+                    { assert snapshot(
+                        process.out.fastq,
+                        process.out.versions).match() }
+                )
+            }
+    }
+
+    test("umicollapse stub tests") {
+        options "-stub-run"
+            setup{
+            run("UMITOOLS_EXTRACT"){
+                script "../../umitools/extract/main.nf"
+                config "./nextflow_PE.config"
+                process{
+                    """
+                    input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    [
+                        file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
+                        file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
+                    ]
+                    ]
+                    """
+                }
+            }
+
+            run("BWA_INDEX"){
+                script "../../bwa/index/main.nf"
+                process{
+                    """
+                    input[0] = [
+                            [ id:'sarscov2'],
+                            file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                            ]
+                    """
+                }
+            }
+            run("BWA_MEM"){
+                script "../../bwa/mem/main.nf"
+                process{
+                    """
+                    input[0] = UMITOOLS_EXTRACT.out.reads
+                    input[1] = BWA_INDEX.out.index
+                    input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]
+                    input[3] = true
+                    """
+                }
+            }
+            run("SAMTOOLS_INDEX"){
+                script "../../samtools/index/main.nf"
+                process{
+                    """
+                    input[0] = BWA_MEM.out.bam
+                    """
+                }
+            }
+        }
+        when {
+            config "./nextflow_PE.config"
+            process {
+                """
+                input[0] =  BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0])
+                input[1] = 'bam'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
\ No newline at end of file
diff --git a/modules/nf-core/umicollapse/tests/main.nf.test.snap b/modules/nf-core/umicollapse/tests/main.nf.test.snap
new file mode 100644
index 0000000..861e9ca
--- /dev/null
+++ b/modules/nf-core/umicollapse/tests/main.nf.test.snap
@@ -0,0 +1,124 @@
+{
+    "umicollapse single end test": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "single_end": true
+                    },
+                    "test.dedup.bam:md5,05c5331185263cbee6f508c0669be864"
+                ]
+            ],
+            [
+                "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-03-14T13:41:23.869211282"
+    },
+    "umicollapse fastq tests": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "single_end": true
+                    },
+                    "test.dedup.fastq.gz:md5,c9bac08c7fd8df3e0203e3eeafc73155"
+                ]
+            ],
+            [
+                "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-01-30T10:45:56.053352008"
+    },
+    "umicollapse stub tests": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.dedup.dedup.bam:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.dedup_UMICollapse.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "3": [
+                    "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996"
+                ],
+                "bam": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.dedup.dedup.bam:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "fastq": [
+                    
+                ],
+                "log": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.dedup_UMICollapse.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-01-30T10:46:12.482697713"
+    },
+    "umicollapse paired tests": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "single_end": false
+                    },
+                    "test.dedup.bam:md5,f4f05467cb456309fe22851d8b4d4387"
+                ]
+            ],
+            [
+                "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-03-14T13:41:54.486079388"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/umicollapse/tests/nextflow.config b/modules/nf-core/umicollapse/tests/nextflow.config
new file mode 100644
index 0000000..844edbd
--- /dev/null
+++ b/modules/nf-core/umicollapse/tests/nextflow.config
@@ -0,0 +1,8 @@
+process {
+    withName: UMITOOLS_EXTRACT {
+        ext.args = '--bc-pattern="NNNN"'
+    }
+    withName: UMICOLLAPSE {
+        ext.prefix = { "${meta.id}.dedup" }
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/umicollapse/tests/nextflow_PE.config b/modules/nf-core/umicollapse/tests/nextflow_PE.config
new file mode 100644
index 0000000..ae4c963
--- /dev/null
+++ b/modules/nf-core/umicollapse/tests/nextflow_PE.config
@@ -0,0 +1,10 @@
+process {
+
+    withName: UMITOOLS_EXTRACT {
+        ext.args = '--bc-pattern="NNNN" --bc-pattern2="NNNN"'
+    }
+
+    withName: UMICOLLAPSE {
+        ext.prefix = { "${meta.id}.dedup" }
+    }
+}
diff --git a/modules/nf-core/umicollapse/tests/nextflow_SE.config b/modules/nf-core/umicollapse/tests/nextflow_SE.config
new file mode 100644
index 0000000..d4b9443
--- /dev/null
+++ b/modules/nf-core/umicollapse/tests/nextflow_SE.config
@@ -0,0 +1,10 @@
+process {
+
+    withName: UMITOOLS_EXTRACT {
+        ext.args = '--bc-pattern="NNNN"'
+    }
+
+    withName: UMICOLLAPSE {
+        ext.prefix = { "${meta.id}.dedup" }
+    }
+}
diff --git a/modules/nf-core/umicollapse/tests/tags.yml b/modules/nf-core/umicollapse/tests/tags.yml
new file mode 100644
index 0000000..912879c
--- /dev/null
+++ b/modules/nf-core/umicollapse/tests/tags.yml
@@ -0,0 +1,2 @@
+umicollapse:
+  - "modules/nf-core/umicollapse/**"
diff --git a/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf b/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf
index 0cb71f3..58ad0b6 100644
--- a/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf
+++ b/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf
@@ -2,7 +2,7 @@
 * UMIcollapse, index BAM file and run samtools stats, flagstat and idxstats
 */
 
-include { UMICOLLAPSE        } from '../../../modules/goodwright/umicollapse/main'
+include { UMICOLLAPSE        } from '../../../modules/nf-core/umicollapse/main'   
 include { SAMTOOLS_INDEX     } from '../../../modules/nf-core/samtools/index/main'
 
 workflow BAM_DEDUP_SAMTOOLS_UMITOOLS {
@@ -16,7 +16,8 @@ workflow BAM_DEDUP_SAMTOOLS_UMITOOLS {
     * MODULE: UMI-tools collapse
     */
     UMICOLLAPSE ( 
-        bam_bai 
+        bam_bai,
+        'bam' 
     )
     ch_versions = ch_versions.mix(UMICOLLAPSE.out.versions)
 
diff --git a/subworkflows/goodwright/icount_analyse/main.nf b/subworkflows/goodwright/icount_analyse/main.nf
index 2ec3ff6..d2fabd6 100644
--- a/subworkflows/goodwright/icount_analyse/main.nf
+++ b/subworkflows/goodwright/icount_analyse/main.nf
@@ -12,7 +12,7 @@ include { ICOUNT_SIGXLS           } from '../../../modules/goodwright/icount/sig
 include { ICOUNT_PEAKS            } from '../../../modules/goodwright/icount/peaks/main.nf'
 include { GUNZIP as GUNZIP_SIGXLS } from '../../../modules/nf-core/gunzip/main.nf'
 include { GUNZIP as GUNZIP_PEAKS  } from '../../../modules/nf-core/gunzip/main.nf'
-include { MERGE_SUMMARY           } from '../../../modules/goodwright/clipseq/merge_summary/main.nf'
+include { MERGE_SUMMARY           } from '../../../modules/local/clipseq/merge_summary/main.nf'
 
 workflow ICOUNT_ANALYSE {
     take:

From 1148dc67e3a387ae228650fdd57bcaf1dd50a856 Mon Sep 17 00:00:00 2001
From: Sam Ireland <samirelanduk@users.noreply.github.com>
Date: Mon, 2 Dec 2024 02:50:40 +0000
Subject: [PATCH 22/23] 1.2 clipseq.json upgrade

---
 schema/clipseq.json | 218 +++++++++++++++++++++++---------------------
 1 file changed, 115 insertions(+), 103 deletions(-)

diff --git a/schema/clipseq.json b/schema/clipseq.json
index 2ca3536..0fe828e 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -1,275 +1,292 @@
 {
-    "inputs": {
-        "sample_options": {
+    "inputs": [
+        {
             "name": "Sample options",
             "description": "Parameters relating to the sample being analysed.",
             "advanced": false,
-            "properties": {
+            "params": {
                 "samplesheet": {
                     "name": "Samples",
-                    "type": "sample",
-                    "pattern": "csv|xlsx",
-                    "required": true,
-                    "categories": ["CLIP"],
                     "description": "The samples to process.",
-                    "csv": {
-                        "group": {
-                            "property": "",
-                            "user_override": true,
-                            "required": true
+                    "type": "csv",
+                    "required": true,
+                    "takes_samples": true,
+                    "sample_types": ["CLIP"],
+                    "columns": [
+                        {
+                            "name": "group",
+                            "type": "string",
+                            "required": true,
+                            "render": true
                         },
-                        "replicate": {
-                            "property": "",
-                            "user_override": true,
-                            "required": true
+                        {
+                            "name": "replicate",
+                            "type": "string",
+                            "required": true,
+                            "render": true
                         },
-                        "fastq_1": {
-                            "property": "input.1",
-                            "user_override": false
+                        {
+                            "name": "fastq_1",
+                            "type": "data",
+                            "from_sample": 1,
+                            "required": true,
+                            "render": false
                         },
-                        "fastq_2": {
-                            "property": "input.2",
-                            "user_override": false
+                        {
+                            "name": "fastq_2",
+                            "type": "data",
+                            "from_sample": 2,
+                            "required": false,
+                            "render": false
                         }
-                    }
+                    ]
                 }
             }
         },
-        "genome_options": {
+        {
             "name": "Genome options",
             "description": "The genome being aligned to.",
             "advanced": false,
-            "takes_genome": true,
-            "properties": {
+            "from_execution": true,
+            "params": {
                 "fasta": {
-                    "name": "Genome FASTA",
-                    "type": "file",
-                    "pattern": "fasta|fa$",
+                    "name": "FASTA",
+                    "type": "data",
+                    "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
                     "required": true,
-                    "genome_file": "fasta",
+                    "execution_output": {
+                        "process": null,
+                        "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$"
+                    },
                     "description": "A raw genome FASTA file."
                 },
                 "gtf": {
                     "name": "GTF",
-                    "type": "file",
-                    "pattern": "gtf",
+                    "type": "data",
+                    "pattern": "^\\S+\\.gtf(\\.gz)?$",
                     "required": true,
-                    "genome_file": "gtf",
+                    "execution_output": {
+                        "process": null,
+                        "pattern": "^\\S+\\.gtf(\\.gz)?$"
+                    },
                     "description": "An annotation for the genome."
                 },
                 "smrna_fasta": {
                     "name": "smRNA FASTA",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "fasta|fa$",
                     "required": true,
-                    "genome_file": "fasta|fa$",
+                    "execution_output": {
+                        "process": null,
+                        "pattern": "(smrna|trna)\\.(fasta|fa)$"
+                    },
                     "description": "FASTA file to be mapped to before the genome file, typically containing rRNA and tRNA sequences."
                 },
                 "fasta_fai": {
                     "name": "Genome FASTA index",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "fai$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "PREPARE_PRIMARY_GENOME:CUSTOM_GETCHROMSIZES",
-                        "filetype": "fai"
+                        "pattern": "\\.fai$"
                     },
                     "description": "A genome FASTA file index generated by Samtools faidx."
                 },
                 "chrom_sizes": {
                     "name": "Genome chromosome lengths",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "sizes$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "PREPARE_PRIMARY_GENOME:CUSTOM_GETCHROMSIZES",
-                        "filetype": "sizes"
+                        "pattern": "\\.sizes$"
                     },
                     "description": "A tabulated file of chromosome names and lengths."
                 },
                 "target_genome_index": {
                     "name": "Genome STAR index",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "STAR_GENOMEGENERATE",
-                        "filetype": ""
+                        "pattern": ""
                     },
                     "description": "A genome index generated by STAR."
                 },
                 "smrna_genome_index": {
                     "name": "Small RNA Bowtie index",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "BOWTIE_BUILD",
-                        "filetype": ""
+                        "pattern": ""
                     },
                     "description": "A small RNA index for pre-mapping generated by Bowtie."
                 },
                 "smrna_fasta_fai": {
                     "name": "Small RNA FASTA index",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "fai$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "PREPARE_SMRNA_GENOME:CUSTOM_GETCHROMSIZES",
-                        "filetype": "fai"
+                        "pattern": "\\.fai$"
                     },
                     "description": "A small RNA FASTA file index generated by Samtools faidx."
                 },
                 "smrna_chrom_sizes": {
                     "name": "Small RNA lengths",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "sizes$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "PREPARE_SMRNA_GENOME:CUSTOM_GETCHROMSIZES",
-                        "filetype": "sizes"
+                        "pattern": "\\.sizes$"
                     },
                     "description": "A tabulated file of small RNA names and lengths."
                 },
                 "longest_transcript": {
                     "name": "Longest transcript IDs",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "txt$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "CLIPSEQ_FIND_LONGEST_TRANSCRIPT",
-                        "filetype": "txt"
+                        "pattern": "\\.txt$"
                     },
                     "description": "A list of transcript IDs for the longest transcript for each gene in provided GTF annotation."
                 },
                 "longest_transcript_fai": {
                     "name": "Longest transcript IDs and lengths",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "fai$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "CLIPSEQ_FIND_LONGEST_TRANSCRIPT",
-                        "filetype": "fai"
+                        "pattern": "\\.fai$"
                     },
                     "description": "A tabulated file of transcript IDs and lengths for the longest transcript for each gene in provided GTF annotation."
                 },
                 "longest_transcript_gtf": {
                     "name": "Longest transcript IDs",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "CLIPSEQ_FIND_LONGEST_TRANSCRIPT",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "A list of transcript IDs for the longest transcript for each gene in provided GTF annotation."
                 },
                 "filtered_gtf": {
                     "name": "Filtered GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "CLIPSEQ_FILTER_GTF",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "GTF filtered for 'basic' transcript tag and support levels TSL1 and TSL2 to improve performance of downstream tools."
                 },
                 "seg_gtf": {
                     "name": "Segmented GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "ICOUNT_SEG_GTF",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "GTF segmented for use in iCount peak calling using iCount segment command."
                 },
                 "seg_filt_gtf": {
                     "name": "Segmented filtered GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "ICOUNT_SEG_FILTGTF",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "Filtered GTF segmented for use in iCount peak calling using iCount segment command."
                 },
                 "seg_resolved_gtf": {
                     "name": "Segmented resolved filtered GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "RESOLVE_UNANNOTATED",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "Filtered GTF segmented using iCount segment command and then resolve unannotated regions by overlapping transcript segments."
                 },
                 "seg_resolved_gtf_genic": {
                     "name": "Segmented resolved genic filtered GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "RESOLVE_UNANNOTATED_GENIC_OTHER",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "Filtered GTF segmented using iCount segment command and then resolve unannotated regions by annotating as 'genic other'."
                 },
                 "regions_gtf": {
                     "name": "Regions GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf.gz$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "ICOUNT_SEG_GTF",
-                        "filetype": "gtf.gz"
+                        "pattern": "\\.gtf.gz$"
                     },
                     "description": "GTF regions for use in PEKA using iCount segment command."
                 },
                 "regions_filt_gtf": {
                     "name": "Filtered regions GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf.gz$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "ICOUNT_SEG_FILTGTF",
-                        "filetype": "gtf.gz"
+                        "pattern": "\\.gtf.gz$"
                     },
                     "description": "Filtered GTF regions for use in PEKA using iCount segment command."
                 },
                 "regions_resolved_gtf": {
                     "name": "Filtered resolved regions GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "RESOLVE_UNANNOTATED_REGIONS",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "Filtered GTF regions using iCount segment command and then resolve unannotated regions by overlapping transcript segments."
                 },
                 "regions_resolved_gtf_genic": {
                     "name": "Filtered resolved regions genic GTF",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "gtf$",
                     "required": false,
-                    "genome_output": {
+                    "execution_output": {
                         "process": "RESOLVE_UNANNOTATED_GENIC_OTHER_REGIONS",
-                        "filetype": "gtf"
+                        "pattern": "\\.gtf$"
                     },
                     "description": "Filtered GTF regions using iCount segment command and then resolve unannotated regions by annotating as 'genic other'."
                 }
             }
         },
-        "pipeline_options": {
+        {
             "name": "Settings",
             "description": "Additional pipeline configuration options.",
             "advanced": true,
-            "properties": {
+            "params": {
                 "move_umi_to_header": {
                     "name": "Extract UMI to header",
                     "description": "Runs UMI to header extraction based on the head format provided in UMI header format.",
@@ -298,24 +315,19 @@
                 }
             }
         },
-        "cli_options": {
+        {
             "name": "Command Line Options",
-            "description": "Provide customised command line options to specific processes",
+            "description": "Provide customised command line options to specific processes",
             "advanced": true,
-            "properties": {
-                "trimgalore_params": {
-                    "name": "Trim Galore! parameters",
-                    "description": "Parameters for Trim Galore! trimming. Defaults are -q 20 and minimum length 10 to keep reads.",
-                    "type": "string"
-                },
+            "params": {
                 "bowtie_params": {
                     "name": "Bowtie parameters",
-                    "description": "Mapping parameters for Bowtie pre-mapping. Do not touch unless you know what you are doing!",
+                    "description": "Mapping parameters for Bowtie pre-mapping",
                     "type": "string"
                 },
                 "star_params": {
                     "name": "STAR parameters",
-                    "description": "Mapping parameters for STAR mapping. Do not touch unless you know what you are doing!",
+                    "description": "Mapping parameters for STAR mapping",
                     "type": "string"
                 },
                 "clippy_params": {
@@ -335,7 +347,7 @@
                 }
             }
         }
-    },
+    ],
     "outputs": [
         {
             "name": "Normalised genome crosslink bedgraph",
@@ -353,7 +365,7 @@
             "name": "Crosslink summary",
             "description": "Crosslinks summarised by gene, type (eg. CDS, intron) and subtype (eg. lncRNA, mRNA).",
             "filetype": "tsv",
-            "process": "MERGE_SUMMARY"
+            "process": "ICOUNT_SUMMARY"
         },
         {
             "name": "K-mer enrichment",

From 94cf9a062d1d3c71af3578168af23dc8abe340a4 Mon Sep 17 00:00:00 2001
From: Sam Ireland <samirelanduk@users.noreply.github.com>
Date: Mon, 2 Dec 2024 02:51:15 +0000
Subject: [PATCH 23/23] 1.2 prepare_genome.json upgrade

---
 schema/prepare_genome.json | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/schema/prepare_genome.json b/schema/prepare_genome.json
index a861593..c197887 100644
--- a/schema/prepare_genome.json
+++ b/schema/prepare_genome.json
@@ -1,37 +1,38 @@
 {
-    "inputs": {
-        "genome_options": {
+    "inputs": [
+        {
             "name": "Genome options",
             "description": "The genome being aligned to.",
-            "takes_genome": true,
-            "properties": {
+            "from_fileset": true,
+            "fileset_requires_organism": true,
+            "params": {
                 "fasta": {
                     "name": "Genome FASTA",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "fasta|fa$",
                     "required": true,
-                    "genome_file": "fasta",
+                    "fileset_pattern": "fasta",
                     "description": "A raw genome FASTA file."
                 },
                 "gtf": {
                     "name": "GTF",
-                    "type": "file",
-                    "pattern": "gtf",
+                    "type": "data",
+                    "pattern": "gtf$",
                     "required": true,
-                    "genome_file": "gtf",
+                    "fileset_pattern": "gtf$",
                     "description": "An annotation for the genome."
                 },
                 "smrna_fasta": {
                     "name": "smRNA FASTA",
-                    "type": "file",
+                    "type": "data",
                     "pattern": "fasta|fa$",
                     "required": true,
-                    "genome_file": "fasta|fa$",
+                    "fileset_pattern": "(smrna|trna)\\.(fasta|fa)$",
                     "description": "FASTA file to be mapped to before the genome file, typically containing rRNA and tRNA sequences."
                 }
             }
         }
-    },
+    ],
     "outputs": [
         {
             "name": "Genome STAR index",