From 19744b6ca9267754b9ffc2cab567ccdec6739a15 Mon Sep 17 00:00:00 2001
From: Chhabria <chhabrs1@iscb011.mskcc.org>
Date: Thu, 20 Mar 2025 17:50:58 -0400
Subject: [PATCH 01/11] MultiQC changes

---
 assets/multiqc_config.yml                | 223 ++++++++++++++++++++++-
 conf/modules.config                      |  17 +-
 conf/test.config                         |  52 ++++--
 modules/local/clipqc/main.nf             |   4 +
 modules/local/clipqc/templates/clipqc.py |  97 ++++++++--
 nextflow.config                          |   1 +
 workflows/clipseq.nf                     |  73 ++++++--
 7 files changed, 398 insertions(+), 69 deletions(-)

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index ca20bdd8..e6d43047 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -42,19 +42,226 @@ module_order:
   - star:
       name: "STAR Genome Alignment"
       path_filters:
-        - "*.final.out"
+        - "*.final.out"   
   - custom_content
 
+
+# Other MultiQC config stuff here
+custom_data:
+    mapping:
+        parent_id: mapping
+        parent_name: 'Mapping'
+        file_format: 'tsv'
+        section_name: 'Mapping'
+        description: 'The mapping metrics for each experiment'
+        plot_type: 'bargraph'
+    dedup_reads:
+        parent_id: dedup
+        parent_name: 'Deduplication'
+        file_format: 'tsv'
+        section_name: 'Reads'
+        description: 'The number of reads before and after PCR deduplication for each experiment'
+        plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Count'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+    dedup_ratio:
+        parent_id: dedup
+        parent_name: 'Deduplication'
+        file_format: 'tsv'
+        section_name: 'Ratio'
+        description: 'The PCR deduplication ratio for each experiment'
+        plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Ratio'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+    dedup_mean_umis:
+        parent_id: dedup
+        parent_name: 'Deduplication'
+        file_format: 'tsv'
+        section_name: 'Mean UMIs'
+        description: 'Mean number of unique UMIs per position for each experiment'
+        plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Mean number'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+    crosslinks_counts:
+        parent_id: crosslinks
+        parent_name: 'Crosslinks'
+        file_format: 'tsv'
+        section_name: 'Counts'
+        description: 'The number of crosslinks or crosslink sites for each experiment'
+        plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Count'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+    crosslinks_ratio:
+        parent_id: crosslinks
+        parent_name: 'Crosslinks'
+        file_format: 'tsv'
+        section_name: 'Ratios'
+        description: 'The ratio of number of cDNA mapping to crosslink positions for each experiment'
+        #plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Count'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+            tt_decimals: 2
+    peaks_counts:
+        parent_id: peaks
+        parent_name: 'Peaks'
+        file_format: 'tsv'
+        section_name: 'Counts'
+        description: 'The total number of peaks called by each peak caller'
+        plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Number of peaks'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+    xlinks_in_peaks:
+        parent_id: peaks
+        parent_name: 'Peaks'
+        file_format: 'tsv'
+        section_name: 'Crosslinks positions in peaks'
+        description: 'The total percentage of crosslinks within peaks for each peak caller'
+        #plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Percentage of crosslinks'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+            tt_decimals: 2
+            tt_suffix: '%'
+    xlinksites_in_peaks:
+        parent_id: peaks
+        parent_name: 'Peaks'
+        file_format: 'tsv'
+        section_name: 'Crosslinks positions in peaks'
+        description: 'The total percentage of crosslink sites within peaks for each peak caller'
+        #plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Percentage of crosslink sites'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+            tt_decimals: 2
+            tt_suffix: '%'
+    peaks_xlinksite_coverage:
+        parent_id: peaks
+        parent_name: 'Peaks'
+        file_format: 'tsv'
+        section_name: 'Peak-crosslink coverage'
+        description: 'The total percentage of nucleotides within peaks covered by a crosslink site'
+        plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Percentage of nucleotides within peaks'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+            tt_decimals: 2
+            tt_suffix: '%'
+    summary_type:
+        parent_id: Summary
+        parent_name: 'Summary'
+        file_format: 'tsv'
+        section_name: 'Percentage of cDNA premap'
+        description: 'The total percentage of cDNA summary mapped'
+        #plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Type'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+    summary_subtype:
+        parent_id: Summary
+        parent_name: 'Summary'
+        file_format: 'tsv'
+        section_name: 'Percentage of cDNA premap subtypes'
+        description: 'The total percentage of cDNA subtypes mapped'
+        #plot_type: 'bargraph'
+        pconfig:
+            ylab: 'Type'
+            #stacking: False
+            cpswitch: False
+            tt_percentages: False
+
+        
+sp:
+    mapping:
+        fn: 'mapping.tsv'
+    dedup_reads:
+        fn: 'dedup_reads.tsv'
+    dedup_ratio:
+        fn: 'dedup_ratio.tsv'
+    dedup_mean_umis:
+        fn: 'dedup_mean_umis.tsv'
+    crosslinks_counts:
+        fn: 'xlinks_counts.tsv'
+    crosslinks_ratio:
+        fn: 'xlinks_ratio.tsv'
+    peaks:
+        fn: 'total_peaks.tsv'
+    xlinks_in_peaks:
+        fn: 'xlinks_in_peaks.tsv'
+    xlinksites_in_peaks:
+        fn: 'xlinksites_in_peaks.tsv'
+    peaks_xlinksite_coverage:
+        fn: 'peaks_xlinksite_coverage.tsv'
+    summary_type:
+        fn: 'summary_type_metrics.tsv'
+    summary_subtype:
+        fn: '"summary_subtype_metrics.tsv'
+
+
+sp:
+    mapping:
+        fn: 'mapping.tsv'
+    dedup_reads:
+        fn: 'dedup_reads.tsv'
+    dedup_ratio:
+        fn: 'dedup_ratio.tsv'
+    dedup_mean_umis:
+        fn: 'dedup_mean_umis.tsv'
+    crosslinks_counts:
+        fn: 'xlinks_counts.tsv'
+    crosslinks_ratio:
+        fn: 'xlinks_ratio.tsv'
+    peaks:
+        fn: 'total_peaks.tsv'
+    xlinks_in_peaks:
+        fn: 'xlinks_in_peaks.tsv'
+    xlinksites_in_peaks:
+        fn: 'xlinksites_in_peaks.tsv'
+    peaks_xlinksite_coverage:
+        fn: 'peaks_xlinksite_coverage.tsv'
+    summary_type:
+        fn: 'summary_type_metrics.tsv'
+    summary_subtype:
+        fn: 'summary_subtype_metrics.tsv'
+
 custom_content:
   order:
+    - clipqc
     - software-versions-by-process
     - software-versions-unique
 
 # Customise the module search patterns to speed up execution time
-sp:
-  samtools/stats:
-    fn: "*.stats"
-  samtools/flagstat:
-    fn: "*.flagstat"
-  samtools/idxstats:
-    fn: "*.idxstats*"
+# sp:
+#   samtools/stats:
+#     fn: "*.stats"
+#   samtools/flagstat:
+#     fn: "*.flagstat"
+#   samtools/idxstats:
+#     fn: "*.idxstats*"
+#   clipqc:
+#     fn: "*.txt"
diff --git a/conf/modules.config b/conf/modules.config
index f5928fd6..a787b7c6 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -34,6 +34,7 @@ process {
 ========================================================================================
 */
 
+
 if(params.run_genome_prep) {
     process {
         withName: '.*PREPARE_GENOME:GUNZIP_.*' {
@@ -71,7 +72,6 @@ if(params.run_genome_prep) {
                 path: { "${params.outdir}/00_genome" },
                 mode: "${params.publish_dir_mode}",
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-                enabled: params.save_reference
             ]
         }
 
@@ -185,6 +185,7 @@ if(params.run_genome_prep) {
     }
 }
 
+
 /*
 ========================================================================================
     PRE-PROCESSING
@@ -1004,13 +1005,13 @@ if(params.run_reporting) {
             ]
         }
 
-        // withName: 'CLIPSEQ:CLIPSEQ_CLIPQC' {
-        //     publishDir = [
-        //         path: { "${params.outdir}/06_reports/clipqc" },
-        //         mode: "${params.publish_dir_mode}",
-        //         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        //     ]
-        // }
+         withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CLIPQC' {
+             publishDir = [
+                 path: { "${params.outdir}/06_reports/clipqc" },
+                 mode: "${params.publish_dir_mode}",
+                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+             ]
+         }
 
         withName: 'NFCORE_CLIPSEQ:CLIPSEQ:MULTIQC' {
             ext.args   = params.multiqc_title ? "-v --title \"$params.multiqc_title\"" : '-v'
diff --git a/conf/test.config b/conf/test.config
index 0359cedb..42e07573 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -20,29 +20,37 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input  = '/Users/capitac/repos/nfcore-clipseq/tests/test_new_samplesheet_FASTQ.csv'
+    //input  = '/Users/capitac/repos/nfcore-clipseq/tests/test_new_samplesheet_FASTQ.csv'
+    input  = "./tests/test_new_samplesheet_FASTQ.csv"
+    fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz"
+    gtf   = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz"
+    ncrna_fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.smrna.fasta"
     source = "fastq"
 
     // Genome references
-    fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz"
-    ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz"
-    gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.gtf.gz"
-    genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/star.tar.gz"
-    ncrna_genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/bowtie.tar.gz"
-    genome_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.sizes"
-    ncrna_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.sizes"
-    longest_transcript = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.txt"
-    longest_transcript_fai = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.fai"
-    longest_transcript_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.gtf"
-    filtered_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered.gtf"
-    seg_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_seg.gtf"
-    seg_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg.gtf"
-    regions_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_regions.gtf.gz"
-    regions_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions.gtf.gz"
-    seg_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOtherfalse.resolved.gtf"
-    regions_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOtherfalse.resolved.gtf"
-    seg_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOthertrue.resolved.gtf"
-    regions_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOthertrue.resolved.gtf"
+    //fasta         = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz'
+    //gtf           = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz'
+
+    // Genome references
+    //fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz"
+    //ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz"
+    //gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.gtf.gz"
+    //genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/star.tar.gz"
+    //ncrna_genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/bowtie.tar.gz"
+    //genome_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.sizes"
+    //ncrna_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.sizes"
+    //longest_transcript = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.txt"
+    //longest_transcript_fai = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.fai"
+    //longest_transcript_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.gtf"
+    //filtered_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered.gtf"
+    //seg_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_seg.gtf"
+    //seg_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg.gtf"
+    //regions_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_regions.gtf.gz"
+    //regions_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions.gtf.gz"
+    //seg_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOtherfalse.resolved.gtf"
+    //regions_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOtherfalse.resolved.gtf"
+    //seg_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOthertrue.resolved.gtf"
+    //regions_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOthertrue.resolved.gtf"
 
     // Logic
     debug                 = true
@@ -56,4 +64,8 @@ params {
 
     // Pipeline params
     umitools_bc_pattern = 'NNNNNNNNN'
+
+    // Don't call consensus
+    //consensus_peak        = false
+
 }
diff --git a/modules/local/clipqc/main.nf b/modules/local/clipqc/main.nf
index 3c9e81e7..0fc14820 100644
--- a/modules/local/clipqc/main.nf
+++ b/modules/local/clipqc/main.nf
@@ -12,6 +12,10 @@ process CLIPQC {
     path("icount/*")
     path("paraclu/*")
     path("clippy/*")
+    path("pureclip/*")
+    path("summary_type/*")
+    path("summary_subtype/*")
+    path("summary_gene/*")
 
     output:
     path "*.tsv"         , emit: tsv
diff --git a/modules/local/clipqc/templates/clipqc.py b/modules/local/clipqc/templates/clipqc.py
index f265983f..41d1b7b6 100644
--- a/modules/local/clipqc/templates/clipqc.py
+++ b/modules/local/clipqc/templates/clipqc.py
@@ -49,6 +49,8 @@ def main(process_name):
             ncrna["ncrna_reads"].append(input_reads - output_reads)
 
     ncrna_df = pd.DataFrame(ncrna)
+    print(ncrna_df)
+
 
     # Next get STAR logs
     star_logs = sorted(["mapped/" + f for f in os.listdir("mapped") if f.endswith(".Log.final.out")])
@@ -96,7 +98,7 @@ def main(process_name):
 
     for dedup_log in dedup_logs:
         with open(dedup_log, "r") as logfile:
-            exp = re.sub(".log", "", os.path.basename(dedup_log))
+            exp = re.sub(".genomeUnique.dedup_UMICollapse.log", "", os.path.basename(dedup_log))
 
             lines = logfile.readlines()
 
@@ -149,26 +151,26 @@ def read_bed(filename):
     # First get xlink bed files
     xlinks_files = sorted(["xlinks/" + f for f in os.listdir("xlinks") if f.endswith(".bed")])
 
-    xlinks = dict((key, []) for key in ["exp", "total_xlinks", "total_xlinksites", "ratio"])
+    xlinks = dict((key, []) for key in ["exp", "number_of_cDNAs", "total_positions_of_crosslinks", "ratio"])
 
     for xlinks_file in xlinks_files:
         xlinks_df = read_bed(xlinks_file)
 
-        exp = re.sub(".bed", "", os.path.basename(xlinks_file))
-        total_xlinks = xlinks_df["score"].sum()
-        total_xlinksites = xlinks_df.shape[0]
-        ratio = total_xlinks / total_xlinksites
+        exp = re.sub(".genome.xl.bed", "", os.path.basename(xlinks_file))
+        number_of_cDNAs = xlinks_df["score"].sum()
+        total_positions_of_crosslinks = xlinks_df.shape[0]
+        ratio = number_of_cDNAs / total_positions_of_crosslinks
 
         xlinks["exp"].append(exp)
-        xlinks["total_xlinks"].append(total_xlinks)
-        xlinks["total_xlinksites"].append(total_xlinksites)
-        xlinks["ratio"].append(round(total_xlinks / total_xlinksites, 2))
+        xlinks["number_of_cDNAs"].append(number_of_cDNAs)
+        xlinks["total_positions_of_crosslinks"].append(total_positions_of_crosslinks)
+        xlinks["ratio"].append(round(number_of_cDNAs / total_positions_of_crosslinks, 2))
 
     xlinks_metrics_df = pd.DataFrame(xlinks)
     xlinks_metrics_df.to_csv("xlinks_metrics.tsv", sep="\t", index=False)
 
     # Subset for MultiQC plots
-    xlinks_metrics_df.loc[:, ["exp", "total_xlinks", "total_xlinksites"]].to_csv(
+    xlinks_metrics_df.loc[:, ["exp", "number_of_cDNAs", "total_positions_of_crosslinks"]].to_csv(
         "xlinks_counts.tsv", sep="\t", index=False
     )
     xlinks_metrics_df.loc[:, ["exp", "ratio"]].to_csv("xlinks_ratio.tsv", sep="\t", index=False)
@@ -176,11 +178,74 @@ def read_bed(filename):
     print(xlinks_metrics_df)
     print("\n\n")
 
+    # ==========
+    # Summary cDNA
+    # ==========
+    summary_type_files = sorted(["summary_type/" + f for f in os.listdir("summary_type") if f.endswith(".tsv")])
+    #summary_data = dict((key, []) for key in ["exp", "Type", "Length", "Total_number_cDNA", "perc_number_cDNA"])
+    #summary_data = dict((key, []) for key in ["exp", "Type", "perc_number_cDNA"])
+    summary_data = []
+
+    for summary_type_file in summary_type_files:
+           
+            exp = re.sub(".summary_type_premapadjusted.tsv", "", os.path.basename(summary_type_file))
+            df = pd.read_csv(summary_type_file, sep='\t')
+            
+
+            exp_data = {'exp': exp}
+        
+            # Add percentage for each RNA type
+            for _, row in df.iterrows():
+                rna_type = row['Type']
+                exp_data[rna_type] = row['cDNA %']
+        
+            summary_data.append(exp_data)
+    
+
+    summary_df = pd.DataFrame(summary_data) 
+    desired_columns = ['exp', 'CDS', 'ncRNA', 'intergenic', 'premapped RNA']
+    existing_cols = [col for col in desired_columns if col in summary_df.columns]
+    summary_df = summary_df[existing_cols]
+    summary_df.to_csv("summary_type_metrics.tsv", sep="\t", index=False)
+
+   
+    # ==========
+    # Summary subtype cDNA
+    # ==========
+
+    summary_subtype_files = sorted(["summary_subtype/" + f for f in os.listdir("summary_subtype") if f.endswith(".tsv")])
+    summary_subtypedata = []
+
+    for summary_subtype_file in summary_subtype_files:
+           
+            exp = re.sub(".summary_subtype_premapadjusted.tsv", "", os.path.basename(summary_subtype_file))
+            df = pd.read_csv(summary_subtype_file, sep='\t')
+            
+
+            exp_data = {'exp': exp}
+        
+            # Add percentage for each RNA type
+            for _, row in df.iterrows():
+                rna_type = row['Subtype']
+                exp_data[rna_type] = row['cDNA %']
+        
+            summary_subtypedata.append(exp_data)
+    
+
+    summary_subtype_df = pd.DataFrame(summary_subtypedata) 
+    desired_columns = ['exp', 'CDS mRNA','ncRNA rRNA','ncRNA snRNA','ncRNA snoRNA','ncRNA tRNA','ncRNA ncRNA', 'intergenic', 'premapped RNA']
+    existing_cols = [col for col in desired_columns if col in summary_subtype_df.columns]
+    summary_subtype_df = summary_subtype_df[existing_cols]
+    summary_subtype_df.to_csv("summary_subtype_metrics.tsv", sep="\t", index=False)
+
+
+
+
     # ==========
     # Peaks
     # ==========
 
-    peakcallers = ["icount", "paraclu", "clippy"]
+    peakcallers = ["icount", "paraclu", "clippy","pureclip"]
 
     def get_peaks_metrics(peakcaller):
         peak_files = sorted([peakcaller + "/" + f for f in os.listdir(peakcaller) if f.endswith(".bed")])
@@ -209,13 +274,15 @@ def get_peaks_metrics(peakcaller):
             if peakcaller == "icount":
                 exp = re.sub(".peaks.bed", "", os.path.basename(peak_file))
             elif peakcaller == "paraclu":
-                exp = re.sub(".peaks.bed", "", os.path.basename(peak_file))
+                exp = re.sub(".genome.peaks.clustered.simplified.bed", "", os.path.basename(peak_file))
             elif peakcaller == "clippy":
                 exp = re.sub(
-                    "_rollmean.+_stdev.+_minGeneCount.+.bed",
+                    ".genome.peaks_rollmean10_minHeightAdjust1.0_minPromAdjust1.0_minGeneCount5_Peaks.bed",
                     "",
                     os.path.basename(peak_file),
                 )
+            elif peakcaller == "pureclip":
+                exp = re.sub("_pureclip.peaks.bed", "", os.path.basename(peak_file))
 
             xlinks_df = read_bed(xlinks_files[0])
             expanded_xlinks_df = xlinks_df.loc[xlinks_df.index.repeat(xlinks_df.score)].reset_index(drop=True)
@@ -287,6 +354,10 @@ def get_peaks_metrics(peakcaller):
     print(peaks_xlinksite_coverage_percent_df)
     print("\n\n")
 
+   
+
+
+
 
 if __name__ == "__main__":
     # Allows switching between nextflow templating and standalone python running using arguments
diff --git a/nextflow.config b/nextflow.config
index bba0156e..fed474d2 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -83,6 +83,7 @@ params {
 
     // Peak calling
     peakcaller                 = 'clippy,icount,paraclu,pureclip'
+    //peakcaller                 = 'clippy,paraclu,pureclip'
     clippy_params              = ''
     icount_sigxl_params        = ''
     icount_peaks_params        = ''
diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf
index 7822b4a6..3f09ed36 100644
--- a/workflows/clipseq.nf
+++ b/workflows/clipseq.nf
@@ -268,6 +268,10 @@ workflow CLIPSEQ {
         ch_genome_index               = PREPARE_GENOME.out.genome_index
         ch_ncrna_genome_index         = PREPARE_GENOME.out.ncrna_index
     }
+    
+    //ch_ncrna_genome_index.view()
+    ch_ncrna_genome_index.collect().view()
+
 
     //
     // SUBWORKFLOW: Read in samplesheet, validate, stage input files and merge replicates
@@ -285,7 +289,7 @@ workflow CLIPSEQ {
     // ch_fastq | view
 
     //
-    // SUBWORKFLOW: Extract UMI, trim and run b4 and after fastqc
+    // SUBWORKFLOW: Extract UMI, trim and run before and after fastqc
     //
     if(params.source == "fastq" & params.run_preprocessing) {
         FASTQ_FASTQC_UMITOOLS_TRIMGALORE (
@@ -384,7 +388,7 @@ workflow CLIPSEQ {
         ch_versions   = ch_versions.mix(GENOME_UNIQUE_DEDUP.out.versions)
         ch_genome_unique_dedupe_bam = GENOME_UNIQUE_DEDUP.out.bam
         ch_genome_unique_dedupe_bai = GENOME_UNIQUE_DEDUP.out.bai
-        //ch_umi_log    = GENOME_UNIQUE_DEDUP.out.umi_log
+        ch_umi_log    = GENOME_UNIQUE_DEDUP.out.umi_log
 
         GENOME_MULTI_DEDUP (
             ch_genome_multi_bam_bai
@@ -448,6 +452,10 @@ workflow CLIPSEQ {
             ch_ncrna_k1_crosslink_group_resolved_bed
         )
         ch_versions = ch_versions.mix(MERGE_SUMMARY.out.versions)
+        ch_merged_summary_type = MERGE_SUMMARY.out.summary_type_adjusted
+        ch_merged_summary_subtype = MERGE_SUMMARY.out.summary_subtype_adjusted
+        ch_merged_summary_gene = MERGE_SUMMARY.out.summary_gene_adjusted
+
 
         ICOUNTMINI_METAGENE (
             ch_genome_crosslink_group_resolved_bed,
@@ -500,6 +508,15 @@ workflow CLIPSEQ {
     ch_icountmini_sigxls            = Channel.empty()
     ch_paraclu_genome_peaks         = Channel.empty()
 
+    //
+    // PEKA CHANNELS
+    //
+    ch_peka_icountmini_peaks        = Channel.empty()
+    ch_peka_clippy_peaks            = Channel.empty()
+    ch_peka_paraclu_peaks           = Channel.empty()
+    ch_peka_pureclip_peaks          = Channel.empty()
+    
+
     if(params.run_peakcalling) {
 
         if('clippy' in callers) {
@@ -763,25 +780,35 @@ workflow CLIPSEQ {
     }
 
     if(params.run_reporting) {
-        //
+        
         // MODULE: Collect software versions
-        //
-        DUMP_SOFTWARE_VERSIONS (
-            ch_versions.unique().collectFile()
+        
+        // DUMP_SOFTWARE_VERSIONS (
+        //     ch_versions.unique().collectFile()
+        // )
+
+        
+       // MODULE: Run clipqc
+        
+        CLIPQC (
+            ch_ncrna_log.collect{ it[1] },
+            ch_genome_log.collect{ it[1] },
+            ch_umi_log.collect{ it[1] },
+            ch_genome_crosslink_group_resolved_bed.collect{ it[1] },
+            ch_icountmini_peaks.collect{ it[1] },
+            ch_paraclu_genome_peaks.collect {it[1]},
+            ch_clippy_genome_peaks.collect {it[1]},
+            ch_pureclip_genome_peaks.collect {it[1] },
+            ch_merged_summary_type.collect{it[1]},
+            ch_merged_summary_subtype.collect{it[1]},
+            ch_merged_summary_gene.collect{it[1]},
+
+            //ICOUNT_ANALYSE.out.bed_peaks.map{ it[1] },
+            //PARACLU_ANALYSE_GENOME.out.peaks.map{ it[1] },
+            //CLIPPY_GENOME.out.peaks.map{ it[1] }
         )
 
-        //
-        // MODULE: Run clipqc
-        //
-        // CLIPSEQ_CLIPQC (
-        //     ch_bt_log.map{ it[1] },
-        //     ch_star_log.map{ it[1] },
-        //     ch_umi_log.map{ it[1] },
-        //     ch_genome_crosslink_bed.map{ it[1] },
-        //     ICOUNT_ANALYSE.out.bed_peaks.map{ it[1] },
-        //     PARACLU_ANALYSE_GENOME.out.peaks.map{ it[1] },
-        //     CLIPPY_GENOME.out.peaks.map{ it[1] }
-        // )
+        //CLIPQC.out.txt.view()
 
         //
         // MODULE: Run multiqc
@@ -795,14 +822,20 @@ workflow CLIPSEQ {
         ch_multiqc_files = Channel.empty()
         ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
         ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
-        ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_yml.collect())
-        ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_unique_yml.collect())
+        //ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_yml.collect())
+        //ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_unique_yml.collect())
         
         ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(ch_ncrna_log.collect{it[1]}.ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(ch_genome_log.collect{it[1]}.ifEmpty([]))
+        ch_multiqc_files = ch_multiqc_files.mix(CLIPQC.out.tsv.collect().ifEmpty([]))
+
+        //ch_multiqc_files.collect().view()
+        ch_tmp = ch_ncrna_log.collect{it[1]}
+        ch_tmp.view()
+        //ch_genome_log.collect().view()
 
         MULTIQC (
             ch_multiqc_files.collect(),

From d5672b095bff34ef841201eda1b85e32f31b13bd Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Fri, 21 Mar 2025 15:20:54 +0000
Subject: [PATCH 02/11] linting

---
 modules/local/clipqc/templates/clipqc.py | 93 ++++++++++++------------
 nextflow.config                          |  1 -
 2 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/modules/local/clipqc/templates/clipqc.py b/modules/local/clipqc/templates/clipqc.py
index 41d1b7b6..122f714c 100644
--- a/modules/local/clipqc/templates/clipqc.py
+++ b/modules/local/clipqc/templates/clipqc.py
@@ -51,7 +51,6 @@ def main(process_name):
     ncrna_df = pd.DataFrame(ncrna)
     print(ncrna_df)
 
-
     # Next get STAR logs
     star_logs = sorted(["mapped/" + f for f in os.listdir("mapped") if f.endswith(".Log.final.out")])
 
@@ -182,70 +181,74 @@ def read_bed(filename):
     # Summary cDNA
     # ==========
     summary_type_files = sorted(["summary_type/" + f for f in os.listdir("summary_type") if f.endswith(".tsv")])
-    #summary_data = dict((key, []) for key in ["exp", "Type", "Length", "Total_number_cDNA", "perc_number_cDNA"])
-    #summary_data = dict((key, []) for key in ["exp", "Type", "perc_number_cDNA"])
+    # summary_data = dict((key, []) for key in ["exp", "Type", "Length", "Total_number_cDNA", "perc_number_cDNA"])
+    # summary_data = dict((key, []) for key in ["exp", "Type", "perc_number_cDNA"])
     summary_data = []
 
     for summary_type_file in summary_type_files:
-           
-            exp = re.sub(".summary_type_premapadjusted.tsv", "", os.path.basename(summary_type_file))
-            df = pd.read_csv(summary_type_file, sep='\t')
-            
-
-            exp_data = {'exp': exp}
-        
-            # Add percentage for each RNA type
-            for _, row in df.iterrows():
-                rna_type = row['Type']
-                exp_data[rna_type] = row['cDNA %']
-        
-            summary_data.append(exp_data)
-    
-
-    summary_df = pd.DataFrame(summary_data) 
-    desired_columns = ['exp', 'CDS', 'ncRNA', 'intergenic', 'premapped RNA']
+
+        exp = re.sub(".summary_type_premapadjusted.tsv", "", os.path.basename(summary_type_file))
+        df = pd.read_csv(summary_type_file, sep="\t")
+
+        exp_data = {"exp": exp}
+
+        # Add percentage for each RNA type
+        for _, row in df.iterrows():
+            rna_type = row["Type"]
+            exp_data[rna_type] = row["cDNA %"]
+
+        summary_data.append(exp_data)
+
+    summary_df = pd.DataFrame(summary_data)
+    desired_columns = ["exp", "CDS", "ncRNA", "intergenic", "premapped RNA"]
     existing_cols = [col for col in desired_columns if col in summary_df.columns]
     summary_df = summary_df[existing_cols]
     summary_df.to_csv("summary_type_metrics.tsv", sep="\t", index=False)
 
-   
     # ==========
     # Summary subtype cDNA
     # ==========
 
-    summary_subtype_files = sorted(["summary_subtype/" + f for f in os.listdir("summary_subtype") if f.endswith(".tsv")])
+    summary_subtype_files = sorted(
+        ["summary_subtype/" + f for f in os.listdir("summary_subtype") if f.endswith(".tsv")]
+    )
     summary_subtypedata = []
 
     for summary_subtype_file in summary_subtype_files:
-           
-            exp = re.sub(".summary_subtype_premapadjusted.tsv", "", os.path.basename(summary_subtype_file))
-            df = pd.read_csv(summary_subtype_file, sep='\t')
-            
-
-            exp_data = {'exp': exp}
-        
-            # Add percentage for each RNA type
-            for _, row in df.iterrows():
-                rna_type = row['Subtype']
-                exp_data[rna_type] = row['cDNA %']
-        
-            summary_subtypedata.append(exp_data)
-    
-
-    summary_subtype_df = pd.DataFrame(summary_subtypedata) 
-    desired_columns = ['exp', 'CDS mRNA','ncRNA rRNA','ncRNA snRNA','ncRNA snoRNA','ncRNA tRNA','ncRNA ncRNA', 'intergenic', 'premapped RNA']
+
+        exp = re.sub(".summary_subtype_premapadjusted.tsv", "", os.path.basename(summary_subtype_file))
+        df = pd.read_csv(summary_subtype_file, sep="\t")
+
+        exp_data = {"exp": exp}
+
+        # Add percentage for each RNA type
+        for _, row in df.iterrows():
+            rna_type = row["Subtype"]
+            exp_data[rna_type] = row["cDNA %"]
+
+        summary_subtypedata.append(exp_data)
+
+    summary_subtype_df = pd.DataFrame(summary_subtypedata)
+    desired_columns = [
+        "exp",
+        "CDS mRNA",
+        "ncRNA rRNA",
+        "ncRNA snRNA",
+        "ncRNA snoRNA",
+        "ncRNA tRNA",
+        "ncRNA ncRNA",
+        "intergenic",
+        "premapped RNA",
+    ]
     existing_cols = [col for col in desired_columns if col in summary_subtype_df.columns]
     summary_subtype_df = summary_subtype_df[existing_cols]
     summary_subtype_df.to_csv("summary_subtype_metrics.tsv", sep="\t", index=False)
 
-
-
-
     # ==========
     # Peaks
     # ==========
 
-    peakcallers = ["icount", "paraclu", "clippy","pureclip"]
+    peakcallers = ["icount", "paraclu", "clippy", "pureclip"]
 
     def get_peaks_metrics(peakcaller):
         peak_files = sorted([peakcaller + "/" + f for f in os.listdir(peakcaller) if f.endswith(".bed")])
@@ -354,10 +357,6 @@ def get_peaks_metrics(peakcaller):
     print(peaks_xlinksite_coverage_percent_df)
     print("\n\n")
 
-   
-
-
-
 
 if __name__ == "__main__":
     # Allows switching between nextflow templating and standalone python running using arguments
diff --git a/nextflow.config b/nextflow.config
index be58ebf1..9ad1278a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -83,7 +83,6 @@ params {
 
     // Peak calling
     peakcaller                 = 'clippy,icount,paraclu,pureclip'
-    //peakcaller                 = 'clippy,paraclu,pureclip'
     clippy_params              = ''
     icount_sigxl_params        = ''
     icount_peaks_params        = ''

From 264c963d60e4bf66351c149a8158c0fa3515957e Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Fri, 21 Mar 2025 15:25:02 +0000
Subject: [PATCH 03/11] linting

---
 assets/multiqc_config.yml | 372 ++++++++++++++++++--------------------
 1 file changed, 171 insertions(+), 201 deletions(-)

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index e6d43047..617e0125 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -42,219 +42,189 @@ module_order:
   - star:
       name: "STAR Genome Alignment"
       path_filters:
-        - "*.final.out"   
+        - "*.final.out"
   - custom_content
 
-
 # Other MultiQC config stuff here
 custom_data:
-    mapping:
-        parent_id: mapping
-        parent_name: 'Mapping'
-        file_format: 'tsv'
-        section_name: 'Mapping'
-        description: 'The mapping metrics for each experiment'
-        plot_type: 'bargraph'
-    dedup_reads:
-        parent_id: dedup
-        parent_name: 'Deduplication'
-        file_format: 'tsv'
-        section_name: 'Reads'
-        description: 'The number of reads before and after PCR deduplication for each experiment'
-        plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Count'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-    dedup_ratio:
-        parent_id: dedup
-        parent_name: 'Deduplication'
-        file_format: 'tsv'
-        section_name: 'Ratio'
-        description: 'The PCR deduplication ratio for each experiment'
-        plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Ratio'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-    dedup_mean_umis:
-        parent_id: dedup
-        parent_name: 'Deduplication'
-        file_format: 'tsv'
-        section_name: 'Mean UMIs'
-        description: 'Mean number of unique UMIs per position for each experiment'
-        plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Mean number'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-    crosslinks_counts:
-        parent_id: crosslinks
-        parent_name: 'Crosslinks'
-        file_format: 'tsv'
-        section_name: 'Counts'
-        description: 'The number of crosslinks or crosslink sites for each experiment'
-        plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Count'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-    crosslinks_ratio:
-        parent_id: crosslinks
-        parent_name: 'Crosslinks'
-        file_format: 'tsv'
-        section_name: 'Ratios'
-        description: 'The ratio of number of cDNA mapping to crosslink positions for each experiment'
-        #plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Count'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-            tt_decimals: 2
-    peaks_counts:
-        parent_id: peaks
-        parent_name: 'Peaks'
-        file_format: 'tsv'
-        section_name: 'Counts'
-        description: 'The total number of peaks called by each peak caller'
-        plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Number of peaks'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-    xlinks_in_peaks:
-        parent_id: peaks
-        parent_name: 'Peaks'
-        file_format: 'tsv'
-        section_name: 'Crosslinks positions in peaks'
-        description: 'The total percentage of crosslinks within peaks for each peak caller'
-        #plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Percentage of crosslinks'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-            tt_decimals: 2
-            tt_suffix: '%'
-    xlinksites_in_peaks:
-        parent_id: peaks
-        parent_name: 'Peaks'
-        file_format: 'tsv'
-        section_name: 'Crosslinks positions in peaks'
-        description: 'The total percentage of crosslink sites within peaks for each peak caller'
-        #plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Percentage of crosslink sites'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-            tt_decimals: 2
-            tt_suffix: '%'
-    peaks_xlinksite_coverage:
-        parent_id: peaks
-        parent_name: 'Peaks'
-        file_format: 'tsv'
-        section_name: 'Peak-crosslink coverage'
-        description: 'The total percentage of nucleotides within peaks covered by a crosslink site'
-        plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Percentage of nucleotides within peaks'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-            tt_decimals: 2
-            tt_suffix: '%'
-    summary_type:
-        parent_id: Summary
-        parent_name: 'Summary'
-        file_format: 'tsv'
-        section_name: 'Percentage of cDNA premap'
-        description: 'The total percentage of cDNA summary mapped'
-        #plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Type'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
-    summary_subtype:
-        parent_id: Summary
-        parent_name: 'Summary'
-        file_format: 'tsv'
-        section_name: 'Percentage of cDNA premap subtypes'
-        description: 'The total percentage of cDNA subtypes mapped'
-        #plot_type: 'bargraph'
-        pconfig:
-            ylab: 'Type'
-            #stacking: False
-            cpswitch: False
-            tt_percentages: False
+  mapping:
+    parent_id: mapping
+    parent_name: "Mapping"
+    file_format: "tsv"
+    section_name: "Mapping"
+    description: "The mapping metrics for each experiment"
+    plot_type: "bargraph"
+  dedup_reads:
+    parent_id: dedup
+    parent_name: "Deduplication"
+    file_format: "tsv"
+    section_name: "Reads"
+    description: "The number of reads before and after PCR deduplication for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Count"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  dedup_ratio:
+    parent_id: dedup
+    parent_name: "Deduplication"
+    file_format: "tsv"
+    section_name: "Ratio"
+    description: "The PCR deduplication ratio for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Ratio"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  dedup_mean_umis:
+    parent_id: dedup
+    parent_name: "Deduplication"
+    file_format: "tsv"
+    section_name: "Mean UMIs"
+    description: "Mean number of unique UMIs per position for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Mean number"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  crosslinks_counts:
+    parent_id: crosslinks
+    parent_name: "Crosslinks"
+    file_format: "tsv"
+    section_name: "Counts"
+    description: "The number of crosslinks or crosslink sites for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Count"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  crosslinks_ratio:
+    parent_id: crosslinks
+    parent_name: "Crosslinks"
+    file_format: "tsv"
+    section_name: "Ratios"
+    description: "The ratio of number of cDNA mapping to crosslink positions for each experiment"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Count"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+  peaks_counts:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Counts"
+    description: "The total number of peaks called by each peak caller"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Number of peaks"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  xlinks_in_peaks:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Crosslinks positions in peaks"
+    description: "The total percentage of crosslinks within peaks for each peak caller"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Percentage of crosslinks"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+      tt_suffix: "%"
+  xlinksites_in_peaks:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Crosslinks positions in peaks"
+    description: "The total percentage of crosslink sites within peaks for each peak caller"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Percentage of crosslink sites"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+      tt_suffix: "%"
+  peaks_xlinksite_coverage:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Peak-crosslink coverage"
+    description: "The total percentage of nucleotides within peaks covered by a crosslink site"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Percentage of nucleotides within peaks"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+      tt_suffix: "%"
+  summary_type:
+    parent_id: Summary
+    parent_name: "Summary"
+    file_format: "tsv"
+    section_name: "Percentage of cDNA premap"
+    description: "The total percentage of cDNA summary mapped"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Type"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  summary_subtype:
+    parent_id: Summary
+    parent_name: "Summary"
+    file_format: "tsv"
+    section_name: "Percentage of cDNA premap subtypes"
+    description: "The total percentage of cDNA subtypes mapped"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Type"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
 
-        
 sp:
-    mapping:
-        fn: 'mapping.tsv'
-    dedup_reads:
-        fn: 'dedup_reads.tsv'
-    dedup_ratio:
-        fn: 'dedup_ratio.tsv'
-    dedup_mean_umis:
-        fn: 'dedup_mean_umis.tsv'
-    crosslinks_counts:
-        fn: 'xlinks_counts.tsv'
-    crosslinks_ratio:
-        fn: 'xlinks_ratio.tsv'
-    peaks:
-        fn: 'total_peaks.tsv'
-    xlinks_in_peaks:
-        fn: 'xlinks_in_peaks.tsv'
-    xlinksites_in_peaks:
-        fn: 'xlinksites_in_peaks.tsv'
-    peaks_xlinksite_coverage:
-        fn: 'peaks_xlinksite_coverage.tsv'
-    summary_type:
-        fn: 'summary_type_metrics.tsv'
-    summary_subtype:
-        fn: '"summary_subtype_metrics.tsv'
-
-
-sp:
-    mapping:
-        fn: 'mapping.tsv'
-    dedup_reads:
-        fn: 'dedup_reads.tsv'
-    dedup_ratio:
-        fn: 'dedup_ratio.tsv'
-    dedup_mean_umis:
-        fn: 'dedup_mean_umis.tsv'
-    crosslinks_counts:
-        fn: 'xlinks_counts.tsv'
-    crosslinks_ratio:
-        fn: 'xlinks_ratio.tsv'
-    peaks:
-        fn: 'total_peaks.tsv'
-    xlinks_in_peaks:
-        fn: 'xlinks_in_peaks.tsv'
-    xlinksites_in_peaks:
-        fn: 'xlinksites_in_peaks.tsv'
-    peaks_xlinksite_coverage:
-        fn: 'peaks_xlinksite_coverage.tsv'
-    summary_type:
-        fn: 'summary_type_metrics.tsv'
-    summary_subtype:
-        fn: 'summary_subtype_metrics.tsv'
+  mapping:
+    fn: "mapping.tsv"
+  dedup_reads:
+    fn: "dedup_reads.tsv"
+  dedup_ratio:
+    fn: "dedup_ratio.tsv"
+  dedup_mean_umis:
+    fn: "dedup_mean_umis.tsv"
+  crosslinks_counts:
+    fn: "xlinks_counts.tsv"
+  crosslinks_ratio:
+    fn: "xlinks_ratio.tsv"
+  peaks:
+    fn: "total_peaks.tsv"
+  xlinks_in_peaks:
+    fn: "xlinks_in_peaks.tsv"
+  xlinksites_in_peaks:
+    fn: "xlinksites_in_peaks.tsv"
+  peaks_xlinksite_coverage:
+    fn: "peaks_xlinksite_coverage.tsv"
+  summary_type:
+    fn: "summary_type_metrics.tsv"
+  summary_subtype:
+    fn: '"summary_subtype_metrics.tsv'
 
 custom_content:
   order:
     - clipqc
     - software-versions-by-process
     - software-versions-unique
-
 # Customise the module search patterns to speed up execution time
 # sp:
 #   samtools/stats:

From d6ee604b2979ae83c5f96cdbd9ae6d0d16d65125 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Fri, 21 Mar 2025 15:30:02 +0000
Subject: [PATCH 04/11] remove view()

---
 workflows/clipseq.nf | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf
index 3f09ed36..d3f63760 100644
--- a/workflows/clipseq.nf
+++ b/workflows/clipseq.nf
@@ -268,9 +268,6 @@ workflow CLIPSEQ {
         ch_genome_index               = PREPARE_GENOME.out.genome_index
         ch_ncrna_genome_index         = PREPARE_GENOME.out.ncrna_index
     }
-    
-    //ch_ncrna_genome_index.view()
-    ch_ncrna_genome_index.collect().view()
 
 
     //
@@ -808,8 +805,6 @@ workflow CLIPSEQ {
             //CLIPPY_GENOME.out.peaks.map{ it[1] }
         )
 
-        //CLIPQC.out.txt.view()
-
         //
         // MODULE: Run multiqc
         //
@@ -832,10 +827,7 @@ workflow CLIPSEQ {
         ch_multiqc_files = ch_multiqc_files.mix(ch_genome_log.collect{it[1]}.ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(CLIPQC.out.tsv.collect().ifEmpty([]))
 
-        //ch_multiqc_files.collect().view()
         ch_tmp = ch_ncrna_log.collect{it[1]}
-        ch_tmp.view()
-        //ch_genome_log.collect().view()
 
         MULTIQC (
             ch_multiqc_files.collect(),

From 336066414888004b5f490e53f2f6622d70edd83c Mon Sep 17 00:00:00 2001
From: Chhabria <chhabrs1@iscb011.mskcc.org>
Date: Thu, 12 Jun 2025 15:45:49 -0400
Subject: [PATCH 05/11] deseq2 modules

---
 conf/modules.config                         |  1 +
 conf/test.config                            | 38 +++++++++----
 modules.json                                |  5 ++
 modules/local/consensus_count_table/main.nf |  2 +-
 subworkflows/local/consensus_peak_table.nf  | 61 ++++++++++++++++++++-
 subworkflows/local/prepare_genome.nf        |  4 +-
 subworkflows/local/rna_align.nf             |  2 +-
 workflows/clipseq.nf                        | 37 +++++++++----
 8 files changed, 121 insertions(+), 29 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index a787b7c6..756b7eae 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -757,6 +757,7 @@ if(params.run_crosslinking) {
 if(params.run_peakcalling && params.consensus_peak){
     process {
         withName: 'NFCORE_CLIPSEQ:CLIPSEQ:.*CONSENSUS_PEAK_TABLE:CONSENSUS_MAP' {
+            ext.prefix = { "${meta.id}_consensus_sorted" }
             publishDir = [
                 path: { "${params.outdir}/05_peakcalling/consensus_peak_tables" },
                 mode: "${params.publish_dir_mode}",
diff --git a/conf/test.config b/conf/test.config
index ff4270d2..8e44ea3e 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -15,21 +15,15 @@ params {
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
-    max_cpus   = 2
-    max_memory = '8.GB'
-    max_time   = '6.h'
+    max_cpus   = 8
+    max_memory = '60.GB'
+    max_time   = '24.h'
 
-    // Input data
+    // Inputs for testing dataset with yeast genome
     //input  = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv'
-    input  = "./tests/test_new_samplesheet_FASTQ.csv"
-    fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz"
-    gtf   = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz"
-    ncrna_fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.smrna.fasta"
-    source = "fastq"
 
-    // Genome references
-    //fasta         = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz'
-    //gtf           = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz'
+    //input    = '../tests/test_new_samplesheet_FASTQ.csv'
+    //source   = "fastq"
 
     // Genome references
     //fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz"
@@ -52,6 +46,20 @@ params {
     //seg_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOthertrue.resolved.gtf"
     //regions_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOthertrue.resolved.gtf"
 
+
+    // Input data for full human testing
+    input  = "./tests/test_new_samplesheet_FASTQ_human.csv"
+    fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz"
+    gtf   = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz"
+    ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz"
+    source = "fastq"
+
+    // Genome references from s3 bucket
+    //fasta         = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz'
+    //gtf           = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz'
+    
+
+
     // Logic
     debug                 = true
     save_reference        = true
@@ -62,10 +70,16 @@ params {
     save_align_intermed   = true
     skip_transcriptome    = true
 
+    // Inputs for deseq2_qc
+    skip_deseq2_qc        = false
+    
+
     // Pipeline params
     umitools_bc_pattern = 'NNNNNNNNN'
 
     // Don't call consensus
     //consensus_peak        = false
 
+    
+
 }
diff --git a/modules.json b/modules.json
index 00019c28..ad826ef1 100644
--- a/modules.json
+++ b/modules.json
@@ -70,6 +70,11 @@
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
                     },
+                    "deseq2/differential": {
+                        "branch": "master",
+                        "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
+                        "installed_by": ["modules"]
+                    },
                     "fastqc": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
diff --git a/modules/local/consensus_count_table/main.nf b/modules/local/consensus_count_table/main.nf
index 803f5af4..0d5e5b24 100644
--- a/modules/local/consensus_count_table/main.nf
+++ b/modules/local/consensus_count_table/main.nf
@@ -13,7 +13,7 @@ process GET_CONSENSUS_COUNTS {
     val(output_name)
 
     output:
-    path "*.tsv"         , emit: tsv
+    tuple val(meta), path ("*.tsv")         , emit: tsv
     path  "versions.yml" , emit: versions
 
     when:
diff --git a/subworkflows/local/consensus_peak_table.nf b/subworkflows/local/consensus_peak_table.nf
index 20d4b485..86d44dd1 100644
--- a/subworkflows/local/consensus_peak_table.nf
+++ b/subworkflows/local/consensus_peak_table.nf
@@ -6,6 +6,8 @@
 // MODULES
 //
 include { BEDTOOLS_MAP as CONSENSUS_MAP } from '../../modules/nf-core/bedtools/map/main'
+include { BEDTOOLS_SORT as CONSENSUS_SORT       } from '../../modules/nf-core/bedtools/sort/main'
+include { DESEQ2_QC              } from '../../modules/local/deseq2_qc'
 //
 // SUBWORKFLOWS
 //
@@ -16,23 +18,35 @@ workflow CONSENSUS_PEAK_TABLE {
     take:
     all_crosslinks           // channel: [ val(meta), [ xl.bed ] ]
     consensus_peaks          // channel: [ val(meta), [ peaks.bed ] ]
+    genome_fasta
     genome_fai               // channel: [ val(meta), [ index ] ]
     gtf                      // channel: [ val(meta), [ gtf ] ]
     table_name               // val "Clippy_Consensus_Counts.tsv" must end in tsv
+    ch_deseq2_pca_header_multiqc
+    ch_deseq2_clustering_header_multiqc
+    skip_deseq2_qc   
 
     main:
     ch_versions = Channel.empty()
 
+
+    CONSENSUS_SORT ( all_crosslinks , [])
+    ch_consensus_sorted = CONSENSUS_SORT.out.sorted
+
+
     consensus_peaks
-        .combine(all_crosslinks)
+        .combine(ch_consensus_sorted)
         .map{ meta1, consensuspeaks, meta2, crosslink -> [meta2, consensuspeaks, crosslink] }
         .set { ch_consensus_map }
 
+
     //ch_consensus_map.view { item -> "consensus for bedtools map: $item" }
 
+    ch_consensus_map.view()
+
     CONSENSUS_MAP (
         ch_consensus_map,
-        genome_fai
+        genome_fai,
     )
 
     CONSENSUS_MAP.out.mapped
@@ -46,7 +60,48 @@ workflow CONSENSUS_PEAK_TABLE {
         table_name
     )
 
+    //
+    // Generate QC plots with DESeq2
+    //
+    ch_deseq2_qc_pdf           = Channel.empty()
+    ch_deseq2_qc_rdata         = Channel.empty()
+    ch_deseq2_qc_rds           = Channel.empty()
+    ch_deseq2_qc_pca_txt       = Channel.empty()
+    ch_deseq2_qc_pca_multiqc   = Channel.empty()
+    ch_deseq2_qc_dists_txt     = Channel.empty()
+    ch_deseq2_qc_dists_multiqc = Channel.empty()
+    ch_deseq2_qc_log           = Channel.empty()
+    ch_deseq2_qc_size_factors  = Channel.empty()
+    if (!skip_deseq2_qc) {
+        DESEQ2_QC (
+            GET_CONSENSUS_COUNTS.out.tsv,
+            ch_deseq2_pca_header_multiqc,
+            ch_deseq2_clustering_header_multiqc
+        )
+        ch_deseq2_qc_pdf           = DESEQ2_QC.out.pdf
+        ch_deseq2_qc_rdata         = DESEQ2_QC.out.rdata
+        ch_deseq2_qc_rds           = DESEQ2_QC.out.rds
+        ch_deseq2_qc_pca_txt       = DESEQ2_QC.out.pca_txt
+        ch_deseq2_qc_pca_multiqc   = DESEQ2_QC.out.pca_multiqc
+        ch_deseq2_qc_dists_txt     = DESEQ2_QC.out.dists_txt
+        ch_deseq2_qc_dists_multiqc = DESEQ2_QC.out.dists_multiqc
+        ch_deseq2_qc_log           = DESEQ2_QC.out.log
+        ch_deseq2_qc_size_factors  = DESEQ2_QC.out.size_factors
+        ch_versions = ch_versions.mix(DESEQ2_QC.out.versions)
+    }
+
     emit:
-    mapped_table   = GET_CONSENSUS_COUNTS.out.tsv    // channel: [ val(meta), [ tsv ] ]
+    mapped_table            = GET_CONSENSUS_COUNTS.out.tsv    // channel: [ val(meta), [ tsv ] ]
+
+
+    deseq2_qc_pdf           = ch_deseq2_qc_pdf                  // channel: [ pdf ]
+    deseq2_qc_rdata         = ch_deseq2_qc_rdata                // channel: [ rdata ]
+    deseq2_qc_rds           = ch_deseq2_qc_rds                  // channel: [ rds ]
+    deseq2_qc_pca_txt       = ch_deseq2_qc_pca_txt              // channel: [ txt ]
+    deseq2_qc_pca_multiqc   = ch_deseq2_qc_pca_multiqc          // channel: [ txt ]
+    deseq2_qc_dists_txt     = ch_deseq2_qc_dists_txt            // channel: [ txt ]
+    deseq2_qc_dists_multiqc = ch_deseq2_qc_dists_multiqc        // channel: [ txt ]
+    deseq2_qc_log           = ch_deseq2_qc_log                  // channel: [ txt ]
+    deseq2_qc_size_factors  = ch_deseq2_qc_size_factors         // channel: [ txt ]
     versions         = ch_versions                   // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
index d5f8bf90..282aa37c 100644
--- a/subworkflows/local/prepare_genome.nf
+++ b/subworkflows/local/prepare_genome.nf
@@ -104,7 +104,7 @@ workflow PREPARE_GENOME {
         }
     }
     else {
-        ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map{it[1]}, ch_gtf.map{it[1]} ).index
+        ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf ).index
         ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
     }
 
@@ -121,7 +121,7 @@ workflow PREPARE_GENOME {
         }
     }
     else {
-        ch_bt_index = BOWTIE_BUILD ( ch_ncrna_fasta.map{it[1]} ).index
+        ch_bt_index = BOWTIE_BUILD ( ch_ncrna_fasta).index
         ch_versions = ch_versions.mix(BOWTIE_BUILD.out.versions)
     }
 
diff --git a/subworkflows/local/rna_align.nf b/subworkflows/local/rna_align.nf
index a8d82e83..aaffbc08 100644
--- a/subworkflows/local/rna_align.nf
+++ b/subworkflows/local/rna_align.nf
@@ -230,4 +230,4 @@ workflow RNA_ALIGN {
     transcript_multi_bai   = ch_transcript_multi_bai                // channel: [ val(meta), [ bai ] ]
 
     versions         = ch_versions                      // channel: [ versions.yml ]
-}
+}
\ No newline at end of file
diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf
index 3f09ed36..b77d4c93 100644
--- a/workflows/clipseq.nf
+++ b/workflows/clipseq.nf
@@ -60,6 +60,10 @@ if ((caller_list + callers).unique().size() != caller_list.size()) {
 ch_dummy_file  = file("$projectDir/assets/dummy_file.txt", checkIfExists: true)
 ch_dummy_file2 = file("$projectDir/assets/dummy_file2.txt", checkIfExists: true)
 
+// deseq2 header files:
+ch_multiqc_merged_replicate_deseq2_pca_header        = file("$projectDir/assets/merged_replicate_deseq2_pca_header.txt", checkIfExists: true)
+ch_multiqc_merged_replicate_deseq2_clustering_header = file("$projectDir/assets/merged_replicate_deseq2_clustering_header.txt", checkIfExists: true)
+
 // // Check if an AWS iGenome has been provided to use the appropriate version of STAR
 // def is_aws_igenome = false
 // if (params.fasta && params.gtf) {
@@ -442,7 +446,7 @@ workflow CLIPSEQ {
 
         ICOUNTMINI_SUMMARY (
             ch_genome_crosslink_group_resolved_bed,
-            ch_regions_resolved_gtf.collect{ it[1] }
+            ch_regions_resolved_gtf
         )
 
         MERGE_SUMMARY (
@@ -459,7 +463,7 @@ workflow CLIPSEQ {
 
         ICOUNTMINI_METAGENE (
             ch_genome_crosslink_group_resolved_bed,
-            ch_regions_resolved_gtf.collect{ it[1] }
+            ch_regions_resolved_gtf
         )
 
         if(params.consensus_peak){
@@ -539,9 +543,13 @@ workflow CLIPSEQ {
                 CLIPPY_CONSENSUS_PEAK_TABLE (
                     ch_all_crosslinks,
                     CLIPPY_GENOME_CONSENSUS.out.peaks,
+                    ch_fasta,
                     ch_fasta_fai,
-                    ch_regions_resolved_gtf,
-                    "Clippy_Consensus_AllCounts.tsv"
+                    ch_regions_resolved_gtf.collect{ it[1] },
+                    "Clippy_Consensus_AllCounts.tsv",
+                    ch_multiqc_merged_replicate_deseq2_pca_header,
+                    ch_multiqc_merged_replicate_deseq2_clustering_header,
+                    params.skip_deseq2_qc
                 )
                 ch_versions = ch_versions.mix(CLIPPY_CONSENSUS_PEAK_TABLE.out.versions)
             }
@@ -563,7 +571,7 @@ workflow CLIPSEQ {
 
             ICOUNTMINI_SIGXLS (
                 ch_genome_crosslink_group_resolved_bed,
-                ch_seg_resolved_gtf.collect{ it[1]}
+                ch_seg_resolved_gtf
                 
             )
 
@@ -601,7 +609,7 @@ workflow CLIPSEQ {
             if(params.consensus_peak){
                 CONSENSUS_ICOUNTMINI_SIGXLS (
                     ch_consensus_crosslinks_final_bed,
-                    ch_seg_resolved_gtf.collect{ it[1]}
+                    ch_seg_resolved_gtf
                 )
                 // CHANNEL: Create combined channel of input crosslinks and sigxls
                 ch_consensus_peaks_input = ch_consensus_crosslinks_final_bed
@@ -622,9 +630,13 @@ workflow CLIPSEQ {
                 ICOUNT_CONSENSUS_PEAK_TABLE (
                     ch_all_crosslinks,
                     CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS.out.gunzip,
+                    ch_fasta,
                     ch_fasta_fai,
                     ch_regions_resolved_gtf,
-                    "iCount-Mini_Consensus_AllCounts.tsv"
+                    "iCount-Mini_Consensus_AllCounts.tsv",
+                    ch_multiqc_merged_replicate_deseq2_pca_header,
+                    ch_multiqc_merged_replicate_deseq2_clustering_header,
+                    params.skip_deseq2_qc
                 )
                 ch_versions = ch_versions.mix(ICOUNT_CONSENSUS_PEAK_TABLE.out.versions)
             }
@@ -635,7 +647,8 @@ workflow CLIPSEQ {
                     ch_genome_crosslink_group_resolved_bed,
                     ch_fasta.collect{ it[1] },
                     ch_fasta_fai.collect{ it[1] },
-                    ch_regions_resolved_gtf.collect{ it[1] }
+                    ch_regions_resolved_gtf
+                    //ch_regions_resolved_gtf.collect{ it[1] }
                 )
                 ch_versions = ch_versions.mix(PEKA_ICOUNT.out.versions)
             }
@@ -663,9 +676,13 @@ workflow CLIPSEQ {
                 PARACLU_CONSENSUS_PEAK_TABLE (
                     ch_all_crosslinks,
                     PARACLU_GENOME_CONSENSUS.out.bed,
+                    ch_fasta,
                     ch_fasta_fai,
                     ch_regions_resolved_gtf,
-                    "Paraclu_Consensus_AllCounts.tsv"
+                    "Paraclu_Consensus_AllCounts.tsv",
+                    ch_multiqc_merged_replicate_deseq2_pca_header,
+                    ch_multiqc_merged_replicate_deseq2_clustering_header,
+                    params.skip_deseq2_qc
                 )
                 ch_versions = ch_versions.mix(PARACLU_CONSENSUS_PEAK_TABLE.out.versions)
             }
@@ -676,7 +693,7 @@ workflow CLIPSEQ {
                     ch_genome_crosslink_group_resolved_bed,
                     ch_fasta.collect{ it[1] },
                     ch_fasta_fai.collect{ it[1] },
-                    ch_regions_resolved_gtf.collect{ it[1] }
+                    ch_regions_resolved_gtf
                 )
                 ch_versions = ch_versions.mix(PEKA_PARACLU.out.versions)
             }

From 70d738d76771e562cd1aeeb3fd67e1904bcee81b Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 10 Nov 2025 15:59:39 +0000
Subject: [PATCH 06/11] revert small test config

---
 conf/test.config                           | 39 ++++------------------
 tests/test_new_samplesheet_FASTQ_human.csv | 10 ------
 2 files changed, 7 insertions(+), 42 deletions(-)
 delete mode 100644 tests/test_new_samplesheet_FASTQ_human.csv

diff --git a/conf/test.config b/conf/test.config
index 618bded7..6a241ddf 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -15,15 +15,13 @@ params {
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
-    max_cpus   = 8
-    max_memory = '60.GB'
-    max_time   = '24.h'
+    max_cpus   = 2
+    max_memory = '8.GB'
+    max_time   = '6.h'
 
-    // Inputs for testing dataset with yeast genome
-    //input  = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv'
-
-    //input    = '../tests/test_new_samplesheet_FASTQ.csv'
-    //source   = "fastq"
+    // Input data
+    input  = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv'
+    source = "fastq"
 
     // Genome references
     fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz"
@@ -42,20 +40,6 @@ params {
     regions_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions.gtf.gz"
     regions_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOtherfalse.resolved.gtf"
 
-
-    // Input data for full human testing
-    input  = "./tests/test_new_samplesheet_FASTQ_human.csv"
-    fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz"
-    gtf   = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz"
-    ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz"
-    source = "fastq"
-
-    // Genome references from s3 bucket
-    //fasta         = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz'
-    //gtf           = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz'
-    
-
-
     // Logic
     debug                 = true
     save_reference        = true
@@ -67,16 +51,7 @@ params {
     skip_transcriptome    = true
     skip_filter_gtf       = false
 
-    // Inputs for deseq2_qc
-    skip_deseq2_qc        = false
-    
-
     // Pipeline params
     umitools_bc_pattern = 'NNNNNNNNN'
-
-    // Don't call consensus
-    //consensus_peak        = false
-
-    
-
 }
+
diff --git a/tests/test_new_samplesheet_FASTQ_human.csv b/tests/test_new_samplesheet_FASTQ_human.csv
deleted file mode 100644
index 3ff1bafb..00000000
--- a/tests/test_new_samplesheet_FASTQ_human.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-sample_name,group_name,input_name,fastq
-TDP43_1,TDP43,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR1530360.fastq.gz
-TDP43_2,TDP43,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR1530361.fastq.gz
-MATR3_1,MATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2210802.fastq.gz
-MATR3_2,MATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2210803.fastq.gz
-PTBP1_1,PTBP1,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214592.fastq.gz
-PTBP1_2,PTBP1,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214593.fastq.gz
-PTBP1_siMATR3_1,PTBP1_siMATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214594.fastq.gz
-PTBP1_siMATR3_2,PTBP1_siMATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214595.fastq.gz
-STAU1_1,STAU1,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR605258.fastq.gz
\ No newline at end of file

From 008ad81c3c2e01cb3ebbd75bb1c74099ccb625fb Mon Sep 17 00:00:00 2001
From: Chhabria <chhabrs1@iscxh001.mskcc.org>
Date: Mon, 10 Nov 2025 11:04:54 -0500
Subject: [PATCH 07/11] Adding new files

---
 assets/merged_replicate_deseq2_pca_header.txt | 11 +++++++++++
 tests/test_new_samplesheet_FASTQ.csv          |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 assets/merged_replicate_deseq2_pca_header.txt

diff --git a/assets/merged_replicate_deseq2_pca_header.txt b/assets/merged_replicate_deseq2_pca_header.txt
new file mode 100644
index 00000000..a92b75d9
--- /dev/null
+++ b/assets/merged_replicate_deseq2_pca_header.txt
@@ -0,0 +1,11 @@
+#id: 'mrep_deseq2_pca'
+#section_name: 'MERGED REP: DESeq2 PCA plot'
+#description: "PCA plot of the samples in the experiment.
+#              These values are calculated using <a href='https://bioconductor.org/packages/release/bioc/html/DESeq2.html'>DESeq2</a>
+#              in the <a href='https://github.com/nf-core/atacseq/blob/master/bin/deseq2_qc.r'><code>deseq2_qc.r</code></a> script."
+#plot_type: 'scatter'
+#anchor: 'mrep_deseq2_pca'
+#pconfig:
+#    title: 'DESeq2: Principal component plot'
+#    xlab: PC1
+#    ylab: PC2
diff --git a/tests/test_new_samplesheet_FASTQ.csv b/tests/test_new_samplesheet_FASTQ.csv
index a74b8037..98cf201b 100644
--- a/tests/test_new_samplesheet_FASTQ.csv
+++ b/tests/test_new_samplesheet_FASTQ.csv
@@ -2,4 +2,4 @@ sample_name,group_name,input_name,fastq
 PHO92_A,PHO92,HNRNPC,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz
 PHO92_B,PHO92,HNRNPC,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz
 PHO92_C,PHO92,HNRNPC,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz
-HNRNPC,,,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz
\ No newline at end of file
+HNRNPC,HNRNPC,,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz
\ No newline at end of file

From e638c7fdfcdd65adbdd9420b2980128265b57e1e Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 10 Nov 2025 16:13:15 +0000
Subject: [PATCH 08/11] add default for params.skip_deseq2_qc = some_value

---
 nextflow.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nextflow.config b/nextflow.config
index 83861b5f..71319c7e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -54,6 +54,7 @@ params {
     skip_umi_extract   = true
     skip_trimming      = false
     skip_transcriptome = true
+    skip_deseq2_qc     = false
 
     // Output params
     save_reference        = false

From a2d824399c867d5cc916a2e30f20027f147c1f45 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 10 Nov 2025 20:20:32 +0000
Subject: [PATCH 09/11] update file naming

---
 modules/local/deseq2_qc/main.nf               | 2 +-
 modules/local/deseq2_qc/templates/deseq2_qc.R | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/local/deseq2_qc/main.nf b/modules/local/deseq2_qc/main.nf
index e043abdd..8f52f7d2 100644
--- a/modules/local/deseq2_qc/main.nf
+++ b/modules/local/deseq2_qc/main.nf
@@ -37,7 +37,7 @@ process DESEQ2_QC {
     -i ${counts} \
     -c 3 \
     -d "PeakID" \
-    -o results/${prefix} \
+    -o ${prefix} \
     --vst
     """
 
diff --git a/modules/local/deseq2_qc/templates/deseq2_qc.R b/modules/local/deseq2_qc/templates/deseq2_qc.R
index 382724a0..8aff2570 100755
--- a/modules/local/deseq2_qc/templates/deseq2_qc.R
+++ b/modules/local/deseq2_qc/templates/deseq2_qc.R
@@ -118,7 +118,7 @@ dds     <- DESeqDataSetFromMatrix(countData=round(counts), colData=coldata, desi
 dds     <- estimateSizeFactors(dds)
 if (min(dim(count.table))<=1)  { # No point if only one sample, or one gene
     save(dds,file=DDSFile)
-    saveRDS(dds, file = paste(opt$output_prefix,".rds", sep = '.'))
+    saveRDS(dds, file = paste(opt$output_prefix,"rds", sep = '.'))
     warning("Not enough samples or genes in counts file for PCA.", call.=FALSE)
     quit(save = "no", status = 0, runLast = FALSE)
 }
@@ -132,7 +132,7 @@ if (!opt$vst) {
 
 assay(dds, vst_name) <- assay(rld)
 save(dds,file=DDSFile)
-saveRDS(dds, file = paste(opt$output_prefix, ".rds", sep = '.'))
+saveRDS(dds, file = paste(opt$output_prefix, "rds", sep = '.'))
 
 ################################################
 ################################################

From c09a801117261d58e065b1289ca05cd8298ea4c3 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 10 Nov 2025 21:30:59 +0000
Subject: [PATCH 10/11] update consensus modules

---
 conf/modules.config  | 30 ------------------------------
 workflows/clipseq.nf | 39 ---------------------------------------
 2 files changed, 69 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 5ddc834f..ee1e6f2f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -916,36 +916,6 @@ if(params.run_peakcalling && params.peakcaller.contains('icount')) {
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
-
-        withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_ICOUNTMINI_SIGXLS' {
-            ext.args = { "${params.icount_sigxl_params}" }
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_ICOUNTMINI_PEAKS' {
-            ext.args = { "${params.icount_peaks_params}" }
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS' {
-            publishDir = [
-                path: { "${params.outdir}/05_peakcalling/icountmini" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS' {
-            publishDir = [
-                path: { "${params.outdir}/05_peakcalling/icountmini" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
     }
 }
 
diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf
index 76d910a2..8e0a17e2 100644
--- a/workflows/clipseq.nf
+++ b/workflows/clipseq.nf
@@ -137,10 +137,6 @@ include { ICOUNTMINI_PEAKS                                          } from "../m
 include { GUNZIP as GUNZIP_ICOUNTMINI_SIGXLS                        } from "../modules/nf-core/gunzip/main"
 include { GUNZIP as GUNZIP_ICOUNTMINI_PEAKS                         } from "../modules/nf-core/gunzip/main"
 
-include { ICOUNTMINI_SIGXLS as CONSENSUS_ICOUNTMINI_SIGXLS          } from "../modules/nf-core/icountmini/sigxls/main"
-include { ICOUNTMINI_PEAKS as CONSENSUS_ICOUNTMINI_PEAKS            } from "../modules/nf-core/icountmini/peaks/main"
-include { GUNZIP as CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS              } from "../modules/nf-core/gunzip/main"
-include { GUNZIP as CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS               } from "../modules/nf-core/gunzip/main"
 include { PARACLU as PARACLU_GENOME                                 } from "../modules/nf-core/paraclu/main"
 include { PARACLU as PARACLU_GENOME_CONSENSUS                       } from "../modules/nf-core/paraclu/main"
 
@@ -595,41 +591,6 @@ workflow CLIPSEQ {
             ch_versions                      = ch_versions.mix(GUNZIP_ICOUNTMINI_PEAKS.out.versions)
             ch_icountmini_peaks              = GUNZIP_ICOUNTMINI_PEAKS.out.gunzip
 
-            if(params.consensus_peak){
-                CONSENSUS_ICOUNTMINI_SIGXLS (
-                    ch_consensus_crosslinks_final_bed,
-                    ch_seg_gtf.collect{ it[1]}
-                )
-                // CHANNEL: Create combined channel of input crosslinks and sigxls
-                ch_consensus_peaks_input = ch_consensus_crosslinks_final_bed
-                    .map{ [ it[0].id, it[0], it[1] ] }
-                    .join( CONSENSUS_ICOUNTMINI_SIGXLS.out.sigxls.map{ [ it[0].id, it[0], it[1] ] } )
-                    .map { [ it[1], it[2], it[4] ] }
-                //EXAMPLE CHANNEL STRUCT: [ [id:test], BED(crosslinks), BED(sigxls) ]
-                CONSENSUS_ICOUNTMINI_PEAKS (
-                    ch_consensus_peaks_input
-                )
-                ch_consensus_peaks = CONSENSUS_ICOUNTMINI_PEAKS.out.peaks
-                CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS (
-                    CONSENSUS_ICOUNTMINI_SIGXLS.out.sigxls
-                )
-                CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS (
-                    CONSENSUS_ICOUNTMINI_PEAKS.out.peaks
-                )
-                ICOUNT_CONSENSUS_PEAK_TABLE (
-                    ch_all_crosslinks,
-                    CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS.out.gunzip,
-                    ch_fasta,
-                    ch_fasta_fai,
-                    ch_regions_used,
-                    "iCount-Mini_Consensus_AllCounts.tsv",
-                    ch_multiqc_merged_replicate_deseq2_pca_header,
-                    ch_multiqc_merged_replicate_deseq2_clustering_header,
-                    params.skip_deseq2_qc
-                )
-                ch_versions = ch_versions.mix(ICOUNT_CONSENSUS_PEAK_TABLE.out.versions)
-            }
-
             if(params.run_peka) {
                 PEKA_ICOUNT (
                     ch_icountmini_peaks,

From 573b68619ff1bba971dab5f2ceb2405b18913461 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 10 Nov 2025 21:44:49 +0000
Subject: [PATCH 11/11] update some resource requirements

---
 modules/local/bedgraph_strand_split/main.nf   | 2 +-
 modules/local/consensus_count_table/main.nf   | 2 +-
 modules/nf-core/peka/main.nf                  | 2 +-
 modules/nf-core/ucsc/bedgraphtobigwig/main.nf | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/local/bedgraph_strand_split/main.nf b/modules/local/bedgraph_strand_split/main.nf
index 5888a035..d611d0f3 100644
--- a/modules/local/bedgraph_strand_split/main.nf
+++ b/modules/local/bedgraph_strand_split/main.nf
@@ -1,7 +1,7 @@
 // modules/local/bedgraph_strand_split/main.nf
 process BEDGRAPH_STRAND_SPLIT {
     tag "$meta.id"
-    label 'process_low'
+    label 'process_medium'
 
     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/local/consensus_count_table/main.nf b/modules/local/consensus_count_table/main.nf
index 0d5e5b24..3515b2e8 100644
--- a/modules/local/consensus_count_table/main.nf
+++ b/modules/local/consensus_count_table/main.nf
@@ -1,6 +1,6 @@
 process GET_CONSENSUS_COUNTS {
     tag "${meta.id}"
-    label "process_single"
+    label "process_high"
 
     conda "bioconda::bioconductor-rtracklayer=1.62.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/nf-core/peka/main.nf b/modules/nf-core/peka/main.nf
index 416a44a8..84256428 100644
--- a/modules/nf-core/peka/main.nf
+++ b/modules/nf-core/peka/main.nf
@@ -1,6 +1,6 @@
 process PEKA {
     tag "$meta.id"
-    label 'process_low'
+    label 'process_high'
 
     conda "bioconda::peka=1.0.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf
index 81cdee95..37a201f5 100644
--- a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf
+++ b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf
@@ -1,6 +1,6 @@
 process UCSC_BEDGRAPHTOBIGWIG {
     tag "$meta.id"
-    label 'process_single'
+    label 'process_medium'
 
     // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
     conda "${moduleDir}/environment.yml"