From 19744b6ca9267754b9ffc2cab567ccdec6739a15 Mon Sep 17 00:00:00 2001 From: Chhabria Date: Thu, 20 Mar 2025 17:50:58 -0400 Subject: [PATCH 01/11] MultiQC changes --- assets/multiqc_config.yml | 223 ++++++++++++++++++++++- conf/modules.config | 17 +- conf/test.config | 52 ++++-- modules/local/clipqc/main.nf | 4 + modules/local/clipqc/templates/clipqc.py | 97 ++++++++-- nextflow.config | 1 + workflows/clipseq.nf | 73 ++++++-- 7 files changed, 398 insertions(+), 69 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index ca20bdd8..e6d43047 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -42,19 +42,226 @@ module_order: - star: name: "STAR Genome Alignment" path_filters: - - "*.final.out" + - "*.final.out" - custom_content + +# Other MultiQC config stuff here +custom_data: + mapping: + parent_id: mapping + parent_name: 'Mapping' + file_format: 'tsv' + section_name: 'Mapping' + description: 'The mapping metrics for each experiment' + plot_type: 'bargraph' + dedup_reads: + parent_id: dedup + parent_name: 'Deduplication' + file_format: 'tsv' + section_name: 'Reads' + description: 'The number of reads before and after PCR deduplication for each experiment' + plot_type: 'bargraph' + pconfig: + ylab: 'Count' + #stacking: False + cpswitch: False + tt_percentages: False + dedup_ratio: + parent_id: dedup + parent_name: 'Deduplication' + file_format: 'tsv' + section_name: 'Ratio' + description: 'The PCR deduplication ratio for each experiment' + plot_type: 'bargraph' + pconfig: + ylab: 'Ratio' + #stacking: False + cpswitch: False + tt_percentages: False + dedup_mean_umis: + parent_id: dedup + parent_name: 'Deduplication' + file_format: 'tsv' + section_name: 'Mean UMIs' + description: 'Mean number of unique UMIs per position for each experiment' + plot_type: 'bargraph' + pconfig: + ylab: 'Mean number' + #stacking: False + cpswitch: False + tt_percentages: False + crosslinks_counts: + parent_id: crosslinks + parent_name: 'Crosslinks' + file_format: 'tsv' + section_name: 'Counts' + description: 'The number of crosslinks or crosslink sites for each experiment' + plot_type: 'bargraph' + pconfig: + ylab: 'Count' + #stacking: False + cpswitch: False + tt_percentages: False + crosslinks_ratio: + parent_id: crosslinks + parent_name: 'Crosslinks' + file_format: 'tsv' + section_name: 'Ratios' + description: 'The ratio of number of cDNA mapping to crosslink positions for each experiment' + #plot_type: 'bargraph' + pconfig: + ylab: 'Count' + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + peaks_counts: + parent_id: peaks + parent_name: 'Peaks' + file_format: 'tsv' + section_name: 'Counts' + description: 'The total number of peaks called by each peak caller' + plot_type: 'bargraph' + pconfig: + ylab: 'Number of peaks' + #stacking: False + cpswitch: False + tt_percentages: False + xlinks_in_peaks: + parent_id: peaks + parent_name: 'Peaks' + file_format: 'tsv' + section_name: 'Crosslinks positions in peaks' + description: 'The total percentage of crosslinks within peaks for each peak caller' + #plot_type: 'bargraph' + pconfig: + ylab: 'Percentage of crosslinks' + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: '%' + xlinksites_in_peaks: + parent_id: peaks + parent_name: 'Peaks' + file_format: 'tsv' + section_name: 'Crosslinks positions in peaks' + description: 'The total percentage of crosslink sites within peaks for each peak caller' + #plot_type: 'bargraph' + pconfig: + ylab: 'Percentage of crosslink sites' + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: '%' + peaks_xlinksite_coverage: + parent_id: peaks + parent_name: 'Peaks' + file_format: 'tsv' + section_name: 'Peak-crosslink coverage' + description: 'The total percentage of nucleotides within peaks covered by a crosslink site' + plot_type: 'bargraph' + pconfig: + ylab: 'Percentage of nucleotides within peaks' + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: '%' + summary_type: + parent_id: Summary + parent_name: 'Summary' + file_format: 'tsv' + section_name: 'Percentage of cDNA premap' + description: 'The total percentage of cDNA summary mapped' + #plot_type: 'bargraph' + pconfig: + ylab: 'Type' + #stacking: False + cpswitch: False + tt_percentages: False + summary_subtype: + parent_id: Summary + parent_name: 'Summary' + file_format: 'tsv' + section_name: 'Percentage of cDNA premap subtypes' + description: 'The total percentage of cDNA subtypes mapped' + #plot_type: 'bargraph' + pconfig: + ylab: 'Type' + #stacking: False + cpswitch: False + tt_percentages: False + + +sp: + mapping: + fn: 'mapping.tsv' + dedup_reads: + fn: 'dedup_reads.tsv' + dedup_ratio: + fn: 'dedup_ratio.tsv' + dedup_mean_umis: + fn: 'dedup_mean_umis.tsv' + crosslinks_counts: + fn: 'xlinks_counts.tsv' + crosslinks_ratio: + fn: 'xlinks_ratio.tsv' + peaks: + fn: 'total_peaks.tsv' + xlinks_in_peaks: + fn: 'xlinks_in_peaks.tsv' + xlinksites_in_peaks: + fn: 'xlinksites_in_peaks.tsv' + peaks_xlinksite_coverage: + fn: 'peaks_xlinksite_coverage.tsv' + summary_type: + fn: 'summary_type_metrics.tsv' + summary_subtype: + fn: '"summary_subtype_metrics.tsv' + + +sp: + mapping: + fn: 'mapping.tsv' + dedup_reads: + fn: 'dedup_reads.tsv' + dedup_ratio: + fn: 'dedup_ratio.tsv' + dedup_mean_umis: + fn: 'dedup_mean_umis.tsv' + crosslinks_counts: + fn: 'xlinks_counts.tsv' + crosslinks_ratio: + fn: 'xlinks_ratio.tsv' + peaks: + fn: 'total_peaks.tsv' + xlinks_in_peaks: + fn: 'xlinks_in_peaks.tsv' + xlinksites_in_peaks: + fn: 'xlinksites_in_peaks.tsv' + peaks_xlinksite_coverage: + fn: 'peaks_xlinksite_coverage.tsv' + summary_type: + fn: 'summary_type_metrics.tsv' + summary_subtype: + fn: 'summary_subtype_metrics.tsv' + custom_content: order: + - clipqc - software-versions-by-process - software-versions-unique # Customise the module search patterns to speed up execution time -sp: - samtools/stats: - fn: "*.stats" - samtools/flagstat: - fn: "*.flagstat" - samtools/idxstats: - fn: "*.idxstats*" +# sp: +# samtools/stats: +# fn: "*.stats" +# samtools/flagstat: +# fn: "*.flagstat" +# samtools/idxstats: +# fn: "*.idxstats*" +# clipqc: +# fn: "*.txt" diff --git a/conf/modules.config b/conf/modules.config index f5928fd6..a787b7c6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,6 +34,7 @@ process { ======================================================================================== */ + if(params.run_genome_prep) { process { withName: '.*PREPARE_GENOME:GUNZIP_.*' { @@ -71,7 +72,6 @@ if(params.run_genome_prep) { path: { "${params.outdir}/00_genome" }, mode: "${params.publish_dir_mode}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.save_reference ] } @@ -185,6 +185,7 @@ if(params.run_genome_prep) { } } + /* ======================================================================================== PRE-PROCESSING @@ -1004,13 +1005,13 @@ if(params.run_reporting) { ] } - // withName: 'CLIPSEQ:CLIPSEQ_CLIPQC' { - // publishDir = [ - // path: { "${params.outdir}/06_reports/clipqc" }, - // mode: "${params.publish_dir_mode}", - // saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - // ] - // } + withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CLIPQC' { + publishDir = [ + path: { "${params.outdir}/06_reports/clipqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: 'NFCORE_CLIPSEQ:CLIPSEQ:MULTIQC' { ext.args = params.multiqc_title ? "-v --title \"$params.multiqc_title\"" : '-v' diff --git a/conf/test.config b/conf/test.config index 0359cedb..42e07573 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,29 +20,37 @@ params { max_time = '6.h' // Input data - input = '/Users/capitac/repos/nfcore-clipseq/tests/test_new_samplesheet_FASTQ.csv' + //input = '/Users/capitac/repos/nfcore-clipseq/tests/test_new_samplesheet_FASTQ.csv' + input = "./tests/test_new_samplesheet_FASTQ.csv" + fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" + gtf = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz" + ncrna_fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.smrna.fasta" source = "fastq" // Genome references - fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz" - ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz" - gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.gtf.gz" - genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/star.tar.gz" - ncrna_genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/bowtie.tar.gz" - genome_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.sizes" - ncrna_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.sizes" - longest_transcript = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.txt" - longest_transcript_fai = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.fai" - longest_transcript_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.gtf" - filtered_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered.gtf" - seg_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_seg.gtf" - seg_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg.gtf" - regions_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_regions.gtf.gz" - regions_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions.gtf.gz" - seg_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOtherfalse.resolved.gtf" - regions_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOtherfalse.resolved.gtf" - seg_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOthertrue.resolved.gtf" - regions_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOthertrue.resolved.gtf" + //fasta = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz' + //gtf = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz' + + // Genome references + //fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz" + //ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz" + //gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.gtf.gz" + //genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/star.tar.gz" + //ncrna_genome_index = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/bowtie.tar.gz" + //genome_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.sizes" + //ncrna_chrom_sizes = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.sizes" + //longest_transcript = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.txt" + //longest_transcript_fai = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.fai" + //longest_transcript_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/longest_transcript.gtf" + //filtered_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered.gtf" + //seg_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_seg.gtf" + //seg_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg.gtf" + //regions_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_regions.gtf.gz" + //regions_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions.gtf.gz" + //seg_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOtherfalse.resolved.gtf" + //regions_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOtherfalse.resolved.gtf" + //seg_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOthertrue.resolved.gtf" + //regions_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOthertrue.resolved.gtf" // Logic debug = true @@ -56,4 +64,8 @@ params { // Pipeline params umitools_bc_pattern = 'NNNNNNNNN' + + // Don't call consensus + //consensus_peak = false + } diff --git a/modules/local/clipqc/main.nf b/modules/local/clipqc/main.nf index 3c9e81e7..0fc14820 100644 --- a/modules/local/clipqc/main.nf +++ b/modules/local/clipqc/main.nf @@ -12,6 +12,10 @@ process CLIPQC { path("icount/*") path("paraclu/*") path("clippy/*") + path("pureclip/*") + path("summary_type/*") + path("summary_subtype/*") + path("summary_gene/*") output: path "*.tsv" , emit: tsv diff --git a/modules/local/clipqc/templates/clipqc.py b/modules/local/clipqc/templates/clipqc.py index f265983f..41d1b7b6 100644 --- a/modules/local/clipqc/templates/clipqc.py +++ b/modules/local/clipqc/templates/clipqc.py @@ -49,6 +49,8 @@ def main(process_name): ncrna["ncrna_reads"].append(input_reads - output_reads) ncrna_df = pd.DataFrame(ncrna) + print(ncrna_df) + # Next get STAR logs star_logs = sorted(["mapped/" + f for f in os.listdir("mapped") if f.endswith(".Log.final.out")]) @@ -96,7 +98,7 @@ def main(process_name): for dedup_log in dedup_logs: with open(dedup_log, "r") as logfile: - exp = re.sub(".log", "", os.path.basename(dedup_log)) + exp = re.sub(".genomeUnique.dedup_UMICollapse.log", "", os.path.basename(dedup_log)) lines = logfile.readlines() @@ -149,26 +151,26 @@ def read_bed(filename): # First get xlink bed files xlinks_files = sorted(["xlinks/" + f for f in os.listdir("xlinks") if f.endswith(".bed")]) - xlinks = dict((key, []) for key in ["exp", "total_xlinks", "total_xlinksites", "ratio"]) + xlinks = dict((key, []) for key in ["exp", "number_of_cDNAs", "total_positions_of_crosslinks", "ratio"]) for xlinks_file in xlinks_files: xlinks_df = read_bed(xlinks_file) - exp = re.sub(".bed", "", os.path.basename(xlinks_file)) - total_xlinks = xlinks_df["score"].sum() - total_xlinksites = xlinks_df.shape[0] - ratio = total_xlinks / total_xlinksites + exp = re.sub(".genome.xl.bed", "", os.path.basename(xlinks_file)) + number_of_cDNAs = xlinks_df["score"].sum() + total_positions_of_crosslinks = xlinks_df.shape[0] + ratio = number_of_cDNAs / total_positions_of_crosslinks xlinks["exp"].append(exp) - xlinks["total_xlinks"].append(total_xlinks) - xlinks["total_xlinksites"].append(total_xlinksites) - xlinks["ratio"].append(round(total_xlinks / total_xlinksites, 2)) + xlinks["number_of_cDNAs"].append(number_of_cDNAs) + xlinks["total_positions_of_crosslinks"].append(total_positions_of_crosslinks) + xlinks["ratio"].append(round(number_of_cDNAs / total_positions_of_crosslinks, 2)) xlinks_metrics_df = pd.DataFrame(xlinks) xlinks_metrics_df.to_csv("xlinks_metrics.tsv", sep="\t", index=False) # Subset for MultiQC plots - xlinks_metrics_df.loc[:, ["exp", "total_xlinks", "total_xlinksites"]].to_csv( + xlinks_metrics_df.loc[:, ["exp", "number_of_cDNAs", "total_positions_of_crosslinks"]].to_csv( "xlinks_counts.tsv", sep="\t", index=False ) xlinks_metrics_df.loc[:, ["exp", "ratio"]].to_csv("xlinks_ratio.tsv", sep="\t", index=False) @@ -176,11 +178,74 @@ def read_bed(filename): print(xlinks_metrics_df) print("\n\n") + # ========== + # Summary cDNA + # ========== + summary_type_files = sorted(["summary_type/" + f for f in os.listdir("summary_type") if f.endswith(".tsv")]) + #summary_data = dict((key, []) for key in ["exp", "Type", "Length", "Total_number_cDNA", "perc_number_cDNA"]) + #summary_data = dict((key, []) for key in ["exp", "Type", "perc_number_cDNA"]) + summary_data = [] + + for summary_type_file in summary_type_files: + + exp = re.sub(".summary_type_premapadjusted.tsv", "", os.path.basename(summary_type_file)) + df = pd.read_csv(summary_type_file, sep='\t') + + + exp_data = {'exp': exp} + + # Add percentage for each RNA type + for _, row in df.iterrows(): + rna_type = row['Type'] + exp_data[rna_type] = row['cDNA %'] + + summary_data.append(exp_data) + + + summary_df = pd.DataFrame(summary_data) + desired_columns = ['exp', 'CDS', 'ncRNA', 'intergenic', 'premapped RNA'] + existing_cols = [col for col in desired_columns if col in summary_df.columns] + summary_df = summary_df[existing_cols] + summary_df.to_csv("summary_type_metrics.tsv", sep="\t", index=False) + + + # ========== + # Summary subtype cDNA + # ========== + + summary_subtype_files = sorted(["summary_subtype/" + f for f in os.listdir("summary_subtype") if f.endswith(".tsv")]) + summary_subtypedata = [] + + for summary_subtype_file in summary_subtype_files: + + exp = re.sub(".summary_subtype_premapadjusted.tsv", "", os.path.basename(summary_subtype_file)) + df = pd.read_csv(summary_subtype_file, sep='\t') + + + exp_data = {'exp': exp} + + # Add percentage for each RNA type + for _, row in df.iterrows(): + rna_type = row['Subtype'] + exp_data[rna_type] = row['cDNA %'] + + summary_subtypedata.append(exp_data) + + + summary_subtype_df = pd.DataFrame(summary_subtypedata) + desired_columns = ['exp', 'CDS mRNA','ncRNA rRNA','ncRNA snRNA','ncRNA snoRNA','ncRNA tRNA','ncRNA ncRNA', 'intergenic', 'premapped RNA'] + existing_cols = [col for col in desired_columns if col in summary_subtype_df.columns] + summary_subtype_df = summary_subtype_df[existing_cols] + summary_subtype_df.to_csv("summary_subtype_metrics.tsv", sep="\t", index=False) + + + + # ========== # Peaks # ========== - peakcallers = ["icount", "paraclu", "clippy"] + peakcallers = ["icount", "paraclu", "clippy","pureclip"] def get_peaks_metrics(peakcaller): peak_files = sorted([peakcaller + "/" + f for f in os.listdir(peakcaller) if f.endswith(".bed")]) @@ -209,13 +274,15 @@ def get_peaks_metrics(peakcaller): if peakcaller == "icount": exp = re.sub(".peaks.bed", "", os.path.basename(peak_file)) elif peakcaller == "paraclu": - exp = re.sub(".peaks.bed", "", os.path.basename(peak_file)) + exp = re.sub(".genome.peaks.clustered.simplified.bed", "", os.path.basename(peak_file)) elif peakcaller == "clippy": exp = re.sub( - "_rollmean.+_stdev.+_minGeneCount.+.bed", + ".genome.peaks_rollmean10_minHeightAdjust1.0_minPromAdjust1.0_minGeneCount5_Peaks.bed", "", os.path.basename(peak_file), ) + elif peakcaller == "pureclip": + exp = re.sub("_pureclip.peaks.bed", "", os.path.basename(peak_file)) xlinks_df = read_bed(xlinks_files[0]) expanded_xlinks_df = xlinks_df.loc[xlinks_df.index.repeat(xlinks_df.score)].reset_index(drop=True) @@ -287,6 +354,10 @@ def get_peaks_metrics(peakcaller): print(peaks_xlinksite_coverage_percent_df) print("\n\n") + + + + if __name__ == "__main__": # Allows switching between nextflow templating and standalone python running using arguments diff --git a/nextflow.config b/nextflow.config index bba0156e..fed474d2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -83,6 +83,7 @@ params { // Peak calling peakcaller = 'clippy,icount,paraclu,pureclip' + //peakcaller = 'clippy,paraclu,pureclip' clippy_params = '' icount_sigxl_params = '' icount_peaks_params = '' diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf index 7822b4a6..3f09ed36 100644 --- a/workflows/clipseq.nf +++ b/workflows/clipseq.nf @@ -268,6 +268,10 @@ workflow CLIPSEQ { ch_genome_index = PREPARE_GENOME.out.genome_index ch_ncrna_genome_index = PREPARE_GENOME.out.ncrna_index } + + //ch_ncrna_genome_index.view() + ch_ncrna_genome_index.collect().view() + // // SUBWORKFLOW: Read in samplesheet, validate, stage input files and merge replicates @@ -285,7 +289,7 @@ workflow CLIPSEQ { // ch_fastq | view // - // SUBWORKFLOW: Extract UMI, trim and run b4 and after fastqc + // SUBWORKFLOW: Extract UMI, trim and run before and after fastqc // if(params.source == "fastq" & params.run_preprocessing) { FASTQ_FASTQC_UMITOOLS_TRIMGALORE ( @@ -384,7 +388,7 @@ workflow CLIPSEQ { ch_versions = ch_versions.mix(GENOME_UNIQUE_DEDUP.out.versions) ch_genome_unique_dedupe_bam = GENOME_UNIQUE_DEDUP.out.bam ch_genome_unique_dedupe_bai = GENOME_UNIQUE_DEDUP.out.bai - //ch_umi_log = GENOME_UNIQUE_DEDUP.out.umi_log + ch_umi_log = GENOME_UNIQUE_DEDUP.out.umi_log GENOME_MULTI_DEDUP ( ch_genome_multi_bam_bai @@ -448,6 +452,10 @@ workflow CLIPSEQ { ch_ncrna_k1_crosslink_group_resolved_bed ) ch_versions = ch_versions.mix(MERGE_SUMMARY.out.versions) + ch_merged_summary_type = MERGE_SUMMARY.out.summary_type_adjusted + ch_merged_summary_subtype = MERGE_SUMMARY.out.summary_subtype_adjusted + ch_merged_summary_gene = MERGE_SUMMARY.out.summary_gene_adjusted + ICOUNTMINI_METAGENE ( ch_genome_crosslink_group_resolved_bed, @@ -500,6 +508,15 @@ workflow CLIPSEQ { ch_icountmini_sigxls = Channel.empty() ch_paraclu_genome_peaks = Channel.empty() + // + // PEKA CHANNELS + // + ch_peka_icountmini_peaks = Channel.empty() + ch_peka_clippy_peaks = Channel.empty() + ch_peka_paraclu_peaks = Channel.empty() + ch_peka_pureclip_peaks = Channel.empty() + + if(params.run_peakcalling) { if('clippy' in callers) { @@ -763,25 +780,35 @@ workflow CLIPSEQ { } if(params.run_reporting) { - // + // MODULE: Collect software versions - // - DUMP_SOFTWARE_VERSIONS ( - ch_versions.unique().collectFile() + + // DUMP_SOFTWARE_VERSIONS ( + // ch_versions.unique().collectFile() + // ) + + + // MODULE: Run clipqc + + CLIPQC ( + ch_ncrna_log.collect{ it[1] }, + ch_genome_log.collect{ it[1] }, + ch_umi_log.collect{ it[1] }, + ch_genome_crosslink_group_resolved_bed.collect{ it[1] }, + ch_icountmini_peaks.collect{ it[1] }, + ch_paraclu_genome_peaks.collect {it[1]}, + ch_clippy_genome_peaks.collect {it[1]}, + ch_pureclip_genome_peaks.collect {it[1] }, + ch_merged_summary_type.collect{it[1]}, + ch_merged_summary_subtype.collect{it[1]}, + ch_merged_summary_gene.collect{it[1]}, + + //ICOUNT_ANALYSE.out.bed_peaks.map{ it[1] }, + //PARACLU_ANALYSE_GENOME.out.peaks.map{ it[1] }, + //CLIPPY_GENOME.out.peaks.map{ it[1] } ) - // - // MODULE: Run clipqc - // - // CLIPSEQ_CLIPQC ( - // ch_bt_log.map{ it[1] }, - // ch_star_log.map{ it[1] }, - // ch_umi_log.map{ it[1] }, - // ch_genome_crosslink_bed.map{ it[1] }, - // ICOUNT_ANALYSE.out.bed_peaks.map{ it[1] }, - // PARACLU_ANALYSE_GENOME.out.peaks.map{ it[1] }, - // CLIPPY_GENOME.out.peaks.map{ it[1] } - // ) + //CLIPQC.out.txt.view() // // MODULE: Run multiqc @@ -795,14 +822,20 @@ workflow CLIPSEQ { ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_unique_yml.collect()) + //ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_yml.collect()) + //ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_unique_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_ncrna_log.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_genome_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CLIPQC.out.tsv.collect().ifEmpty([])) + + //ch_multiqc_files.collect().view() + ch_tmp = ch_ncrna_log.collect{it[1]} + ch_tmp.view() + //ch_genome_log.collect().view() MULTIQC ( ch_multiqc_files.collect(), From d5672b095bff34ef841201eda1b85e32f31b13bd Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 21 Mar 2025 15:20:54 +0000 Subject: [PATCH 02/11] linting --- modules/local/clipqc/templates/clipqc.py | 93 ++++++++++++------------ nextflow.config | 1 - 2 files changed, 46 insertions(+), 48 deletions(-) diff --git a/modules/local/clipqc/templates/clipqc.py b/modules/local/clipqc/templates/clipqc.py index 41d1b7b6..122f714c 100644 --- a/modules/local/clipqc/templates/clipqc.py +++ b/modules/local/clipqc/templates/clipqc.py @@ -51,7 +51,6 @@ def main(process_name): ncrna_df = pd.DataFrame(ncrna) print(ncrna_df) - # Next get STAR logs star_logs = sorted(["mapped/" + f for f in os.listdir("mapped") if f.endswith(".Log.final.out")]) @@ -182,70 +181,74 @@ def read_bed(filename): # Summary cDNA # ========== summary_type_files = sorted(["summary_type/" + f for f in os.listdir("summary_type") if f.endswith(".tsv")]) - #summary_data = dict((key, []) for key in ["exp", "Type", "Length", "Total_number_cDNA", "perc_number_cDNA"]) - #summary_data = dict((key, []) for key in ["exp", "Type", "perc_number_cDNA"]) + # summary_data = dict((key, []) for key in ["exp", "Type", "Length", "Total_number_cDNA", "perc_number_cDNA"]) + # summary_data = dict((key, []) for key in ["exp", "Type", "perc_number_cDNA"]) summary_data = [] for summary_type_file in summary_type_files: - - exp = re.sub(".summary_type_premapadjusted.tsv", "", os.path.basename(summary_type_file)) - df = pd.read_csv(summary_type_file, sep='\t') - - - exp_data = {'exp': exp} - - # Add percentage for each RNA type - for _, row in df.iterrows(): - rna_type = row['Type'] - exp_data[rna_type] = row['cDNA %'] - - summary_data.append(exp_data) - - - summary_df = pd.DataFrame(summary_data) - desired_columns = ['exp', 'CDS', 'ncRNA', 'intergenic', 'premapped RNA'] + + exp = re.sub(".summary_type_premapadjusted.tsv", "", os.path.basename(summary_type_file)) + df = pd.read_csv(summary_type_file, sep="\t") + + exp_data = {"exp": exp} + + # Add percentage for each RNA type + for _, row in df.iterrows(): + rna_type = row["Type"] + exp_data[rna_type] = row["cDNA %"] + + summary_data.append(exp_data) + + summary_df = pd.DataFrame(summary_data) + desired_columns = ["exp", "CDS", "ncRNA", "intergenic", "premapped RNA"] existing_cols = [col for col in desired_columns if col in summary_df.columns] summary_df = summary_df[existing_cols] summary_df.to_csv("summary_type_metrics.tsv", sep="\t", index=False) - # ========== # Summary subtype cDNA # ========== - summary_subtype_files = sorted(["summary_subtype/" + f for f in os.listdir("summary_subtype") if f.endswith(".tsv")]) + summary_subtype_files = sorted( + ["summary_subtype/" + f for f in os.listdir("summary_subtype") if f.endswith(".tsv")] + ) summary_subtypedata = [] for summary_subtype_file in summary_subtype_files: - - exp = re.sub(".summary_subtype_premapadjusted.tsv", "", os.path.basename(summary_subtype_file)) - df = pd.read_csv(summary_subtype_file, sep='\t') - - - exp_data = {'exp': exp} - - # Add percentage for each RNA type - for _, row in df.iterrows(): - rna_type = row['Subtype'] - exp_data[rna_type] = row['cDNA %'] - - summary_subtypedata.append(exp_data) - - - summary_subtype_df = pd.DataFrame(summary_subtypedata) - desired_columns = ['exp', 'CDS mRNA','ncRNA rRNA','ncRNA snRNA','ncRNA snoRNA','ncRNA tRNA','ncRNA ncRNA', 'intergenic', 'premapped RNA'] + + exp = re.sub(".summary_subtype_premapadjusted.tsv", "", os.path.basename(summary_subtype_file)) + df = pd.read_csv(summary_subtype_file, sep="\t") + + exp_data = {"exp": exp} + + # Add percentage for each RNA type + for _, row in df.iterrows(): + rna_type = row["Subtype"] + exp_data[rna_type] = row["cDNA %"] + + summary_subtypedata.append(exp_data) + + summary_subtype_df = pd.DataFrame(summary_subtypedata) + desired_columns = [ + "exp", + "CDS mRNA", + "ncRNA rRNA", + "ncRNA snRNA", + "ncRNA snoRNA", + "ncRNA tRNA", + "ncRNA ncRNA", + "intergenic", + "premapped RNA", + ] existing_cols = [col for col in desired_columns if col in summary_subtype_df.columns] summary_subtype_df = summary_subtype_df[existing_cols] summary_subtype_df.to_csv("summary_subtype_metrics.tsv", sep="\t", index=False) - - - # ========== # Peaks # ========== - peakcallers = ["icount", "paraclu", "clippy","pureclip"] + peakcallers = ["icount", "paraclu", "clippy", "pureclip"] def get_peaks_metrics(peakcaller): peak_files = sorted([peakcaller + "/" + f for f in os.listdir(peakcaller) if f.endswith(".bed")]) @@ -354,10 +357,6 @@ def get_peaks_metrics(peakcaller): print(peaks_xlinksite_coverage_percent_df) print("\n\n") - - - - if __name__ == "__main__": # Allows switching between nextflow templating and standalone python running using arguments diff --git a/nextflow.config b/nextflow.config index be58ebf1..9ad1278a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -83,7 +83,6 @@ params { // Peak calling peakcaller = 'clippy,icount,paraclu,pureclip' - //peakcaller = 'clippy,paraclu,pureclip' clippy_params = '' icount_sigxl_params = '' icount_peaks_params = '' From 264c963d60e4bf66351c149a8158c0fa3515957e Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 21 Mar 2025 15:25:02 +0000 Subject: [PATCH 03/11] linting --- assets/multiqc_config.yml | 372 ++++++++++++++++++-------------------- 1 file changed, 171 insertions(+), 201 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index e6d43047..617e0125 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -42,219 +42,189 @@ module_order: - star: name: "STAR Genome Alignment" path_filters: - - "*.final.out" + - "*.final.out" - custom_content - # Other MultiQC config stuff here custom_data: - mapping: - parent_id: mapping - parent_name: 'Mapping' - file_format: 'tsv' - section_name: 'Mapping' - description: 'The mapping metrics for each experiment' - plot_type: 'bargraph' - dedup_reads: - parent_id: dedup - parent_name: 'Deduplication' - file_format: 'tsv' - section_name: 'Reads' - description: 'The number of reads before and after PCR deduplication for each experiment' - plot_type: 'bargraph' - pconfig: - ylab: 'Count' - #stacking: False - cpswitch: False - tt_percentages: False - dedup_ratio: - parent_id: dedup - parent_name: 'Deduplication' - file_format: 'tsv' - section_name: 'Ratio' - description: 'The PCR deduplication ratio for each experiment' - plot_type: 'bargraph' - pconfig: - ylab: 'Ratio' - #stacking: False - cpswitch: False - tt_percentages: False - dedup_mean_umis: - parent_id: dedup - parent_name: 'Deduplication' - file_format: 'tsv' - section_name: 'Mean UMIs' - description: 'Mean number of unique UMIs per position for each experiment' - plot_type: 'bargraph' - pconfig: - ylab: 'Mean number' - #stacking: False - cpswitch: False - tt_percentages: False - crosslinks_counts: - parent_id: crosslinks - parent_name: 'Crosslinks' - file_format: 'tsv' - section_name: 'Counts' - description: 'The number of crosslinks or crosslink sites for each experiment' - plot_type: 'bargraph' - pconfig: - ylab: 'Count' - #stacking: False - cpswitch: False - tt_percentages: False - crosslinks_ratio: - parent_id: crosslinks - parent_name: 'Crosslinks' - file_format: 'tsv' - section_name: 'Ratios' - description: 'The ratio of number of cDNA mapping to crosslink positions for each experiment' - #plot_type: 'bargraph' - pconfig: - ylab: 'Count' - #stacking: False - cpswitch: False - tt_percentages: False - tt_decimals: 2 - peaks_counts: - parent_id: peaks - parent_name: 'Peaks' - file_format: 'tsv' - section_name: 'Counts' - description: 'The total number of peaks called by each peak caller' - plot_type: 'bargraph' - pconfig: - ylab: 'Number of peaks' - #stacking: False - cpswitch: False - tt_percentages: False - xlinks_in_peaks: - parent_id: peaks - parent_name: 'Peaks' - file_format: 'tsv' - section_name: 'Crosslinks positions in peaks' - description: 'The total percentage of crosslinks within peaks for each peak caller' - #plot_type: 'bargraph' - pconfig: - ylab: 'Percentage of crosslinks' - #stacking: False - cpswitch: False - tt_percentages: False - tt_decimals: 2 - tt_suffix: '%' - xlinksites_in_peaks: - parent_id: peaks - parent_name: 'Peaks' - file_format: 'tsv' - section_name: 'Crosslinks positions in peaks' - description: 'The total percentage of crosslink sites within peaks for each peak caller' - #plot_type: 'bargraph' - pconfig: - ylab: 'Percentage of crosslink sites' - #stacking: False - cpswitch: False - tt_percentages: False - tt_decimals: 2 - tt_suffix: '%' - peaks_xlinksite_coverage: - parent_id: peaks - parent_name: 'Peaks' - file_format: 'tsv' - section_name: 'Peak-crosslink coverage' - description: 'The total percentage of nucleotides within peaks covered by a crosslink site' - plot_type: 'bargraph' - pconfig: - ylab: 'Percentage of nucleotides within peaks' - #stacking: False - cpswitch: False - tt_percentages: False - tt_decimals: 2 - tt_suffix: '%' - summary_type: - parent_id: Summary - parent_name: 'Summary' - file_format: 'tsv' - section_name: 'Percentage of cDNA premap' - description: 'The total percentage of cDNA summary mapped' - #plot_type: 'bargraph' - pconfig: - ylab: 'Type' - #stacking: False - cpswitch: False - tt_percentages: False - summary_subtype: - parent_id: Summary - parent_name: 'Summary' - file_format: 'tsv' - section_name: 'Percentage of cDNA premap subtypes' - description: 'The total percentage of cDNA subtypes mapped' - #plot_type: 'bargraph' - pconfig: - ylab: 'Type' - #stacking: False - cpswitch: False - tt_percentages: False + mapping: + parent_id: mapping + parent_name: "Mapping" + file_format: "tsv" + section_name: "Mapping" + description: "The mapping metrics for each experiment" + plot_type: "bargraph" + dedup_reads: + parent_id: dedup + parent_name: "Deduplication" + file_format: "tsv" + section_name: "Reads" + description: "The number of reads before and after PCR deduplication for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Count" + #stacking: False + cpswitch: False + tt_percentages: False + dedup_ratio: + parent_id: dedup + parent_name: "Deduplication" + file_format: "tsv" + section_name: "Ratio" + description: "The PCR deduplication ratio for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Ratio" + #stacking: False + cpswitch: False + tt_percentages: False + dedup_mean_umis: + parent_id: dedup + parent_name: "Deduplication" + file_format: "tsv" + section_name: "Mean UMIs" + description: "Mean number of unique UMIs per position for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Mean number" + #stacking: False + cpswitch: False + tt_percentages: False + crosslinks_counts: + parent_id: crosslinks + parent_name: "Crosslinks" + file_format: "tsv" + section_name: "Counts" + description: "The number of crosslinks or crosslink sites for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Count" + #stacking: False + cpswitch: False + tt_percentages: False + crosslinks_ratio: + parent_id: crosslinks + parent_name: "Crosslinks" + file_format: "tsv" + section_name: "Ratios" + description: "The ratio of number of cDNA mapping to crosslink positions for each experiment" + #plot_type: 'bargraph' + pconfig: + ylab: "Count" + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + peaks_counts: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Counts" + description: "The total number of peaks called by each peak caller" + plot_type: "bargraph" + pconfig: + ylab: "Number of peaks" + #stacking: False + cpswitch: False + tt_percentages: False + xlinks_in_peaks: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Crosslinks positions in peaks" + description: "The total percentage of crosslinks within peaks for each peak caller" + #plot_type: 'bargraph' + pconfig: + ylab: "Percentage of crosslinks" + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: "%" + xlinksites_in_peaks: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Crosslinks positions in peaks" + description: "The total percentage of crosslink sites within peaks for each peak caller" + #plot_type: 'bargraph' + pconfig: + ylab: "Percentage of crosslink sites" + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: "%" + peaks_xlinksite_coverage: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Peak-crosslink coverage" + description: "The total percentage of nucleotides within peaks covered by a crosslink site" + plot_type: "bargraph" + pconfig: + ylab: "Percentage of nucleotides within peaks" + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: "%" + summary_type: + parent_id: Summary + parent_name: "Summary" + file_format: "tsv" + section_name: "Percentage of cDNA premap" + description: "The total percentage of cDNA summary mapped" + #plot_type: 'bargraph' + pconfig: + ylab: "Type" + #stacking: False + cpswitch: False + tt_percentages: False + summary_subtype: + parent_id: Summary + parent_name: "Summary" + file_format: "tsv" + section_name: "Percentage of cDNA premap subtypes" + description: "The total percentage of cDNA subtypes mapped" + #plot_type: 'bargraph' + pconfig: + ylab: "Type" + #stacking: False + cpswitch: False + tt_percentages: False - sp: - mapping: - fn: 'mapping.tsv' - dedup_reads: - fn: 'dedup_reads.tsv' - dedup_ratio: - fn: 'dedup_ratio.tsv' - dedup_mean_umis: - fn: 'dedup_mean_umis.tsv' - crosslinks_counts: - fn: 'xlinks_counts.tsv' - crosslinks_ratio: - fn: 'xlinks_ratio.tsv' - peaks: - fn: 'total_peaks.tsv' - xlinks_in_peaks: - fn: 'xlinks_in_peaks.tsv' - xlinksites_in_peaks: - fn: 'xlinksites_in_peaks.tsv' - peaks_xlinksite_coverage: - fn: 'peaks_xlinksite_coverage.tsv' - summary_type: - fn: 'summary_type_metrics.tsv' - summary_subtype: - fn: '"summary_subtype_metrics.tsv' - - -sp: - mapping: - fn: 'mapping.tsv' - dedup_reads: - fn: 'dedup_reads.tsv' - dedup_ratio: - fn: 'dedup_ratio.tsv' - dedup_mean_umis: - fn: 'dedup_mean_umis.tsv' - crosslinks_counts: - fn: 'xlinks_counts.tsv' - crosslinks_ratio: - fn: 'xlinks_ratio.tsv' - peaks: - fn: 'total_peaks.tsv' - xlinks_in_peaks: - fn: 'xlinks_in_peaks.tsv' - xlinksites_in_peaks: - fn: 'xlinksites_in_peaks.tsv' - peaks_xlinksite_coverage: - fn: 'peaks_xlinksite_coverage.tsv' - summary_type: - fn: 'summary_type_metrics.tsv' - summary_subtype: - fn: 'summary_subtype_metrics.tsv' + mapping: + fn: "mapping.tsv" + dedup_reads: + fn: "dedup_reads.tsv" + dedup_ratio: + fn: "dedup_ratio.tsv" + dedup_mean_umis: + fn: "dedup_mean_umis.tsv" + crosslinks_counts: + fn: "xlinks_counts.tsv" + crosslinks_ratio: + fn: "xlinks_ratio.tsv" + peaks: + fn: "total_peaks.tsv" + xlinks_in_peaks: + fn: "xlinks_in_peaks.tsv" + xlinksites_in_peaks: + fn: "xlinksites_in_peaks.tsv" + peaks_xlinksite_coverage: + fn: "peaks_xlinksite_coverage.tsv" + summary_type: + fn: "summary_type_metrics.tsv" + summary_subtype: + fn: '"summary_subtype_metrics.tsv' custom_content: order: - clipqc - software-versions-by-process - software-versions-unique - # Customise the module search patterns to speed up execution time # sp: # samtools/stats: From d6ee604b2979ae83c5f96cdbd9ae6d0d16d65125 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 21 Mar 2025 15:30:02 +0000 Subject: [PATCH 04/11] remove view() --- workflows/clipseq.nf | 8 -------- 1 file changed, 8 deletions(-) diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf index 3f09ed36..d3f63760 100644 --- a/workflows/clipseq.nf +++ b/workflows/clipseq.nf @@ -268,9 +268,6 @@ workflow CLIPSEQ { ch_genome_index = PREPARE_GENOME.out.genome_index ch_ncrna_genome_index = PREPARE_GENOME.out.ncrna_index } - - //ch_ncrna_genome_index.view() - ch_ncrna_genome_index.collect().view() // @@ -808,8 +805,6 @@ workflow CLIPSEQ { //CLIPPY_GENOME.out.peaks.map{ it[1] } ) - //CLIPQC.out.txt.view() - // // MODULE: Run multiqc // @@ -832,10 +827,7 @@ workflow CLIPSEQ { ch_multiqc_files = ch_multiqc_files.mix(ch_genome_log.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(CLIPQC.out.tsv.collect().ifEmpty([])) - //ch_multiqc_files.collect().view() ch_tmp = ch_ncrna_log.collect{it[1]} - ch_tmp.view() - //ch_genome_log.collect().view() MULTIQC ( ch_multiqc_files.collect(), From 336066414888004b5f490e53f2f6622d70edd83c Mon Sep 17 00:00:00 2001 From: Chhabria Date: Thu, 12 Jun 2025 15:45:49 -0400 Subject: [PATCH 05/11] deseq2 modules --- conf/modules.config | 1 + conf/test.config | 38 +++++++++---- modules.json | 5 ++ modules/local/consensus_count_table/main.nf | 2 +- subworkflows/local/consensus_peak_table.nf | 61 ++++++++++++++++++++- subworkflows/local/prepare_genome.nf | 4 +- subworkflows/local/rna_align.nf | 2 +- workflows/clipseq.nf | 37 +++++++++---- 8 files changed, 121 insertions(+), 29 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a787b7c6..756b7eae 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -757,6 +757,7 @@ if(params.run_crosslinking) { if(params.run_peakcalling && params.consensus_peak){ process { withName: 'NFCORE_CLIPSEQ:CLIPSEQ:.*CONSENSUS_PEAK_TABLE:CONSENSUS_MAP' { + ext.prefix = { "${meta.id}_consensus_sorted" } publishDir = [ path: { "${params.outdir}/05_peakcalling/consensus_peak_tables" }, mode: "${params.publish_dir_mode}", diff --git a/conf/test.config b/conf/test.config index ff4270d2..8e44ea3e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,21 +15,15 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '8.GB' - max_time = '6.h' + max_cpus = 8 + max_memory = '60.GB' + max_time = '24.h' - // Input data + // Inputs for testing dataset with yeast genome //input = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv' - input = "./tests/test_new_samplesheet_FASTQ.csv" - fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" - gtf = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz" - ncrna_fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.smrna.fasta" - source = "fastq" - // Genome references - //fasta = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz' - //gtf = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz' + //input = '../tests/test_new_samplesheet_FASTQ.csv' + //source = "fastq" // Genome references //fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz" @@ -52,6 +46,20 @@ params { //seg_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOthertrue.resolved.gtf" //regions_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOthertrue.resolved.gtf" + + // Input data for full human testing + input = "./tests/test_new_samplesheet_FASTQ_human.csv" + fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" + gtf = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz" + ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz" + source = "fastq" + + // Genome references from s3 bucket + //fasta = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz' + //gtf = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz' + + + // Logic debug = true save_reference = true @@ -62,10 +70,16 @@ params { save_align_intermed = true skip_transcriptome = true + // Inputs for deseq2_qc + skip_deseq2_qc = false + + // Pipeline params umitools_bc_pattern = 'NNNNNNNNN' // Don't call consensus //consensus_peak = false + + } diff --git a/modules.json b/modules.json index 00019c28..ad826ef1 100644 --- a/modules.json +++ b/modules.json @@ -70,6 +70,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "deseq2/differential": { + "branch": "master", + "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/local/consensus_count_table/main.nf b/modules/local/consensus_count_table/main.nf index 803f5af4..0d5e5b24 100644 --- a/modules/local/consensus_count_table/main.nf +++ b/modules/local/consensus_count_table/main.nf @@ -13,7 +13,7 @@ process GET_CONSENSUS_COUNTS { val(output_name) output: - path "*.tsv" , emit: tsv + tuple val(meta), path ("*.tsv") , emit: tsv path "versions.yml" , emit: versions when: diff --git a/subworkflows/local/consensus_peak_table.nf b/subworkflows/local/consensus_peak_table.nf index 20d4b485..86d44dd1 100644 --- a/subworkflows/local/consensus_peak_table.nf +++ b/subworkflows/local/consensus_peak_table.nf @@ -6,6 +6,8 @@ // MODULES // include { BEDTOOLS_MAP as CONSENSUS_MAP } from '../../modules/nf-core/bedtools/map/main' +include { BEDTOOLS_SORT as CONSENSUS_SORT } from '../../modules/nf-core/bedtools/sort/main' +include { DESEQ2_QC } from '../../modules/local/deseq2_qc' // // SUBWORKFLOWS // @@ -16,23 +18,35 @@ workflow CONSENSUS_PEAK_TABLE { take: all_crosslinks // channel: [ val(meta), [ xl.bed ] ] consensus_peaks // channel: [ val(meta), [ peaks.bed ] ] + genome_fasta genome_fai // channel: [ val(meta), [ index ] ] gtf // channel: [ val(meta), [ gtf ] ] table_name // val "Clippy_Consensus_Counts.tsv" must end in tsv + ch_deseq2_pca_header_multiqc + ch_deseq2_clustering_header_multiqc + skip_deseq2_qc main: ch_versions = Channel.empty() + + CONSENSUS_SORT ( all_crosslinks , []) + ch_consensus_sorted = CONSENSUS_SORT.out.sorted + + consensus_peaks - .combine(all_crosslinks) + .combine(ch_consensus_sorted) .map{ meta1, consensuspeaks, meta2, crosslink -> [meta2, consensuspeaks, crosslink] } .set { ch_consensus_map } + //ch_consensus_map.view { item -> "consensus for bedtools map: $item" } + ch_consensus_map.view() + CONSENSUS_MAP ( ch_consensus_map, - genome_fai + genome_fai, ) CONSENSUS_MAP.out.mapped @@ -46,7 +60,48 @@ workflow CONSENSUS_PEAK_TABLE { table_name ) + // + // Generate QC plots with DESeq2 + // + ch_deseq2_qc_pdf = Channel.empty() + ch_deseq2_qc_rdata = Channel.empty() + ch_deseq2_qc_rds = Channel.empty() + ch_deseq2_qc_pca_txt = Channel.empty() + ch_deseq2_qc_pca_multiqc = Channel.empty() + ch_deseq2_qc_dists_txt = Channel.empty() + ch_deseq2_qc_dists_multiqc = Channel.empty() + ch_deseq2_qc_log = Channel.empty() + ch_deseq2_qc_size_factors = Channel.empty() + if (!skip_deseq2_qc) { + DESEQ2_QC ( + GET_CONSENSUS_COUNTS.out.tsv, + ch_deseq2_pca_header_multiqc, + ch_deseq2_clustering_header_multiqc + ) + ch_deseq2_qc_pdf = DESEQ2_QC.out.pdf + ch_deseq2_qc_rdata = DESEQ2_QC.out.rdata + ch_deseq2_qc_rds = DESEQ2_QC.out.rds + ch_deseq2_qc_pca_txt = DESEQ2_QC.out.pca_txt + ch_deseq2_qc_pca_multiqc = DESEQ2_QC.out.pca_multiqc + ch_deseq2_qc_dists_txt = DESEQ2_QC.out.dists_txt + ch_deseq2_qc_dists_multiqc = DESEQ2_QC.out.dists_multiqc + ch_deseq2_qc_log = DESEQ2_QC.out.log + ch_deseq2_qc_size_factors = DESEQ2_QC.out.size_factors + ch_versions = ch_versions.mix(DESEQ2_QC.out.versions) + } + emit: - mapped_table = GET_CONSENSUS_COUNTS.out.tsv // channel: [ val(meta), [ tsv ] ] + mapped_table = GET_CONSENSUS_COUNTS.out.tsv // channel: [ val(meta), [ tsv ] ] + + + deseq2_qc_pdf = ch_deseq2_qc_pdf // channel: [ pdf ] + deseq2_qc_rdata = ch_deseq2_qc_rdata // channel: [ rdata ] + deseq2_qc_rds = ch_deseq2_qc_rds // channel: [ rds ] + deseq2_qc_pca_txt = ch_deseq2_qc_pca_txt // channel: [ txt ] + deseq2_qc_pca_multiqc = ch_deseq2_qc_pca_multiqc // channel: [ txt ] + deseq2_qc_dists_txt = ch_deseq2_qc_dists_txt // channel: [ txt ] + deseq2_qc_dists_multiqc = ch_deseq2_qc_dists_multiqc // channel: [ txt ] + deseq2_qc_log = ch_deseq2_qc_log // channel: [ txt ] + deseq2_qc_size_factors = ch_deseq2_qc_size_factors // channel: [ txt ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index d5f8bf90..282aa37c 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -104,7 +104,7 @@ workflow PREPARE_GENOME { } } else { - ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map{it[1]}, ch_gtf.map{it[1]} ).index + ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf ).index ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) } @@ -121,7 +121,7 @@ workflow PREPARE_GENOME { } } else { - ch_bt_index = BOWTIE_BUILD ( ch_ncrna_fasta.map{it[1]} ).index + ch_bt_index = BOWTIE_BUILD ( ch_ncrna_fasta).index ch_versions = ch_versions.mix(BOWTIE_BUILD.out.versions) } diff --git a/subworkflows/local/rna_align.nf b/subworkflows/local/rna_align.nf index a8d82e83..aaffbc08 100644 --- a/subworkflows/local/rna_align.nf +++ b/subworkflows/local/rna_align.nf @@ -230,4 +230,4 @@ workflow RNA_ALIGN { transcript_multi_bai = ch_transcript_multi_bai // channel: [ val(meta), [ bai ] ] versions = ch_versions // channel: [ versions.yml ] -} +} \ No newline at end of file diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf index 3f09ed36..b77d4c93 100644 --- a/workflows/clipseq.nf +++ b/workflows/clipseq.nf @@ -60,6 +60,10 @@ if ((caller_list + callers).unique().size() != caller_list.size()) { ch_dummy_file = file("$projectDir/assets/dummy_file.txt", checkIfExists: true) ch_dummy_file2 = file("$projectDir/assets/dummy_file2.txt", checkIfExists: true) +// deseq2 header files: +ch_multiqc_merged_replicate_deseq2_pca_header = file("$projectDir/assets/merged_replicate_deseq2_pca_header.txt", checkIfExists: true) +ch_multiqc_merged_replicate_deseq2_clustering_header = file("$projectDir/assets/merged_replicate_deseq2_clustering_header.txt", checkIfExists: true) + // // Check if an AWS iGenome has been provided to use the appropriate version of STAR // def is_aws_igenome = false // if (params.fasta && params.gtf) { @@ -442,7 +446,7 @@ workflow CLIPSEQ { ICOUNTMINI_SUMMARY ( ch_genome_crosslink_group_resolved_bed, - ch_regions_resolved_gtf.collect{ it[1] } + ch_regions_resolved_gtf ) MERGE_SUMMARY ( @@ -459,7 +463,7 @@ workflow CLIPSEQ { ICOUNTMINI_METAGENE ( ch_genome_crosslink_group_resolved_bed, - ch_regions_resolved_gtf.collect{ it[1] } + ch_regions_resolved_gtf ) if(params.consensus_peak){ @@ -539,9 +543,13 @@ workflow CLIPSEQ { CLIPPY_CONSENSUS_PEAK_TABLE ( ch_all_crosslinks, CLIPPY_GENOME_CONSENSUS.out.peaks, + ch_fasta, ch_fasta_fai, - ch_regions_resolved_gtf, - "Clippy_Consensus_AllCounts.tsv" + ch_regions_resolved_gtf.collect{ it[1] }, + "Clippy_Consensus_AllCounts.tsv", + ch_multiqc_merged_replicate_deseq2_pca_header, + ch_multiqc_merged_replicate_deseq2_clustering_header, + params.skip_deseq2_qc ) ch_versions = ch_versions.mix(CLIPPY_CONSENSUS_PEAK_TABLE.out.versions) } @@ -563,7 +571,7 @@ workflow CLIPSEQ { ICOUNTMINI_SIGXLS ( ch_genome_crosslink_group_resolved_bed, - ch_seg_resolved_gtf.collect{ it[1]} + ch_seg_resolved_gtf ) @@ -601,7 +609,7 @@ workflow CLIPSEQ { if(params.consensus_peak){ CONSENSUS_ICOUNTMINI_SIGXLS ( ch_consensus_crosslinks_final_bed, - ch_seg_resolved_gtf.collect{ it[1]} + ch_seg_resolved_gtf ) // CHANNEL: Create combined channel of input crosslinks and sigxls ch_consensus_peaks_input = ch_consensus_crosslinks_final_bed @@ -622,9 +630,13 @@ workflow CLIPSEQ { ICOUNT_CONSENSUS_PEAK_TABLE ( ch_all_crosslinks, CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS.out.gunzip, + ch_fasta, ch_fasta_fai, ch_regions_resolved_gtf, - "iCount-Mini_Consensus_AllCounts.tsv" + "iCount-Mini_Consensus_AllCounts.tsv", + ch_multiqc_merged_replicate_deseq2_pca_header, + ch_multiqc_merged_replicate_deseq2_clustering_header, + params.skip_deseq2_qc ) ch_versions = ch_versions.mix(ICOUNT_CONSENSUS_PEAK_TABLE.out.versions) } @@ -635,7 +647,8 @@ workflow CLIPSEQ { ch_genome_crosslink_group_resolved_bed, ch_fasta.collect{ it[1] }, ch_fasta_fai.collect{ it[1] }, - ch_regions_resolved_gtf.collect{ it[1] } + ch_regions_resolved_gtf + //ch_regions_resolved_gtf.collect{ it[1] } ) ch_versions = ch_versions.mix(PEKA_ICOUNT.out.versions) } @@ -663,9 +676,13 @@ workflow CLIPSEQ { PARACLU_CONSENSUS_PEAK_TABLE ( ch_all_crosslinks, PARACLU_GENOME_CONSENSUS.out.bed, + ch_fasta, ch_fasta_fai, ch_regions_resolved_gtf, - "Paraclu_Consensus_AllCounts.tsv" + "Paraclu_Consensus_AllCounts.tsv", + ch_multiqc_merged_replicate_deseq2_pca_header, + ch_multiqc_merged_replicate_deseq2_clustering_header, + params.skip_deseq2_qc ) ch_versions = ch_versions.mix(PARACLU_CONSENSUS_PEAK_TABLE.out.versions) } @@ -676,7 +693,7 @@ workflow CLIPSEQ { ch_genome_crosslink_group_resolved_bed, ch_fasta.collect{ it[1] }, ch_fasta_fai.collect{ it[1] }, - ch_regions_resolved_gtf.collect{ it[1] } + ch_regions_resolved_gtf ) ch_versions = ch_versions.mix(PEKA_PARACLU.out.versions) } From 70d738d76771e562cd1aeeb3fd67e1904bcee81b Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 10 Nov 2025 15:59:39 +0000 Subject: [PATCH 06/11] revert small test config --- conf/test.config | 39 ++++------------------ tests/test_new_samplesheet_FASTQ_human.csv | 10 ------ 2 files changed, 7 insertions(+), 42 deletions(-) delete mode 100644 tests/test_new_samplesheet_FASTQ_human.csv diff --git a/conf/test.config b/conf/test.config index 618bded7..6a241ddf 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,15 +15,13 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 8 - max_memory = '60.GB' - max_time = '24.h' + max_cpus = 2 + max_memory = '8.GB' + max_time = '6.h' - // Inputs for testing dataset with yeast genome - //input = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv' - - //input = '../tests/test_new_samplesheet_FASTQ.csv' - //source = "fastq" + // Input data + input = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv' + source = "fastq" // Genome references fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz" @@ -42,20 +40,6 @@ params { regions_filt_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions.gtf.gz" regions_resolved_gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOtherfalse.resolved.gtf" - - // Input data for full human testing - input = "./tests/test_new_samplesheet_FASTQ_human.csv" - fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" - gtf = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz" - ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz" - source = "fastq" - - // Genome references from s3 bucket - //fasta = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz' - //gtf = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz' - - - // Logic debug = true save_reference = true @@ -67,16 +51,7 @@ params { skip_transcriptome = true skip_filter_gtf = false - // Inputs for deseq2_qc - skip_deseq2_qc = false - - // Pipeline params umitools_bc_pattern = 'NNNNNNNNN' - - // Don't call consensus - //consensus_peak = false - - - } + diff --git a/tests/test_new_samplesheet_FASTQ_human.csv b/tests/test_new_samplesheet_FASTQ_human.csv deleted file mode 100644 index 3ff1bafb..00000000 --- a/tests/test_new_samplesheet_FASTQ_human.csv +++ /dev/null @@ -1,10 +0,0 @@ -sample_name,group_name,input_name,fastq -TDP43_1,TDP43,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR1530360.fastq.gz -TDP43_2,TDP43,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR1530361.fastq.gz -MATR3_1,MATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2210802.fastq.gz -MATR3_2,MATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2210803.fastq.gz -PTBP1_1,PTBP1,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214592.fastq.gz -PTBP1_2,PTBP1,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214593.fastq.gz -PTBP1_siMATR3_1,PTBP1_siMATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214594.fastq.gz -PTBP1_siMATR3_2,PTBP1_siMATR3,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR2214595.fastq.gz -STAU1_1,STAU1,,s3://ngi-igenomes/test-data/clipseq/fastq/ERR605258.fastq.gz \ No newline at end of file From 008ad81c3c2e01cb3ebbd75bb1c74099ccb625fb Mon Sep 17 00:00:00 2001 From: Chhabria Date: Mon, 10 Nov 2025 11:04:54 -0500 Subject: [PATCH 07/11] Adding new files --- assets/merged_replicate_deseq2_pca_header.txt | 11 +++++++++++ tests/test_new_samplesheet_FASTQ.csv | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 assets/merged_replicate_deseq2_pca_header.txt diff --git a/assets/merged_replicate_deseq2_pca_header.txt b/assets/merged_replicate_deseq2_pca_header.txt new file mode 100644 index 00000000..a92b75d9 --- /dev/null +++ b/assets/merged_replicate_deseq2_pca_header.txt @@ -0,0 +1,11 @@ +#id: 'mrep_deseq2_pca' +#section_name: 'MERGED REP: DESeq2 PCA plot' +#description: "PCA plot of the samples in the experiment. +# These values are calculated using DESeq2 +# in the deseq2_qc.r script." +#plot_type: 'scatter' +#anchor: 'mrep_deseq2_pca' +#pconfig: +# title: 'DESeq2: Principal component plot' +# xlab: PC1 +# ylab: PC2 diff --git a/tests/test_new_samplesheet_FASTQ.csv b/tests/test_new_samplesheet_FASTQ.csv index a74b8037..98cf201b 100644 --- a/tests/test_new_samplesheet_FASTQ.csv +++ b/tests/test_new_samplesheet_FASTQ.csv @@ -2,4 +2,4 @@ sample_name,group_name,input_name,fastq PHO92_A,PHO92,HNRNPC,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz PHO92_B,PHO92,HNRNPC,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz PHO92_C,PHO92,HNRNPC,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz -HNRNPC,,,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz \ No newline at end of file +HNRNPC,HNRNPC,,https://github.com/luslab/test-datasets/raw/clipseq/v_2_0/fastq/ERR3988069-yeast-quarter.fastq.gz \ No newline at end of file From e638c7fdfcdd65adbdd9420b2980128265b57e1e Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 10 Nov 2025 16:13:15 +0000 Subject: [PATCH 08/11] add default for params.skip_deseq2_qc = some_value --- nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow.config b/nextflow.config index 83861b5f..71319c7e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,6 +54,7 @@ params { skip_umi_extract = true skip_trimming = false skip_transcriptome = true + skip_deseq2_qc = false // Output params save_reference = false From a2d824399c867d5cc916a2e30f20027f147c1f45 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 10 Nov 2025 20:20:32 +0000 Subject: [PATCH 09/11] update file naming --- modules/local/deseq2_qc/main.nf | 2 +- modules/local/deseq2_qc/templates/deseq2_qc.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/deseq2_qc/main.nf b/modules/local/deseq2_qc/main.nf index e043abdd..8f52f7d2 100644 --- a/modules/local/deseq2_qc/main.nf +++ b/modules/local/deseq2_qc/main.nf @@ -37,7 +37,7 @@ process DESEQ2_QC { -i ${counts} \ -c 3 \ -d "PeakID" \ - -o results/${prefix} \ + -o ${prefix} \ --vst """ diff --git a/modules/local/deseq2_qc/templates/deseq2_qc.R b/modules/local/deseq2_qc/templates/deseq2_qc.R index 382724a0..8aff2570 100755 --- a/modules/local/deseq2_qc/templates/deseq2_qc.R +++ b/modules/local/deseq2_qc/templates/deseq2_qc.R @@ -118,7 +118,7 @@ dds <- DESeqDataSetFromMatrix(countData=round(counts), colData=coldata, desi dds <- estimateSizeFactors(dds) if (min(dim(count.table))<=1) { # No point if only one sample, or one gene save(dds,file=DDSFile) - saveRDS(dds, file = paste(opt$output_prefix,".rds", sep = '.')) + saveRDS(dds, file = paste(opt$output_prefix,"rds", sep = '.')) warning("Not enough samples or genes in counts file for PCA.", call.=FALSE) quit(save = "no", status = 0, runLast = FALSE) } @@ -132,7 +132,7 @@ if (!opt$vst) { assay(dds, vst_name) <- assay(rld) save(dds,file=DDSFile) -saveRDS(dds, file = paste(opt$output_prefix, ".rds", sep = '.')) +saveRDS(dds, file = paste(opt$output_prefix, "rds", sep = '.')) ################################################ ################################################ From c09a801117261d58e065b1289ca05cd8298ea4c3 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 10 Nov 2025 21:30:59 +0000 Subject: [PATCH 10/11] update consensus modules --- conf/modules.config | 30 ------------------------------ workflows/clipseq.nf | 39 --------------------------------------- 2 files changed, 69 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5ddc834f..ee1e6f2f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -916,36 +916,6 @@ if(params.run_peakcalling && params.peakcaller.contains('icount')) { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_ICOUNTMINI_SIGXLS' { - ext.args = { "${params.icount_sigxl_params}" } - publishDir = [ - enabled: false - ] - } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_ICOUNTMINI_PEAKS' { - ext.args = { "${params.icount_peaks_params}" } - publishDir = [ - enabled: false - ] - } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS' { - publishDir = [ - path: { "${params.outdir}/05_peakcalling/icountmini" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS' { - publishDir = [ - path: { "${params.outdir}/05_peakcalling/icountmini" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } } } diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf index 76d910a2..8e0a17e2 100644 --- a/workflows/clipseq.nf +++ b/workflows/clipseq.nf @@ -137,10 +137,6 @@ include { ICOUNTMINI_PEAKS } from "../m include { GUNZIP as GUNZIP_ICOUNTMINI_SIGXLS } from "../modules/nf-core/gunzip/main" include { GUNZIP as GUNZIP_ICOUNTMINI_PEAKS } from "../modules/nf-core/gunzip/main" -include { ICOUNTMINI_SIGXLS as CONSENSUS_ICOUNTMINI_SIGXLS } from "../modules/nf-core/icountmini/sigxls/main" -include { ICOUNTMINI_PEAKS as CONSENSUS_ICOUNTMINI_PEAKS } from "../modules/nf-core/icountmini/peaks/main" -include { GUNZIP as CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS } from "../modules/nf-core/gunzip/main" -include { GUNZIP as CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS } from "../modules/nf-core/gunzip/main" include { PARACLU as PARACLU_GENOME } from "../modules/nf-core/paraclu/main" include { PARACLU as PARACLU_GENOME_CONSENSUS } from "../modules/nf-core/paraclu/main" @@ -595,41 +591,6 @@ workflow CLIPSEQ { ch_versions = ch_versions.mix(GUNZIP_ICOUNTMINI_PEAKS.out.versions) ch_icountmini_peaks = GUNZIP_ICOUNTMINI_PEAKS.out.gunzip - if(params.consensus_peak){ - CONSENSUS_ICOUNTMINI_SIGXLS ( - ch_consensus_crosslinks_final_bed, - ch_seg_gtf.collect{ it[1]} - ) - // CHANNEL: Create combined channel of input crosslinks and sigxls - ch_consensus_peaks_input = ch_consensus_crosslinks_final_bed - .map{ [ it[0].id, it[0], it[1] ] } - .join( CONSENSUS_ICOUNTMINI_SIGXLS.out.sigxls.map{ [ it[0].id, it[0], it[1] ] } ) - .map { [ it[1], it[2], it[4] ] } - //EXAMPLE CHANNEL STRUCT: [ [id:test], BED(crosslinks), BED(sigxls) ] - CONSENSUS_ICOUNTMINI_PEAKS ( - ch_consensus_peaks_input - ) - ch_consensus_peaks = CONSENSUS_ICOUNTMINI_PEAKS.out.peaks - CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS ( - CONSENSUS_ICOUNTMINI_SIGXLS.out.sigxls - ) - CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS ( - CONSENSUS_ICOUNTMINI_PEAKS.out.peaks - ) - ICOUNT_CONSENSUS_PEAK_TABLE ( - ch_all_crosslinks, - CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS.out.gunzip, - ch_fasta, - ch_fasta_fai, - ch_regions_used, - "iCount-Mini_Consensus_AllCounts.tsv", - ch_multiqc_merged_replicate_deseq2_pca_header, - ch_multiqc_merged_replicate_deseq2_clustering_header, - params.skip_deseq2_qc - ) - ch_versions = ch_versions.mix(ICOUNT_CONSENSUS_PEAK_TABLE.out.versions) - } - if(params.run_peka) { PEKA_ICOUNT ( ch_icountmini_peaks, From 573b68619ff1bba971dab5f2ceb2405b18913461 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 10 Nov 2025 21:44:49 +0000 Subject: [PATCH 11/11] update some resource requirements --- modules/local/bedgraph_strand_split/main.nf | 2 +- modules/local/consensus_count_table/main.nf | 2 +- modules/nf-core/peka/main.nf | 2 +- modules/nf-core/ucsc/bedgraphtobigwig/main.nf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/bedgraph_strand_split/main.nf b/modules/local/bedgraph_strand_split/main.nf index 5888a035..d611d0f3 100644 --- a/modules/local/bedgraph_strand_split/main.nf +++ b/modules/local/bedgraph_strand_split/main.nf @@ -1,7 +1,7 @@ // modules/local/bedgraph_strand_split/main.nf process BEDGRAPH_STRAND_SPLIT { tag "$meta.id" - label 'process_low' + label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/consensus_count_table/main.nf b/modules/local/consensus_count_table/main.nf index 0d5e5b24..3515b2e8 100644 --- a/modules/local/consensus_count_table/main.nf +++ b/modules/local/consensus_count_table/main.nf @@ -1,6 +1,6 @@ process GET_CONSENSUS_COUNTS { tag "${meta.id}" - label "process_single" + label "process_high" conda "bioconda::bioconductor-rtracklayer=1.62.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/nf-core/peka/main.nf b/modules/nf-core/peka/main.nf index 416a44a8..84256428 100644 --- a/modules/nf-core/peka/main.nf +++ b/modules/nf-core/peka/main.nf @@ -1,6 +1,6 @@ process PEKA { tag "$meta.id" - label 'process_low' + label 'process_high' conda "bioconda::peka=1.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf index 81cdee95..37a201f5 100644 --- a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf +++ b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf @@ -1,6 +1,6 @@ process UCSC_BEDGRAPHTOBIGWIG { tag "$meta.id" - label 'process_single' + label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. conda "${moduleDir}/environment.yml"