diff --git a/assets/merged_library_deseq2_clustering_header.txt b/assets/merged_library_deseq2_clustering_header.txt new file mode 100644 index 00000000..cd997676 --- /dev/null +++ b/assets/merged_library_deseq2_clustering_header.txt @@ -0,0 +1,12 @@ +#id: 'mlib_deseq2_clustering' +#section_name: 'MERGED LIB: DESeq2 sample similarity' +#description: "Matrix is generated from clustering with Euclidean distances between +# DESeq2 +# rlog values for each sample +# in the deseq2_qc.r script." +#plot_type: 'heatmap' +#anchor: 'mlib_deseq2_clustering' +#pconfig: +# title: 'DESeq2: Heatmap of the sample-to-sample distances' +# xlab: True +# reverseColors: True diff --git a/assets/merged_library_deseq2_pca_header.txt b/assets/merged_library_deseq2_pca_header.txt new file mode 100644 index 00000000..71a9783f --- /dev/null +++ b/assets/merged_library_deseq2_pca_header.txt @@ -0,0 +1,11 @@ +#id: 'mlib_deseq2_pca' +#section_name: 'MERGED LIB: DESeq2 PCA plot' +#description: "PCA plot of the samples in the experiment. +# These values are calculated using DESeq2 +# in the deseq2_qc.r script." +#plot_type: 'scatter' +#anchor: 'mlib_deseq2_pca' +#pconfig: +# title: 'DESeq2: Principal component plot' +# xlab: PC1 +# ylab: PC2 diff --git a/assets/merged_replicate_deseq2_clustering_header.txt b/assets/merged_replicate_deseq2_clustering_header.txt new file mode 100644 index 00000000..aed0c093 --- /dev/null +++ b/assets/merged_replicate_deseq2_clustering_header.txt @@ -0,0 +1,12 @@ +#id: 'mrep_deseq2_clustering' +#section_name: 'MERGED REP: DESeq2 sample similarity' +#description: "Matrix is generated from clustering with Euclidean distances between +# DESeq2 +# rlog values for each sample +# in the deseq2_qc.r script." +#plot_type: 'heatmap' +#anchor: 'mrep_deseq2_clustering' +#pconfig: +# title: 'DESeq2: Heatmap of the sample-to-sample distances' +# xlab: True +# reverseColors: True diff --git a/assets/merged_replicate_deseq2_pca_header.txt b/assets/merged_replicate_deseq2_pca_header.txt new file mode 100644 index 00000000..a92b75d9 --- /dev/null +++ b/assets/merged_replicate_deseq2_pca_header.txt @@ -0,0 +1,11 @@ +#id: 'mrep_deseq2_pca' +#section_name: 'MERGED REP: DESeq2 PCA plot' +#description: "PCA plot of the samples in the experiment. +# These values are calculated using DESeq2 +# in the deseq2_qc.r script." 
+#plot_type: 'scatter' +#anchor: 'mrep_deseq2_pca' +#pconfig: +# title: 'DESeq2: Principal component plot' +# xlab: PC1 +# ylab: PC2 diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 062ec1bb..238f8b9d 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -47,16 +47,193 @@ module_order: - "*.final.out" - custom_content +# Custom content configuration for the CLIP QC metrics +custom_data: + mapping: + parent_id: mapping + parent_name: "Mapping" + file_format: "tsv" + section_name: "Mapping" + description: "The mapping metrics for each experiment" + plot_type: "bargraph" + dedup_reads: + parent_id: dedup + parent_name: "Deduplication" + file_format: "tsv" + section_name: "Reads" + description: "The number of reads before and after PCR deduplication for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Count" + #stacking: False + cpswitch: False + tt_percentages: False + dedup_ratio: + parent_id: dedup + parent_name: "Deduplication" + file_format: "tsv" + section_name: "Ratio" + description: "The PCR deduplication ratio for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Ratio" + #stacking: False + cpswitch: False + tt_percentages: False + dedup_mean_umis: + parent_id: dedup + parent_name: "Deduplication" + file_format: "tsv" + section_name: "Mean UMIs" + description: "Mean number of unique UMIs per position for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Mean number" + #stacking: False + cpswitch: False + tt_percentages: False + crosslinks_counts: + parent_id: crosslinks + parent_name: "Crosslinks" + file_format: "tsv" + section_name: "Counts" + description: "The number of crosslinks or crosslink sites for each experiment" + plot_type: "bargraph" + pconfig: + ylab: "Count" + #stacking: False + cpswitch: False + tt_percentages: False + crosslinks_ratio: + parent_id: crosslinks + parent_name: "Crosslinks" + file_format: "tsv" + section_name: "Ratios" + description: "The ratio of cDNAs to crosslink positions for each experiment" + #plot_type: 'bargraph' + pconfig: + ylab: "Ratio" + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + peaks_counts: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Counts" + description: "The total number of peaks called by each peak caller" + plot_type: "bargraph" + pconfig: + ylab: "Number of peaks" + #stacking: False + cpswitch: False + tt_percentages: False + xlinks_in_peaks: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Crosslink positions in peaks" + description: "The total percentage of crosslinks within peaks for each peak caller" + #plot_type: 'bargraph' + pconfig: + ylab: "Percentage of crosslinks" + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: "%" + xlinksites_in_peaks: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Crosslink sites in peaks" + description: "The total percentage of crosslink sites within peaks for each peak caller" + #plot_type: 'bargraph' + pconfig: + ylab: "Percentage of crosslink sites" + #stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: "%" + peaks_xlinksite_coverage: + parent_id: peaks + parent_name: "Peaks" + file_format: "tsv" + section_name: "Peak-crosslink coverage" + description: "The total percentage of nucleotides within peaks covered by a crosslink site" + plot_type: "bargraph" + pconfig: + ylab: "Percentage of nucleotides within peaks" +
#stacking: False + cpswitch: False + tt_percentages: False + tt_decimals: 2 + tt_suffix: "%" + summary_type: + parent_id: Summary + parent_name: "Summary" + file_format: "tsv" + section_name: "Percentage of cDNA premap" + description: "The percentage of cDNAs mapped to each RNA type" + #plot_type: 'bargraph' + pconfig: + ylab: "Type" + #stacking: False + cpswitch: False + tt_percentages: False + summary_subtype: + parent_id: Summary + parent_name: "Summary" + file_format: "tsv" + section_name: "Percentage of cDNA premap subtypes" + description: "The percentage of cDNAs mapped to each RNA subtype" + #plot_type: 'bargraph' + pconfig: + ylab: "Type" + #stacking: False + cpswitch: False + tt_percentages: False + +sp: + mapping: + fn: "mapping.tsv" + dedup_reads: + fn: "dedup_reads.tsv" + dedup_ratio: + fn: "dedup_ratio.tsv" + dedup_mean_umis: + fn: "dedup_mean_umis.tsv" + crosslinks_counts: + fn: "xlinks_counts.tsv" + crosslinks_ratio: + fn: "xlinks_ratio.tsv" + peaks_counts: + fn: "total_peaks.tsv" + xlinks_in_peaks: + fn: "xlinks_in_peaks.tsv" + xlinksites_in_peaks: + fn: "xlinksites_in_peaks.tsv" + peaks_xlinksite_coverage: + fn: "peaks_xlinksite_coverage.tsv" + summary_type: + fn: "summary_type_metrics.tsv" + summary_subtype: + fn: "summary_subtype_metrics.tsv" + custom_content: order: + - clipqc - software-versions-by-process - software-versions-unique - # Customise the module search patterns to speed up execution time -sp: - samtools/stats: - fn: "*.stats" - samtools/flagstat: - fn: "*.flagstat" - samtools/idxstats: - fn: "*.idxstats*" +# sp: +# samtools/stats: +# fn: "*.stats" +# samtools/flagstat: +# fn: "*.flagstat" +# samtools/idxstats: +# fn: "*.idxstats*" +# clipqc: +# fn: "*.txt" diff --git a/conf/modules.config b/conf/modules.config index 5e0d3d5e..ee1e6f2f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,6 +34,7 @@ process { ======================================================================================== */ + if(params.run_genome_prep) { process { withName: '.*PREPARE_GENOME:GUNZIP_.*' { @@ -71,7 +72,6 @@ if(params.run_genome_prep) { path: { "${params.outdir}/00_genome" }, mode: "${params.publish_dir_mode}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.save_reference ] } @@ -157,6 +157,7 @@ if(params.run_genome_prep) { } } + /* ======================================================================================== PRE-PROCESSING ======================================================================================== */ @@ -915,36 +916,6 @@ if(params.run_peakcalling && params.peakcaller.contains('icount')) { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_ICOUNTMINI_SIGXLS' { - ext.args = { "${params.icount_sigxl_params}" } - publishDir = [ - enabled: false - ] - } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_ICOUNTMINI_PEAKS' { - ext.args = { "${params.icount_peaks_params}" } - publishDir = [ - enabled: false - ] - } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS' { - publishDir = [ - path: { "${params.outdir}/05_peakcalling/icountmini" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS' { - publishDir = [ - path: { "${params.outdir}/05_peakcalling/icountmini" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ?
null : filename } - ] - } } } @@ -1066,13 +1037,13 @@ if(params.run_reporting) { ] } - // withName: 'CLIPSEQ:CLIPSEQ_CLIPQC' { - // publishDir = [ - // path: { "${params.outdir}/06_reports/clipqc" }, - // mode: "${params.publish_dir_mode}", - // saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - // ] - // } + withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CLIPQC' { + publishDir = [ + path: { "${params.outdir}/06_reports/clipqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: 'NFCORE_CLIPSEQ:CLIPSEQ:MULTIQC' { ext.args = params.multiqc_title ? "-v --title \"$params.multiqc_title\"" : '-v' diff --git a/conf/test.config b/conf/test.config index 2b5811f7..6a241ddf 100644 --- a/conf/test.config +++ b/conf/test.config @@ -54,3 +54,4 @@ params { // Pipeline params umitools_bc_pattern = 'NNNNNNNNN' } + diff --git a/modules.json b/modules.json index b51c411a..5e0e1025 100644 --- a/modules.json +++ b/modules.json @@ -70,6 +70,11 @@ "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", "installed_by": ["modules"] }, + "deseq2/differential": { + "branch": "master", + "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", diff --git a/modules/local/bedgraph_strand_split/main.nf b/modules/local/bedgraph_strand_split/main.nf index 5888a035..d611d0f3 100644 --- a/modules/local/bedgraph_strand_split/main.nf +++ b/modules/local/bedgraph_strand_split/main.nf @@ -1,7 +1,7 @@ // modules/local/bedgraph_strand_split/main.nf process BEDGRAPH_STRAND_SPLIT { tag "$meta.id" - label 'process_low' + label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/local/clipqc/main.nf b/modules/local/clipqc/main.nf index 3c9e81e7..0fc14820 100644 --- a/modules/local/clipqc/main.nf +++ b/modules/local/clipqc/main.nf @@ -12,6 +12,10 @@ process CLIPQC { path("icount/*") path("paraclu/*") path("clippy/*") + path("pureclip/*") + path("summary_type/*") + path("summary_subtype/*") + path("summary_gene/*") output: path "*.tsv" , emit: tsv diff --git a/modules/local/clipqc/templates/clipqc.py b/modules/local/clipqc/templates/clipqc.py index f265983f..122f714c 100644 --- a/modules/local/clipqc/templates/clipqc.py +++ b/modules/local/clipqc/templates/clipqc.py @@ -49,6 +49,7 @@ def main(process_name): ncrna["ncrna_reads"].append(input_reads - output_reads) ncrna_df = pd.DataFrame(ncrna) + print(ncrna_df) # Next get STAR logs star_logs = sorted(["mapped/" + f for f in os.listdir("mapped") if f.endswith(".Log.final.out")]) @@ -96,7 +97,7 @@ def main(process_name): for dedup_log in dedup_logs: with open(dedup_log, "r") as logfile: - exp = re.sub(".log", "", os.path.basename(dedup_log)) + exp = re.sub(".genomeUnique.dedup_UMICollapse.log", "", os.path.basename(dedup_log)) lines = logfile.readlines() @@ -149,26 +150,26 @@ def read_bed(filename): # First get xlink bed files xlinks_files = sorted(["xlinks/" + f for f in os.listdir("xlinks") if f.endswith(".bed")]) - xlinks = dict((key, []) for key in ["exp", "total_xlinks", "total_xlinksites", "ratio"]) + xlinks = dict((key, []) for key in ["exp", "number_of_cDNAs", "total_positions_of_crosslinks", "ratio"]) for xlinks_file in xlinks_files: xlinks_df = read_bed(xlinks_file) - exp = re.sub(".bed", "", os.path.basename(xlinks_file)) - total_xlinks = xlinks_df["score"].sum() - total_xlinksites = xlinks_df.shape[0] - ratio = total_xlinks / total_xlinksites + exp = re.sub(".genome.xl.bed", "", os.path.basename(xlinks_file)) + number_of_cDNAs = xlinks_df["score"].sum() + total_positions_of_crosslinks = xlinks_df.shape[0] + ratio = number_of_cDNAs / total_positions_of_crosslinks xlinks["exp"].append(exp) - xlinks["total_xlinks"].append(total_xlinks) - xlinks["total_xlinksites"].append(total_xlinksites) - xlinks["ratio"].append(round(total_xlinks / total_xlinksites, 2)) + xlinks["number_of_cDNAs"].append(number_of_cDNAs) + xlinks["total_positions_of_crosslinks"].append(total_positions_of_crosslinks) + xlinks["ratio"].append(round(number_of_cDNAs / total_positions_of_crosslinks, 2)) xlinks_metrics_df = pd.DataFrame(xlinks) xlinks_metrics_df.to_csv("xlinks_metrics.tsv", sep="\t", index=False) # Subset for MultiQC plots - xlinks_metrics_df.loc[:, ["exp", "total_xlinks", "total_xlinksites"]].to_csv( + xlinks_metrics_df.loc[:, ["exp", "number_of_cDNAs", "total_positions_of_crosslinks"]].to_csv( "xlinks_counts.tsv", sep="\t", index=False ) xlinks_metrics_df.loc[:, ["exp", "ratio"]].to_csv("xlinks_ratio.tsv", sep="\t", index=False) @@ -176,11 +177,78 @@ def read_bed(filename): print(xlinks_metrics_df) print("\n\n") + # ========== + # Summary cDNA + # ========== + summary_type_files = sorted(["summary_type/" + f for f in os.listdir("summary_type") if f.endswith(".tsv")]) + # summary_data = dict((key, []) for key in ["exp", "Type", "Length", "Total_number_cDNA", "perc_number_cDNA"]) + # summary_data = dict((key, []) for key in ["exp", "Type", "perc_number_cDNA"]) + summary_data = [] + + for summary_type_file in summary_type_files: + + exp = re.sub(".summary_type_premapadjusted.tsv", "", os.path.basename(summary_type_file)) + df = pd.read_csv(summary_type_file, sep="\t") + + exp_data = {"exp": exp} + + # Add 
percentage for each RNA type + for _, row in df.iterrows(): + rna_type = row["Type"] + exp_data[rna_type] = row["cDNA %"] + + summary_data.append(exp_data) + + summary_df = pd.DataFrame(summary_data) + desired_columns = ["exp", "CDS", "ncRNA", "intergenic", "premapped RNA"] + existing_cols = [col for col in desired_columns if col in summary_df.columns] + summary_df = summary_df[existing_cols] + summary_df.to_csv("summary_type_metrics.tsv", sep="\t", index=False) + + # ========== + # Summary subtype cDNA + # ========== + + summary_subtype_files = sorted( + ["summary_subtype/" + f for f in os.listdir("summary_subtype") if f.endswith(".tsv")] + ) + summary_subtypedata = [] + + for summary_subtype_file in summary_subtype_files: + + exp = re.sub(".summary_subtype_premapadjusted.tsv", "", os.path.basename(summary_subtype_file)) + df = pd.read_csv(summary_subtype_file, sep="\t") + + exp_data = {"exp": exp} + + # Add percentage for each RNA type + for _, row in df.iterrows(): + rna_type = row["Subtype"] + exp_data[rna_type] = row["cDNA %"] + + summary_subtypedata.append(exp_data) + + summary_subtype_df = pd.DataFrame(summary_subtypedata) + desired_columns = [ + "exp", + "CDS mRNA", + "ncRNA rRNA", + "ncRNA snRNA", + "ncRNA snoRNA", + "ncRNA tRNA", + "ncRNA ncRNA", + "intergenic", + "premapped RNA", + ] + existing_cols = [col for col in desired_columns if col in summary_subtype_df.columns] + summary_subtype_df = summary_subtype_df[existing_cols] + summary_subtype_df.to_csv("summary_subtype_metrics.tsv", sep="\t", index=False) + # ========== # Peaks # ========== - peakcallers = ["icount", "paraclu", "clippy"] + peakcallers = ["icount", "paraclu", "clippy", "pureclip"] def get_peaks_metrics(peakcaller): peak_files = sorted([peakcaller + "/" + f for f in os.listdir(peakcaller) if f.endswith(".bed")]) @@ -209,13 +277,15 @@ def get_peaks_metrics(peakcaller): if peakcaller == "icount": exp = re.sub(".peaks.bed", "", os.path.basename(peak_file)) elif peakcaller == "paraclu": - exp = re.sub(".peaks.bed", "", os.path.basename(peak_file)) + exp = re.sub(".genome.peaks.clustered.simplified.bed", "", os.path.basename(peak_file)) elif peakcaller == "clippy": exp = re.sub( - "_rollmean.+_stdev.+_minGeneCount.+.bed", + ".genome.peaks_rollmean10_minHeightAdjust1.0_minPromAdjust1.0_minGeneCount5_Peaks.bed", "", os.path.basename(peak_file), ) + elif peakcaller == "pureclip": + exp = re.sub("_pureclip.peaks.bed", "", os.path.basename(peak_file)) xlinks_df = read_bed(xlinks_files[0]) expanded_xlinks_df = xlinks_df.loc[xlinks_df.index.repeat(xlinks_df.score)].reset_index(drop=True) diff --git a/modules/local/consensus_count_table/main.nf b/modules/local/consensus_count_table/main.nf index 803f5af4..3515b2e8 100644 --- a/modules/local/consensus_count_table/main.nf +++ b/modules/local/consensus_count_table/main.nf @@ -1,6 +1,6 @@ process GET_CONSENSUS_COUNTS { tag "${meta.id}" - label "process_single" + label "process_high" conda "bioconda::bioconductor-rtracklayer=1.62.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -13,7 +13,7 @@ GET_CONSENSUS_COUNTS { val(output_name) output: - path "*.tsv" , emit: tsv + tuple val(meta), path ("*.tsv") , emit: tsv path "versions.yml" , emit: versions when: diff --git a/modules/local/deseq2_qc/environment.yml b/modules/local/deseq2_qc/environment.yml new file mode 100644 index 00000000..40708368 --- /dev/null +++ b/modules/local/deseq2_qc/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bioconductor-deseq2=1.34.0 diff --git a/modules/local/deseq2_qc/main.nf b/modules/local/deseq2_qc/main.nf new file mode 100644 index 00000000..8f52f7d2 --- /dev/null +++ b/modules/local/deseq2_qc/main.nf @@ -0,0 +1,71 @@ +process DESEQ2_QC { + tag "$meta.id" + label "process_single" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-8849acf39a43cdd6c839a369a74c0adc823e2f91:ab110436faf952a33575c64dd74615a84011450b-0' : + 'biocontainers/mulled-v2-8849acf39a43cdd6c839a369a74c0adc823e2f91:ab110436faf952a33575c64dd74615a84011450b-0' }" + + input: + tuple val(meta), path(counts) + path deseq2_pca_header + path deseq2_clustering_header + + output: + path "*.pdf" , optional:true, emit: pdf + path "*.RData" , optional:true, emit: rdata + path "*.rds" , optional:true, emit: rds + path "*pca.vals.txt" , optional:true, emit: pca_txt + path "*pca.vals_mqc.tsv" , optional:true, emit: pca_multiqc + path "*sample.dists.txt" , optional:true, emit: dists_txt + path "*sample.dists_mqc.tsv", optional:true, emit: dists_multiqc + path "*.log" , optional:true, emit: log + path "size_factors" , optional:true, emit: size_factors + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + //template 'deseq2_qc.R' + """ + Rscript $projectDir/modules/local/deseq2_qc/templates/deseq2_qc.R \ + -i ${counts} \ + -c 3 \ + -d "PeakID" \ + -o ${prefix} \ + --vst + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-deseq2: \$(Rscript -e "library(DESeq2); cat(as.character(packageVersion('DESeq2')))") + END_VERSIONS + """ + + stub: + """ + touch ${meta.id}.pdf + touch ${meta.id}.RData + touch ${meta.id}.rds + touch ${meta.id}.pca.vals.txt + touch ${meta.id}.pca.vals_mqc.tsv + touch ${meta.id}.sample.dists.txt + touch ${meta.id}.sample.dists_mqc.tsv + touch ${meta.id}.log + mkdir size_factors + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-deseq2: \$(Rscript -e "library(DESeq2); cat(as.character(packageVersion('DESeq2')))") + END_VERSIONS + """ + + +} + + + diff --git a/modules/local/deseq2_qc/templates/deseq2_qc.R b/modules/local/deseq2_qc/templates/deseq2_qc.R new file mode 100755 index 00000000..8aff2570 --- /dev/null +++ b/modules/local/deseq2_qc/templates/deseq2_qc.R @@ -0,0 +1,288 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## REQUIREMENTS ## +################################################ +################################################ + +## PCA, HEATMAP AND SCATTERPLOTS FOR SAMPLES IN COUNTS FILE +## - SAMPLE NAMES HAVE TO END IN e.g. "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS.
+## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R + +################################################ +################################################ +## LOAD LIBRARIES ## +################################################ +################################################ + +suppressPackageStartupMessages({ + library(optparse) + library(DESeq2) + library(ggplot2) + library(RColorBrewer) + library(pheatmap) +}) + +################################################ +################################################ +## PARSE COMMAND-LINE PARAMETERS ## +################################################ +################################################ + +# opt <- list( +# output_prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), +# count_file = '$counts', +# count_col = 3, +# vst_name = 'vst', +# vst = TRUE, +# id_col = "PeakID") + +# opt_types <- lapply(opt, class) + +# # Apply parameter overrides + +# args_opt <- parse_args('$task.ext.args') +# for ( ao in names(args_opt)){ +# if (! ao %in% names(opt)){ +# stop(paste("Invalid option:", ao)) +# }else{ + +# # Preserve classes from defaults where possible +# if (! is.null(opt[[ao]])){ +# args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) +# } +# opt[[ao]] <- args_opt[[ao]] +# } +# } +# if ( ! is.null(opt\$round_digits)){ +# opt\$round_digits <- as.numeric(opt\$round_digits) +# } + +# if (is.null(opt$count_file)){ +# print_help(opt_parser) +# stop("Please provide a counts file.", call.=FALSE) +# } + + +option_list <- list( + make_option(c("-i", "--count_file"), type="character", help="Count matrix"), + make_option(c("-c", "--count_col"), type="integer", default=3), + make_option(c("-d", "--id_col"), type="character", default="PeakID"), + make_option(c("-o", "--output_prefix"), type="character", default="deseq2_qc"), + make_option(c("--vst"), action="store_true", default=FALSE) +) + +opt <- parse_args(OptionParser(option_list=option_list)) + +################################################ +################################################ +## READ IN COUNTS FILE ## +################################################ + +count.table <- read.delim(file=opt$count_file,header=TRUE,check.names=FALSE, na.strings = c(".", "NA", "")) +rownames(count.table) <- count.table[,opt$id_col] +count.table <- count.table[,opt$count_col:ncol(count.table),drop=FALSE] + +count.table[is.na(count.table)] <- 0 + + +################################################ +################################################ +## RUN DESEQ2 ## +################################################ +################################################ + + +samples.vec <- colnames(count.table) +name_components <- strsplit(samples.vec, "_") +n_components <- length(name_components[[1]]) +decompose <- n_components!=1 && all(sapply(name_components, length)==n_components) +coldata <- data.frame(samples.vec, sample=samples.vec, row.names=1) +if (decompose) { + groupings <- as.data.frame(lapply(1:n_components, function(i) sapply(name_components, "[[", i))) + names(groupings) <- paste0("Group", 1:n_components) + n_distinct <- sapply(groupings, function(grp) length(unique(grp))) + groupings <- groupings[n_distinct!=1 & n_distinct!=length(samples.vec)] + if (ncol(groupings)!=0) { + coldata <- cbind(coldata, groupings) + } else { + decompose <- FALSE + } +} + +DDSFile <- paste(opt$output_prefix,".dds.RData",sep="") + +counts <- count.table[,samples.vec,drop=FALSE] +dds <- DESeqDataSetFromMatrix(countData=round(counts), colData=coldata, design=~ 1) +dds <-
estimateSizeFactors(dds) +if (min(dim(count.table))<=1) { # No point if only one sample, or one gene + save(dds,file=DDSFile) + saveRDS(dds, file = paste(opt$output_prefix,"rds", sep = '.')) + warning("Not enough samples or genes in counts file for PCA.", call.=FALSE) + quit(save = "no", status = 0, runLast = FALSE) +} +if (!opt$vst) { + vst_name <- "rlog" + rld <- rlog(dds) +} else { + vst_name <- "vst" + rld <- varianceStabilizingTransformation(dds) +} + +assay(dds, vst_name) <- assay(rld) +save(dds,file=DDSFile) +saveRDS(dds, file = paste(opt$output_prefix, "rds", sep = '.')) + +################################################ +################################################ +## PLOT QC ## +################################################ +################################################ + +##' PCA pre-processor +##' +##' Generate all the necessary information to plot PCA from a DESeq2 object +##' in which an assay containing a variance-stabilised matrix of counts is +##' stored. Copied from DESeq2::plotPCA, but with additional ability to +##' say which assay to run the PCA on. +##' +##' @param object The DESeq2DataSet object. +##' @param ntop number of top genes to use for principal components, selected by highest row variance. +##' @param assay the name or index of the assay that stores the variance-stabilised data. +##' @return A data.frame containing the projected data alongside the grouping columns. +##' A 'percentVar' attribute is set which includes the percentage of variation each PC explains, +##' and additionally how much the variation within that PC is explained by the grouping variable. +##' @author Gavin Kelly +plotPCA_vst <- function (object, ntop = 500, assay=length(assays(object))) { + rv <- rowVars(assay(object, assay)) + select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] + pca <- prcomp(t(assay(object, assay)[select, ]), center=TRUE, scale=FALSE) + percentVar <- pca$sdev^2/sum(pca$sdev^2) + df <- cbind( as.data.frame(colData(object)), pca$x) + #Order points so extreme samples are more likely to get a label + ord <- order(abs(rank(df$PC1)-median(df$PC1)), abs(rank(df$PC2)-median(df$PC2))) + df <- df[ord,] + attr(df, "percentVar") <- data.frame(PC=seq(along=percentVar), percentVar=100*percentVar) + return(df) +} + +PlotFile <- paste(opt$output_prefix,".plots.pdf",sep="") + +pdf(file=PlotFile, onefile=TRUE, width=7, height=7) +## PCA +ntop <- c(500, Inf) +for (n_top_var in ntop) { + pca.data <- plotPCA_vst(dds, assay=vst_name, ntop=n_top_var) + percentVar <- round(attr(pca.data, "percentVar")$percentVar) + plot_subtitle <- ifelse(n_top_var==Inf, "All peaks", paste("Top", n_top_var, "peaks")) + pl <- ggplot(pca.data, aes(PC1, PC2, label=paste0(" ", sample, " "))) + + geom_point() + + geom_text(check_overlap=TRUE, vjust=0.5, hjust="inward") + + xlab(paste0("PC1: ",percentVar[1],"% variance")) + + ylab(paste0("PC2: ",percentVar[2],"% variance")) + + labs(title = paste0("First PCs on ", vst_name, "-transformed data"), subtitle = plot_subtitle) + + theme(legend.position="top", + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1)) + print(pl) + + if (decompose) { + pc_names <- paste0("PC", attr(pca.data, "percentVar")$PC) + long_pc <- reshape(pca.data, varying=pc_names, direction="long", sep="", timevar="component", idvar="pcrow") + long_pc <- subset(long_pc, component<=5) + long_pc_grp <- reshape(long_pc, varying=names(groupings),
direction="long", sep="", timevar="grouper") + long_pc_grp <- subset(long_pc_grp, grouper<=5) + long_pc_grp$component <- paste("PC", long_pc_grp$component) + long_pc_grp$grouper <- paste0(long_pc_grp$grouper, c("st","nd","rd","th","th")[long_pc_grp$grouper], " prefix") + pl <- ggplot(long_pc_grp, aes(x=Group, y=PC)) + + geom_point() + + stat_summary(fun=mean, geom="line", aes(group = 1)) + + labs(x=NULL, y=NULL, subtitle = plot_subtitle, title="PCs split by sample-name prefixes") + + facet_grid(component~grouper, scales="free_x") + + scale_x_discrete(guide = guide_axis(n.dodge = 3)) + print(pl) + } +} # at end of loop, we'll be using the user-defined ntop if any, else all genes + +## WRITE PC1 vs PC2 VALUES TO FILE +pca.vals <- pca.data[,c("PC1","PC2")] +colnames(pca.vals) <- paste0(colnames(pca.vals), ": ", percentVar[1:2], '% variance') +pca.vals <- cbind(sample = rownames(pca.vals), pca.vals) +write.table(pca.vals, file = paste(opt$outprefix, ".pca.vals.txt", sep=""), + row.names = FALSE, col.names = TRUE, sep = "\t", quote = TRUE) + +## SAMPLE CORRELATION HEATMAP +sampleDists <- dist(t(assay(dds, vst_name))) +sampleDistMatrix <- as.matrix(sampleDists) +colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) +pheatmap( + sampleDistMatrix, + clustering_distance_rows=sampleDists, + clustering_distance_cols=sampleDists, + col=colors, + main=paste("Euclidean distance between", vst_name, "of samples") +) + +## WRITE SAMPLE DISTANCES TO FILE +write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=paste(opt$outprefix, ".sample.dists.txt", sep=""), + row.names=FALSE, col.names=TRUE, sep="\t", quote=FALSE) +dev.off() + +################################################ +################################################ +## SAVE SIZE FACTORS ## +################################################ +################################################ + +SizeFactorsDir <- "size_factors/" +if (file.exists(SizeFactorsDir) == FALSE) { + dir.create(SizeFactorsDir, recursive=TRUE) +} + +NormFactorsFile <- paste(SizeFactorsDir,opt$outprefix, ".size_factors.RData", sep="") + +normFactors <- sizeFactors(dds) +save(normFactors, file=NormFactorsFile) + +for (name in names(sizeFactors(dds))) { + sizeFactorFile <- paste(SizeFactorsDir,name, ".size_factors.txt", sep="") + write(as.numeric(sizeFactors(dds)[name]), file=sizeFactorFile) +} + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +RLogFile <- "R_sessionInfo.log" + +sink(RLogFile) +a <- sessionInfo() +print(a) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- paste(R.version[['major']],R.version[['minor']], sep = ".") +deseq2.version <- as.character(packageVersion('DESeq2')) + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' bioconductor-deseq2:', deseq2.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ \ No newline at end of file diff --git a/modules/nf-core/peka/main.nf b/modules/nf-core/peka/main.nf index 416a44a8..84256428 100644 --- 
a/modules/nf-core/peka/main.nf +++ b/modules/nf-core/peka/main.nf @@ -1,6 +1,6 @@ process PEKA { tag "$meta.id" - label 'process_low' + label 'process_high' conda "bioconda::peka=1.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf index 81cdee95..37a201f5 100644 --- a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf +++ b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf @@ -1,6 +1,6 @@ process UCSC_BEDGRAPHTOBIGWIG { tag "$meta.id" - label 'process_single' + label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. conda "${moduleDir}/environment.yml" diff --git a/nextflow.config b/nextflow.config index 83861b5f..71319c7e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,6 +54,7 @@ params { skip_umi_extract = true skip_trimming = false skip_transcriptome = true + skip_deseq2_qc = false // Output params save_reference = false diff --git a/subworkflows/local/consensus_peak_table.nf b/subworkflows/local/consensus_peak_table.nf index aec66902..e0a747ff 100644 --- a/subworkflows/local/consensus_peak_table.nf +++ b/subworkflows/local/consensus_peak_table.nf @@ -5,6 +5,7 @@ // // MODULES // +include { DESEQ2_QC } from '../../modules/local/deseq2_qc' include { BEDTOOLS_SORT as CONSENSUS_PEAKS_SORT } from '../../modules/nf-core/bedtools/sort/main' include { BEDTOOLS_SORT as CROSSLINKS_SORT } from '../../modules/nf-core/bedtools/sort/main' include { BEDTOOLS_MAP as CONSENSUS_MAP } from '../../modules/nf-core/bedtools/map/main' @@ -18,9 +19,13 @@ workflow CONSENSUS_PEAK_TABLE { take: all_crosslinks // channel: [ val(meta), [ xl.bed ] ] consensus_peaks // channel: [ val(meta), [ peaks.bed ] ] + genome_fasta genome_fai // channel: [ val(meta), [ index ] ] gtf // channel: [ val(meta), [ gtf ] ] table_name // val "Clippy_Consensus_Counts.tsv" must end in tsv + ch_deseq2_pca_header_multiqc + ch_deseq2_clustering_header_multiqc + skip_deseq2_qc main: ch_versions = Channel.empty() @@ -45,7 +50,7 @@ workflow CONSENSUS_PEAK_TABLE { CONSENSUS_MAP ( ch_consensus_map, - genome_fai + genome_fai, ) CONSENSUS_MAP.out.mapped @@ -59,7 +64,48 @@ workflow CONSENSUS_PEAK_TABLE { table_name ) + // + // Generate QC plots with DESeq2 + // + ch_deseq2_qc_pdf = Channel.empty() + ch_deseq2_qc_rdata = Channel.empty() + ch_deseq2_qc_rds = Channel.empty() + ch_deseq2_qc_pca_txt = Channel.empty() + ch_deseq2_qc_pca_multiqc = Channel.empty() + ch_deseq2_qc_dists_txt = Channel.empty() + ch_deseq2_qc_dists_multiqc = Channel.empty() + ch_deseq2_qc_log = Channel.empty() + ch_deseq2_qc_size_factors = Channel.empty() + if (!skip_deseq2_qc) { + DESEQ2_QC ( + GET_CONSENSUS_COUNTS.out.tsv, + ch_deseq2_pca_header_multiqc, + ch_deseq2_clustering_header_multiqc + ) + ch_deseq2_qc_pdf = DESEQ2_QC.out.pdf + ch_deseq2_qc_rdata = DESEQ2_QC.out.rdata + ch_deseq2_qc_rds = DESEQ2_QC.out.rds + ch_deseq2_qc_pca_txt = DESEQ2_QC.out.pca_txt + ch_deseq2_qc_pca_multiqc = DESEQ2_QC.out.pca_multiqc + ch_deseq2_qc_dists_txt = DESEQ2_QC.out.dists_txt + ch_deseq2_qc_dists_multiqc = DESEQ2_QC.out.dists_multiqc + ch_deseq2_qc_log = DESEQ2_QC.out.log + ch_deseq2_qc_size_factors = DESEQ2_QC.out.size_factors + ch_versions = ch_versions.mix(DESEQ2_QC.out.versions) + } + emit: - mapped_table = GET_CONSENSUS_COUNTS.out.tsv // channel: [ val(meta), [ tsv ] ] + mapped_table = GET_CONSENSUS_COUNTS.out.tsv // 
channel: [ val(meta), [ tsv ] ] + + + deseq2_qc_pdf = ch_deseq2_qc_pdf // channel: [ pdf ] + deseq2_qc_rdata = ch_deseq2_qc_rdata // channel: [ rdata ] + deseq2_qc_rds = ch_deseq2_qc_rds // channel: [ rds ] + deseq2_qc_pca_txt = ch_deseq2_qc_pca_txt // channel: [ txt ] + deseq2_qc_pca_multiqc = ch_deseq2_qc_pca_multiqc // channel: [ txt ] + deseq2_qc_dists_txt = ch_deseq2_qc_dists_txt // channel: [ txt ] + deseq2_qc_dists_multiqc = ch_deseq2_qc_dists_multiqc // channel: [ txt ] + deseq2_qc_log = ch_deseq2_qc_log // channel: [ txt ] + deseq2_qc_size_factors = ch_deseq2_qc_size_factors // channel: [ txt ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index ae509397..606e5f0c 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -117,7 +117,7 @@ workflow PREPARE_GENOME { } } else { - ch_bt_index = BOWTIE_BUILD ( ch_ncrna_fasta ).index + ch_bt_index = BOWTIE_BUILD ( ch_ncrna_fasta).index ch_versions = ch_versions.mix(BOWTIE_BUILD.out.versions) } diff --git a/subworkflows/local/rna_align.nf b/subworkflows/local/rna_align.nf index a6e00c3b..d5ea9dd4 100644 --- a/subworkflows/local/rna_align.nf +++ b/subworkflows/local/rna_align.nf @@ -229,4 +229,4 @@ workflow RNA_ALIGN { transcript_multi_bai = ch_transcript_multi_bai // channel: [ val(meta), [ bai ] ] versions = ch_versions // channel: [ versions.yml ] -} +} \ No newline at end of file diff --git a/workflows/clipseq.nf b/workflows/clipseq.nf index a3078633..8e0a17e2 100644 --- a/workflows/clipseq.nf +++ b/workflows/clipseq.nf @@ -56,6 +56,10 @@ if ((caller_list + callers).unique().size() != caller_list.size()) { ch_dummy_file = file("$projectDir/assets/dummy_file.txt", checkIfExists: true) ch_dummy_file2 = file("$projectDir/assets/dummy_file2.txt", checkIfExists: true) +// deseq2 header files: +ch_multiqc_merged_replicate_deseq2_pca_header = file("$projectDir/assets/merged_replicate_deseq2_pca_header.txt", checkIfExists: true) +ch_multiqc_merged_replicate_deseq2_clustering_header = file("$projectDir/assets/merged_replicate_deseq2_clustering_header.txt", checkIfExists: true) + // // Check if an AWS iGenome has been provided to use the appropriate version of STAR // def is_aws_igenome = false // if (params.fasta && params.gtf) { @@ -133,10 +137,6 @@ include { ICOUNTMINI_PEAKS } from "../m include { GUNZIP as GUNZIP_ICOUNTMINI_SIGXLS } from "../modules/nf-core/gunzip/main" include { GUNZIP as GUNZIP_ICOUNTMINI_PEAKS } from "../modules/nf-core/gunzip/main" -include { ICOUNTMINI_SIGXLS as CONSENSUS_ICOUNTMINI_SIGXLS } from "../modules/nf-core/icountmini/sigxls/main" -include { ICOUNTMINI_PEAKS as CONSENSUS_ICOUNTMINI_PEAKS } from "../modules/nf-core/icountmini/peaks/main" -include { GUNZIP as CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS } from "../modules/nf-core/gunzip/main" -include { GUNZIP as CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS } from "../modules/nf-core/gunzip/main" include { PARACLU as PARACLU_GENOME } from "../modules/nf-core/paraclu/main" include { PARACLU as PARACLU_GENOME_CONSENSUS } from "../modules/nf-core/paraclu/main" @@ -251,6 +251,7 @@ workflow CLIPSEQ { ch_ncrna_genome_index = PREPARE_GENOME.out.ncrna_index.collect() } + // // SUBWORKFLOW: Read in samplesheet, validate, stage input files and merge replicates // @@ -267,7 +268,7 @@ workflow CLIPSEQ { // ch_fastq | view // - // SUBWORKFLOW: Extract UMI, trim and run b4 and after fastqc + // SUBWORKFLOW: Extract UMI, trim and run before and after fastqc // 
if(params.source == "fastq" & params.run_preprocessing) { FASTQ_FASTQC_UMITOOLS_TRIMGALORE ( @@ -366,7 +367,7 @@ workflow CLIPSEQ { ch_versions = ch_versions.mix(GENOME_UNIQUE_DEDUP.out.versions) ch_genome_unique_dedupe_bam = GENOME_UNIQUE_DEDUP.out.bam ch_genome_unique_dedupe_bai = GENOME_UNIQUE_DEDUP.out.bai - //ch_umi_log = GENOME_UNIQUE_DEDUP.out.umi_log + ch_umi_log = GENOME_UNIQUE_DEDUP.out.umi_log GENOME_MULTI_DEDUP ( ch_genome_multi_bam_bai @@ -440,6 +441,10 @@ workflow CLIPSEQ { ch_merged_summaries ) ch_versions = ch_versions.mix(MERGE_SUMMARY.out.versions) + ch_merged_summary_type = MERGE_SUMMARY.out.summary_type_adjusted + ch_merged_summary_subtype = MERGE_SUMMARY.out.summary_subtype_adjusted + ch_merged_summary_gene = MERGE_SUMMARY.out.summary_gene_adjusted + ICOUNTMINI_METAGENE ( ch_genome_crosslink_group_resolved_bed, @@ -492,6 +497,15 @@ workflow CLIPSEQ { ch_icountmini_sigxls = Channel.empty() ch_paraclu_genome_peaks = Channel.empty() + // + // PEKA CHANNELS + // + ch_peka_icountmini_peaks = Channel.empty() + ch_peka_clippy_peaks = Channel.empty() + ch_peka_paraclu_peaks = Channel.empty() + ch_peka_pureclip_peaks = Channel.empty() + + if(params.run_peakcalling) { if('clippy' in callers) { @@ -514,9 +528,13 @@ workflow CLIPSEQ { CLIPPY_CONSENSUS_PEAK_TABLE ( ch_all_crosslinks, CLIPPY_GENOME_CONSENSUS.out.peaks, + ch_fasta, ch_fasta_fai, ch_regions_used, - "Clippy_Consensus_AllCounts.tsv" + "Clippy_Consensus_AllCounts.tsv", + ch_multiqc_merged_replicate_deseq2_pca_header, + ch_multiqc_merged_replicate_deseq2_clustering_header, + params.skip_deseq2_qc ) ch_versions = ch_versions.mix(CLIPPY_CONSENSUS_PEAK_TABLE.out.versions) } @@ -573,37 +591,6 @@ workflow CLIPSEQ { ch_versions = ch_versions.mix(GUNZIP_ICOUNTMINI_PEAKS.out.versions) ch_icountmini_peaks = GUNZIP_ICOUNTMINI_PEAKS.out.gunzip - if(params.consensus_peak){ - CONSENSUS_ICOUNTMINI_SIGXLS ( - ch_consensus_crosslinks_final_bed, - ch_seg_gtf.collect{ it[1]} - ) - // CHANNEL: Create combined channel of input crosslinks and sigxls - ch_consensus_peaks_input = ch_consensus_crosslinks_final_bed - .map{ [ it[0].id, it[0], it[1] ] } - .join( CONSENSUS_ICOUNTMINI_SIGXLS.out.sigxls.map{ [ it[0].id, it[0], it[1] ] } ) - .map { [ it[1], it[2], it[4] ] } - //EXAMPLE CHANNEL STRUCT: [ [id:test], BED(crosslinks), BED(sigxls) ] - CONSENSUS_ICOUNTMINI_PEAKS ( - ch_consensus_peaks_input - ) - ch_consensus_peaks = CONSENSUS_ICOUNTMINI_PEAKS.out.peaks - CONSENSUS_GUNZIP_ICOUNTMINI_SIGXLS ( - CONSENSUS_ICOUNTMINI_SIGXLS.out.sigxls - ) - CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS ( - CONSENSUS_ICOUNTMINI_PEAKS.out.peaks - ) - ICOUNT_CONSENSUS_PEAK_TABLE ( - ch_all_crosslinks, - CONSENSUS_GUNZIP_ICOUNTMINI_PEAKS.out.gunzip, - ch_fasta_fai, - ch_regions_used, - "iCount-Mini_Consensus_AllCounts.tsv" - ) - ch_versions = ch_versions.mix(ICOUNT_CONSENSUS_PEAK_TABLE.out.versions) - } - if(params.run_peka) { PEKA_ICOUNT ( ch_icountmini_peaks, @@ -638,9 +625,13 @@ workflow CLIPSEQ { PARACLU_CONSENSUS_PEAK_TABLE ( ch_all_crosslinks, PARACLU_GENOME_CONSENSUS.out.bed, + ch_fasta, ch_fasta_fai, ch_regions_used, - "Paraclu_Consensus_AllCounts.tsv" + "Paraclu_Consensus_AllCounts.tsv", + ch_multiqc_merged_replicate_deseq2_pca_header, + ch_multiqc_merged_replicate_deseq2_clustering_header, + params.skip_deseq2_qc ) ch_versions = ch_versions.mix(PARACLU_CONSENSUS_PEAK_TABLE.out.versions) } @@ -755,26 +746,34 @@ workflow CLIPSEQ { } if(params.run_reporting) { - // + // MODULE: Collect software versions - // - DUMP_SOFTWARE_VERSIONS ( - 
ch_versions.unique().collectFile() - ) - - // - // MODULE: Run clipqc - // - // CLIPSEQ_CLIPQC ( - // ch_bt_log.map{ it[1] }, - // ch_star_log.map{ it[1] }, - // ch_umi_log.map{ it[1] }, - // ch_genome_crosslink_bed.map{ it[1] }, - // ICOUNT_ANALYSE.out.bed_peaks.map{ it[1] }, - // PARACLU_ANALYSE_GENOME.out.peaks.map{ it[1] }, - // CLIPPY_GENOME.out.peaks.map{ it[1] } + + // DUMP_SOFTWARE_VERSIONS ( + // ch_versions.unique().collectFile() // ) + + // MODULE: Run clipqc + + CLIPQC ( + ch_ncrna_log.collect{ it[1] }, + ch_genome_log.collect{ it[1] }, + ch_umi_log.collect{ it[1] }, + ch_genome_crosslink_group_resolved_bed.collect{ it[1] }, + ch_icountmini_peaks.collect{ it[1] }, + ch_paraclu_genome_peaks.collect{ it[1] }, + ch_clippy_genome_peaks.collect{ it[1] }, + ch_pureclip_genome_peaks.collect{ it[1] }, + ch_merged_summary_type.collect{ it[1] }, + ch_merged_summary_subtype.collect{ it[1] }, + ch_merged_summary_gene.collect{ it[1] } + + //ICOUNT_ANALYSE.out.bed_peaks.map{ it[1] }, + //PARACLU_ANALYSE_GENOME.out.peaks.map{ it[1] }, + //CLIPPY_GENOME.out.peaks.map{ it[1] } + ) + // // MODULE: Run multiqc // @@ -787,14 +786,17 @@ workflow CLIPSEQ { ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_unique_yml.collect()) - + //ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_yml.collect()) + //ch_multiqc_files = ch_multiqc_files.mix(DUMP_SOFTWARE_VERSIONS.out.mqc_unique_yml.collect()) + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_ncrna_log.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_genome_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CLIPQC.out.tsv.collect().ifEmpty([])) + + ch_tmp = ch_ncrna_log.collect{it[1]} MULTIQC ( ch_multiqc_files.collect(),