From 335df69a996246f7e9952479f89f49903021f5cf Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Wed, 23 Apr 2025 10:53:39 -0700 Subject: [PATCH 01/21] add falcon cluster method to the workflow --- bin/conda_env_falcon.yml | 21 ++++++ bin/scripts/summarize_results_falcon.py | 88 +++++++++++++++++++++++++ nf_workflow.nf | 64 +++++++++++++++++- workflowinput.yaml | 10 +++ 4 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 bin/conda_env_falcon.yml create mode 100644 bin/scripts/summarize_results_falcon.py diff --git a/bin/conda_env_falcon.yml b/bin/conda_env_falcon.yml new file mode 100644 index 0000000..0fb7f39 --- /dev/null +++ b/bin/conda_env_falcon.yml @@ -0,0 +1,21 @@ +channels: + - conda-forge + - bioconda + - defaults +channel_priority: strict +dependencies: + - python=3.10 + - pandas=2.2.0 + - pyteomics=4.7.1 + - pip: + - numpy==1.23.4 + - xmltodict + - requests + - tqdm + - psutil + - pyopenms==3.0.0 + - spectrum-utils==0.3.5 + - falcon-ms + - pymzml + - numcodecs + - pyarrow \ No newline at end of file diff --git a/bin/scripts/summarize_results_falcon.py b/bin/scripts/summarize_results_falcon.py new file mode 100644 index 0000000..d6ec09e --- /dev/null +++ b/bin/scripts/summarize_results_falcon.py @@ -0,0 +1,88 @@ +import os +import pandas as pd +import argparse +import numpy as np + +def rewrite_falcon_mgf(mgf_input, mgf_output): + with open(mgf_input, 'r') as infile, open(mgf_output, 'w') as outfile: + for line in infile: + if line.startswith("CLUSTER="): + try: + cluster_num = int(line.strip().split("=")[1]) + outfile.write(f"SCANS={cluster_num + 1}\n") + except ValueError: + outfile.write(line) + else: + outfile.write(line) + +def main(): + parser = argparse.ArgumentParser(description='Summarizing Falcon Results') + parser.add_argument('falcon_clusters', help='falcon_clusters') + parser.add_argument('falcon_mgf', help='falcon_mgf') + #parser.add_argument('output_summary_folder', help='output_summary_folder') + args = 
parser.parse_args() + + clusterinfo_df = pd.read_csv(args.falcon_clusters, sep=',', comment='#') + + print(args) + print(clusterinfo_df) + + clusterinfo_df = clusterinfo_df.sort_values(by='cluster', key=lambda x: x.replace(-1, np.inf)) + + # Filtering out not in clusters data + clusterinfo_df = clusterinfo_df[clusterinfo_df["cluster"] != -1] + + + # Grouping by cluster + grouped_cluster_df = clusterinfo_df.groupby(["cluster"]) + cluster_summary_list = [] + for cluster, cluster_group_df in grouped_cluster_df: + #TODO :Read these from mgf, as the representative is a medoid + + cluster_count = len(cluster_group_df) + cluster_mz = cluster_group_df["precursor_mz"].mean() + cluster_rt = cluster_group_df["retention_time"].mean() + cluster_charge = cluster_group_df["precursor_charge"].mean() + # adjust the col name to map the classical MN wokflow + output_dict = {} + output_dict["number of spectra"] = cluster_count + output_dict["parent mass"] = cluster_mz + output_dict["RTMean"] = cluster_rt + output_dict["precursor charge"] = cluster_charge + output_dict["cluster index"] = cluster[0] + 1 + + cluster_summary_list.append(output_dict) + + # Creating a cluster summary + cluster_summary_df = pd.DataFrame(cluster_summary_list) + cluster_summary_df.to_csv("clustersummary.tsv", sep='\t', index=False) + + # Creating cluster info + clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply(lambda x: x.split(":")[2] + ".mzML") + clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply(lambda x: x.split(":")[-1]) + + # Rename relevant columns to MS-Cluster format + clusterinfo_df = clusterinfo_df.rename(columns={ + "filename": "#Filename", + "cluster": "#ClusterIdx", + "scan": "#Scan", + "precursor_mz": "#ParentMass", + "precursor_charge": "#Charge", + "retention_time": "#RetTime" + }) + + # Just to prevent other processes in the workflow raise error + clusterinfo_df["#PrecIntensity"] = 0 + + # Select required columns + clusterinfo_df = clusterinfo_df[[ + "#ClusterIdx", 
"#Filename", "#Scan", "#ParentMass", "#Charge", "#RetTime", "#PrecIntensity" + ]] + clusterinfo_df.to_csv("clusterinfo.tsv", sep='\t', index=False) + + #TODO: Rewriting MGF files + rewrite_falcon_mgf(args.falcon_mgf, "specs_ms.mgf") + # TODO: Maybe make this compatible with FBMN, since its already clustered. + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/nf_workflow.nf b/nf_workflow.nf index e56637a..0bc53d6 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -14,6 +14,8 @@ params.metadata_filename = "data/metadata.tsv" // Clustering Parameters params.min_cluster_size = "2" +params.cluster_method = "MS-Cluster" // can be MS-Cluster or Falcon +params.eps = "0.01" // eps for falcon // Tolerance Parameters params.pm_tolerance = "2.0" @@ -113,6 +115,56 @@ process mscluster { """ } +process falcon { + publishDir "$params.publishdir/nf_output/clustering", mode: 'copy' + + conda "$TOOL_FOLDER/conda_env_falcon.yml" + + input: + val inputSpectra + val ready + + output: + file 'falcon.mgf' + file 'falcon.csv' + + """ + mkdir clustering + falcon \ + ${inputSpectra} \ + falcon --export_representatives \ + --precursor_tol 20 ppm \ + --fragment_tol 0.05 \ + --min_mz_range 0 \ + --min_mz 0 \ + --max_mz 2000 \ + --min_samples $params.min_cluster_size \ + --eps $params.eps --work_dir clustering \ + --overwrite + """ +} + +process summarize_falcon { + publishDir "$params.publishdir/nf_output/clustering", mode: 'copy' + + conda "$TOOL_FOLDER/conda_env.yml" + + input: + file input_csv + file input_mgf + + output: + file 'clusterinfo.tsv' + file 'clustersummary.tsv' + file 'specs_ms.mgf' + + """ + python $TOOL_FOLDER/scripts/summarize_results_falcon.py \ + ${input_csv} \ + ${input_mgf} + """ +} + // TODO: Finish Implementing this, as this is currently an no-op process massqlFilterSpectra { publishDir "$params.publishdir/nf_output", mode: 'copy' @@ -559,7 +611,8 @@ process PrepareForModiFinder{ } workflow { - // Preps input spectrum files + +// Preps 
input spectrum files input_spectra_ch = Channel.fromPath(params.input_spectra) // Downloads input data and lists privtae spectra @@ -570,7 +623,14 @@ workflow { filesummary(input_spectra_ch, _download_ready) // Clustering - (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready) + if(params.cluster_method == "MS-Cluster"){ + (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready) + } + else if (params.cluster_method == "Falcon"){ + input_falcon_spectra_ch = params.input_spectra + "/*.mzML" + (clustered_spectra_falcon_ch, falcon_tsv_ch) = falcon(input_falcon_spectra_ch, _download_ready) + (clusterinfo_ch, clustersummary_ch, clustered_spectra_intermediate_ch) = summarize_falcon(falcon_tsv_ch, clustered_spectra_falcon_ch) + } if(params.massql_filter != "None"){ clustered_spectra_ch = massqlFilterSpectra(clustered_spectra_intermediate_ch) diff --git a/workflowinput.yaml b/workflowinput.yaml index a0bec38..c637872 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -95,6 +95,16 @@ parameterlist: formvalue: "2" tooltip: "Set to 0 to turn clustering off, be careful with this setting" + - displayname: Clustering Methods + paramtype: select + nf_paramname: cluster_method + formvalue: "MS-Cluster" + options: + - value: "MS-Cluster" + display: "MS-Cluster" + - value: "Falcon" + display: "Falcon" + - displayname: Advanced Filtering Parameters paramtype: section From 916222b81228a3ef96a4d7a5e0c3349e2508a5af Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Wed, 23 Apr 2025 13:18:15 -0700 Subject: [PATCH 02/21] change the default settings for falcon and correct the display name for clustering tool --- nf_workflow.nf | 2 +- workflowinput.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nf_workflow.nf b/nf_workflow.nf index 0bc53d6..707569b 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -133,7 +133,7 @@ process falcon 
{ falcon \ ${inputSpectra} \ falcon --export_representatives \ - --precursor_tol 20 ppm \ + --precursor_tol 0.05 Da \ --fragment_tol 0.05 \ --min_mz_range 0 \ --min_mz 0 \ diff --git a/workflowinput.yaml b/workflowinput.yaml index c637872..e3b6d25 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -95,7 +95,7 @@ parameterlist: formvalue: "2" tooltip: "Set to 0 to turn clustering off, be careful with this setting" - - displayname: Clustering Methods + - displayname: Clustering Tool paramtype: select nf_paramname: cluster_method formvalue: "MS-Cluster" From e30ba1e385a78af0e31b3098163a282b95812b19 Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Wed, 23 Apr 2025 14:27:31 -0700 Subject: [PATCH 03/21] Update the workflow version number to pin a version for Falcon --- workflowinput.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflowinput.yaml b/workflowinput.yaml index e3b6d25..f296a92 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -1,7 +1,7 @@ workflowname: classical_networking_workflow workflowdescription: This is Classical Molecular Networking for GNPS2 workflowlongdescription: This is Classical Molecular Networking for GNPS2 -workflowversion: "2025.04.11" +workflowversion: "2025.04.23" workflowfile: nf_workflow.nf workflowautohide: false adminonly: false From 9960abbd72b3a3f2715f9de4b3966a2f78ebb48e Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Wed, 23 Apr 2025 15:56:32 -0700 Subject: [PATCH 04/21] fix the input file path for Falcon --- nf_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nf_workflow.nf b/nf_workflow.nf index 707569b..071ef91 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -627,7 +627,7 @@ workflow { (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready) } else if (params.cluster_method == "Falcon"){ - input_falcon_spectra_ch = params.input_spectra + "/*.mzML" + input_falcon_spectra_ch = 
params.input_spectra + "/*" (clustered_spectra_falcon_ch, falcon_tsv_ch) = falcon(input_falcon_spectra_ch, _download_ready) (clusterinfo_ch, clustersummary_ch, clustered_spectra_intermediate_ch) = summarize_falcon(falcon_tsv_ch, clustered_spectra_falcon_ch) } From 913e06f938aa68d71d1b638638c4d07d2cb7a5ad Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Thu, 1 May 2025 10:01:55 -0700 Subject: [PATCH 05/21] updating --- workflowdisplay.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/workflowdisplay.yaml b/workflowdisplay.yaml index c6200a0..f50f3b1 100644 --- a/workflowdisplay.yaml +++ b/workflowdisplay.yaml @@ -307,6 +307,15 @@ Views: - matches_path:TASKLOCATION/[task]/nf_output/modifinder_input.csv - with_library_search:"0" +- name: PostNetworking - MassQL Interactive Analysis + displayname: PostNetworking - MassQL Interactive Analysis + viewname: massqlpostmn + displaytype: linkout + parameters: + baseurl: "https://massqlpostmn.gnps2.org" + urlparam: + - task_id:[task] + - name: section displayname: Download Network Files viewname: section From 3b45ed688a2d85a6919fc1a3b63efd1349b3f091 Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Mon, 7 Jul 2025 10:21:05 -0700 Subject: [PATCH 06/21] using dropdown for clustering --- workflowinput.yaml | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/workflowinput.yaml b/workflowinput.yaml index f296a92..875bd7b 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -1,7 +1,7 @@ workflowname: classical_networking_workflow workflowdescription: This is Classical Molecular Networking for GNPS2 workflowlongdescription: This is Classical Molecular Networking for GNPS2 -workflowversion: "2025.04.23" +workflowversion: "2026.01.13" workflowfile: nf_workflow.nf workflowautohide: false adminonly: false @@ -87,13 +87,24 @@ parameterlist: - displayname: Clustering Parameters paramtype: section - - - displayname: Min Cluster Size - paramtype: text + + - displayname: Minimum 
Cluster Size (Or Disable Clustering) + paramtype: select nf_paramname: min_cluster_size - formplaceholder: Enter the min_cluster_size formvalue: "2" - tooltip: "Set to 0 to turn clustering off, be careful with this setting" + options: + - value: "0" + display: "Clustering Off" + - value: "1" + display: "1" + - value: "2" + display: "2" + - value: "3" + display: "3" + - value: "4" + display: "4" + - value: "5" + display: "5" - displayname: Clustering Tool paramtype: select @@ -135,7 +146,7 @@ parameterlist: - value: "0" display: "no" - - displayname: MassQL Filtering + - displayname: MassQL Filtering (EXPERIMENTAL) paramtype: text nf_paramname: massql_filter formplaceholder: Enter the massql_filter From 96f733c0d015b41182fb854f54c74ff1b3704a83 Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 6 Aug 2025 09:30:41 -0700 Subject: [PATCH 07/21] updating --- workflowdisplay.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/workflowdisplay.yaml b/workflowdisplay.yaml index f50f3b1..6af7541 100644 --- a/workflowdisplay.yaml +++ b/workflowdisplay.yaml @@ -295,6 +295,16 @@ Views: - input_spectra:TASKLOCATION/[task]/input_spectra - description:Downstream from [task] Classical Molecular Networking +- name: PostNetworking - MassQL Interactive Analysis + displayname: PostNetworking - MassQL Interactive Analysis + viewname: massqlpostmn + displaytype: linkout + parameters: + baseurl: "https://massqlpostmn.gnps2.org" + urlparam: + - task_id:[task] + + - name: ModiFinder - Localize Analog Modifications displayname: ModiFinder - Localize Analog Modifications viewname: high_throughput_modifinder From 46f43af1f72f438d33e03504c2001fd3756b00bb Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 8 Oct 2025 11:50:21 -0700 Subject: [PATCH 08/21] getting latest version --- GNPS2_DeploymentTooling | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GNPS2_DeploymentTooling b/GNPS2_DeploymentTooling index a92a048..f0983fa 160000 --- 
a/GNPS2_DeploymentTooling +++ b/GNPS2_DeploymentTooling @@ -1 +1 @@ -Subproject commit a92a0488cd8032ad1274b58c01d6aed47fc7c7c6 +Subproject commit f0983facea2c0604db3113baafb916ba3065fc18 From d946cb0dc39d7c4eeb7386451837cd8bb504a591 Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Wed, 8 Oct 2025 13:04:43 -0700 Subject: [PATCH 09/21] fix similarity mode parameter name --- nf_workflow.nf | 6 +++--- workflowinput.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nf_workflow.nf b/nf_workflow.nf index 071ef91..7b47415 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -28,7 +28,7 @@ params.precursor_filter = "1" // Molecular Networking Options params.topology = "classic" // or can be transitive -params.cal_all_pairs ='gnps' //or can be index +params.similarity ='gnps' //or can be index params.parallelism = 24 params.networking_min_matched_peaks = 6 @@ -657,11 +657,11 @@ workflow { gnps_library_results_ch = gnps_library_results_ch.ifEmpty(file("NO_FILE")) // Networking - if(params.cal_all_pairs == "gnps"){ + if(params.similarity == "gnps"){ params_ch = networkingGNPSPrepParams(clustered_spectra_ch) networking_results_temp_ch = calculatePairs(clustered_spectra_ch, params_ch.collect()) } - else if (params.cal_all_pairs == "index"){ + else if (params.similarity == "index"){ networking_results_temp_ch = calculatePairs_index(clustered_spectra_ch) } merged_networking_pairs_ch = networking_results_temp_ch.collectFile(name: "merged_pairs.tsv", storeDir: "./nf_output/networking", keepHeader: true) diff --git a/workflowinput.yaml b/workflowinput.yaml index 875bd7b..86ee6a3 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -174,9 +174,9 @@ parameterlist: formplaceholder: Enter the networking_max_shift formvalue: "1999" - - displayname: all_pairwise_method + - displayname: Similarity Mode paramtype: select - nf_paramname: cal_all_pairs + nf_paramname: similarity formvalue: "gnps" options: - value: "gnps" From 
c47ca4248ee3d849b80198f501490cedcecd887d Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Mon, 12 Jan 2026 10:14:04 -0800 Subject: [PATCH 10/21] Bump share code (#43) * Bump ShareCode * Bump version. --- GNPS_sharedcode | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GNPS_sharedcode b/GNPS_sharedcode index 632fb36..b647ed6 160000 --- a/GNPS_sharedcode +++ b/GNPS_sharedcode @@ -1 +1 @@ -Subproject commit 632fb365c96d2942da8e5514cecceda8d22b288f +Subproject commit b647ed604af57c88328b03f29a3ed44662db6919 From 0d4555a84d90547543e9f44c6ccebf080eed2103 Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Tue, 13 Jan 2026 12:04:17 -0800 Subject: [PATCH 11/21] add falcon clustering --- bin/conda_env_falcon.yml | 11 +- .../convert_falcon_to_mscluster_format.py | 212 ++++++++++++++ bin/scripts/falcon_wrapper.py | 273 ++++++++++++++++++ nf_workflow.nf | 86 +++--- workflowinput.yaml | 73 +++++ 5 files changed, 606 insertions(+), 49 deletions(-) create mode 100755 bin/scripts/convert_falcon_to_mscluster_format.py create mode 100755 bin/scripts/falcon_wrapper.py diff --git a/bin/conda_env_falcon.yml b/bin/conda_env_falcon.yml index 0fb7f39..d816410 100644 --- a/bin/conda_env_falcon.yml +++ b/bin/conda_env_falcon.yml @@ -5,17 +5,22 @@ channels: channel_priority: strict dependencies: - python=3.10 - - pandas=2.2.0 + - pandas=1.5.0 - pyteomics=4.7.1 + - libglib + - libstdcxx-ng + - gcc=12.2.0 - pip: - numpy==1.23.4 - xmltodict - requests - tqdm - psutil + - h5py - pyopenms==3.0.0 - spectrum-utils==0.3.5 - - falcon-ms - pymzml - numcodecs - - pyarrow \ No newline at end of file + - pyarrow + - joblib + - git+https://github.com/XianghuWang-287/falcon.git@public-version diff --git a/bin/scripts/convert_falcon_to_mscluster_format.py b/bin/scripts/convert_falcon_to_mscluster_format.py new file mode 100755 index 0000000..aa2ccdd --- /dev/null +++ b/bin/scripts/convert_falcon_to_mscluster_format.py @@ -0,0 +1,212 @@ +#!/usr/bin/python + +import sys +import os 
+import argparse +import pandas as pd + + + +def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2): + """ + Convert falcon output to mscluster format. + + Falcon format: cluster, filename, scan, precursor_mz, retention_time, new_batch + MSCluster format: #ClusterIdx, #Filename, #SpecIdx, #Scan, #ParentMass, #Charge, #RetTime, #PrecIntensity + + Note: If falcon doesn't have certain fields (like charge, intensity), we use default values (0) + instead of fetching from MGF files. + """ + # Load falcon CSV + clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#') + + print(f"Loaded {len(clusterinfo_df)} rows from falcon CSV") + print(f"Columns: {clusterinfo_df.columns.tolist()}") + + + if 'identifier' in clusterinfo_df.columns: + + clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply( + lambda x: x.split(":")[2] + ".mzML" if len(x.split(":")) > 2 else (x.split(":")[-2] + ".mzML" if len(x.split(":")) > 1 else "unknown.mzML") + ) + clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply( + lambda x: int(x.split(":")[-1]) if x.split(":")[-1].isdigit() else 0 + ) + + # Ensure we have the required columns + required_cols = ['cluster', 'filename', 'scan', 'precursor_mz', 'retention_time'] + missing_cols = [col for col in required_cols if col not in clusterinfo_df.columns] + + if missing_cols: + print(f"ERROR: Required columns not found in falcon CSV: {missing_cols}") + print(f"Available columns: {clusterinfo_df.columns.tolist()}") + print(f"First few rows:") + print(clusterinfo_df.head()) + sys.exit(1) + + if min_cluster_size > 1: + clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1] + + # Convert to mscluster format + mscluster_rows = [] + spec_idx_counter = 0 + + for idx, row in clusterinfo_df.iterrows(): + cluster_idx = int(row['cluster']) + + # Skip singletons if min_cluster_size > 1 + if cluster_idx == -1 and min_cluster_size > 1: + continue + + # Handle 
cluster indexing: falcon uses 0-based, mscluster uses 1-based + # But we also need to handle -1 (singletons) + if cluster_idx == -1: + # Singletons: use a large number or handle separately + cluster_idx = 999999 # Use a large number for singletons + else: + cluster_idx = cluster_idx + 1 # Convert to 1-based + + filename = str(row['filename']) + scan = int(row['scan']) + precursor_mz = float(row['precursor_mz']) + retention_time = float(row['retention_time']) + + # Use default values for fields that falcon doesn't provide + # Don't fetch from MGF files - just use defaults + charge = 0 + precursor_intensity = 0.0 + + + if 'precursor_charge' in row and pd.notna(row['precursor_charge']): + try: + charge = int(row['precursor_charge']) + except (ValueError, TypeError): + charge = 0 + elif 'charge' in row and pd.notna(row['charge']): + try: + charge = int(row['charge']) + except (ValueError, TypeError): + charge = 0 + + if 'precursor_intensity' in row and pd.notna(row['precursor_intensity']): + try: + precursor_intensity = float(row['precursor_intensity']) + except (ValueError, TypeError): + precursor_intensity = 0.0 + + # Convert retention time to seconds if it's in minutes + # Falcon typically outputs RT in minutes, mscluster uses seconds + if retention_time > 0 and retention_time < 1000: # Likely in minutes + retention_time = retention_time * 60.0 + + if not filename.startswith('input_spectra/'): + filename = f"input_spectra/{filename}" + + spec_idx = spec_idx_counter + spec_idx_counter += 1 + + mscluster_row = { + '#ClusterIdx': cluster_idx, + '#Filename': filename, + '#SpecIdx': spec_idx, + '#Scan': scan, + '#ParentMass': precursor_mz, + '#Charge': charge, + '#RetTime': retention_time, + '#PrecIntensity': precursor_intensity + } + mscluster_rows.append(mscluster_row) + + # Create DataFrame + mscluster_df = pd.DataFrame(mscluster_rows) + + # Filter by min_cluster_size + if min_cluster_size > 1: + # Count spectra per cluster + cluster_counts = 
mscluster_df['#ClusterIdx'].value_counts() + valid_clusters = cluster_counts[cluster_counts >= min_cluster_size].index + mscluster_df = mscluster_df[mscluster_df['#ClusterIdx'].isin(valid_clusters)] + + # Create cluster summary + cluster_summary_rows = [] + for cluster_idx in mscluster_df['#ClusterIdx'].unique(): + cluster_data = mscluster_df[mscluster_df['#ClusterIdx'] == cluster_idx] + num_spectra = len(cluster_data) + + # Calculate mean RT (in minutes) + mean_rt = cluster_data['#RetTime'].mean() / 60.0 + + # Calculate parent mass (mean of #ParentMass) + parent_mass = cluster_data['#ParentMass'].mean() + + # Calculate precursor mass (parent mass / charge, then take mean) + # If charge is 0, use parent mass directly + precursor_masses = [] + for idx, row in cluster_data.iterrows(): + if row['#Charge'] > 0: + precursor_masses.append(row['#ParentMass'] / row['#Charge']) + else: + precursor_masses.append(row['#ParentMass']) + precursor_mass = sum(precursor_masses) / len(precursor_masses) if precursor_masses else parent_mass + + # Calculate precursor charge (most common charge, or mean if no clear mode) + charges = cluster_data['#Charge'].values + charges_nonzero = charges[charges > 0] + if len(charges_nonzero) > 0: + # Use mode (most common charge) using pandas + charge_series = pd.Series(charges_nonzero) + mode_values = charge_series.mode() + if len(mode_values) > 0: + precursor_charge = int(mode_values.iloc[0]) + else: + precursor_charge = int(charges_nonzero.mean()) + else: + precursor_charge = 0 + + # Calculate sum of precursor intensity + sum_precursor_intensity = cluster_data['#PrecIntensity'].sum() + + cluster_summary_row = { + 'cluster index': cluster_idx, + 'number of spectra': num_spectra, + 'parent mass': parent_mass, + 'precursor charge': precursor_charge, + 'precursor mass': precursor_mass, + 'sum(precursor intensity)': sum_precursor_intensity, + 'RTMean': mean_rt + } + cluster_summary_rows.append(cluster_summary_row) + + cluster_summary_df = 
pd.DataFrame(cluster_summary_rows) + cluster_summary_df = cluster_summary_df.sort_values('cluster index') + + # Save outputs + mscluster_df.to_csv(output_clusterinfo, sep='\t', index=False) + cluster_summary_df.to_csv(output_clustersummary, sep='\t', index=False) + + print(f"Converted {len(mscluster_df)} spectra in {len(cluster_summary_df)} clusters") + print(f"Saved clusterinfo to {output_clusterinfo}") + print(f"Saved clustersummary to {output_clustersummary}") + + +def main(): + parser = argparse.ArgumentParser(description='Convert Falcon output to MSCluster format') + parser.add_argument('falcon_csv', help='Falcon CSV output file') + parser.add_argument('input_spectra_folder', help='Input spectra folder (for reference, not used for fetching data)') + parser.add_argument('output_clusterinfo', help='Output clusterinfo.tsv file') + parser.add_argument('output_clustersummary', help='Output clustersummary.tsv file') + parser.add_argument('--min_cluster_size', type=int, default=2, help='Minimum cluster size') + + args = parser.parse_args() + + convert_falcon_to_mscluster_format( + args.falcon_csv, + args.input_spectra_folder, + args.output_clusterinfo, + args.output_clustersummary, + args.min_cluster_size + ) + + +if __name__ == "__main__": + main() diff --git a/bin/scripts/falcon_wrapper.py b/bin/scripts/falcon_wrapper.py new file mode 100755 index 0000000..792fc76 --- /dev/null +++ b/bin/scripts/falcon_wrapper.py @@ -0,0 +1,273 @@ +#!/usr/bin/python + +import sys +import os +import glob +import argparse +import subprocess +import pandas as pd +import ming_spectrum_library + +def run_falcon(input_spectra_path, output_prefix="falcon", + precursor_tol="20 ppm", fragment_tol=0.05, + min_mz_range=0, min_mz=0, max_mz=30000, eps=0.1): + """ + Runs Falcon with specified parameters. + input_spectra_path can be a file or a folder. 
+ """ + # Check if input is a file or folder + if os.path.isfile(input_spectra_path): + # Single file input + all_spectrum_files = [input_spectra_path] + elif os.path.isdir(input_spectra_path): + # Folder input - list all spectrum files + all_mgf_files = glob.glob(os.path.join(input_spectra_path, "*.mgf")) + all_mzxml_files = glob.glob(os.path.join(input_spectra_path, "*.mzXML")) + all_mzml_files = glob.glob(os.path.join(input_spectra_path, "*.mzML")) + + all_spectrum_files = all_mgf_files + all_mzxml_files + all_mzml_files + + if len(all_spectrum_files) == 0: + print(f"ERROR: No spectrum files found in {input_spectra_path}") + sys.exit(1) + + # Sort these filenames + all_spectrum_files.sort() + else: + print(f"ERROR: Input path does not exist: {input_spectra_path}") + sys.exit(1) + + # Create pattern for falcon (it accepts wildcards or file list) + if len(all_spectrum_files) == 1: + mzml_pattern = all_spectrum_files[0] + else: + # Falcon can accept multiple files, we'll pass them as space-separated + mzml_pattern = " ".join(all_spectrum_files) + + # Parse precursor_tol - falcon expects two arguments + # Format can be "2.0" (just number) or "20 ppm" (value and unit) + # Falcon needs two arguments, so we need to handle both cases + precursor_tol_str = str(precursor_tol).strip() + if " " in precursor_tol_str: + # Already has format like "20 ppm" or "2.0 Da" - split into two args + parts = precursor_tol_str.split(None, 1) # Split on first space + if len(parts) == 2: + precursor_tol_args = f"{parts[0]} {parts[1]}" + else: + # Fallback: use value twice + precursor_tol_args = f"{parts[0]} {parts[0]}" + else: + # Just a number, assume Da and use twice (min and max tolerance) + try: + tol_value = float(precursor_tol_str) + precursor_tol_args = f"{tol_value} {tol_value}" + except ValueError: + # If can't parse, use default + precursor_tol_args = "2.0 2.0" + + command = ( + f"falcon {mzml_pattern} {output_prefix} " + f"--export_representatives " + f"--precursor_tol 
{precursor_tol_args} " + f"--fragment_tol {fragment_tol} " + f"--min_mz_range {min_mz_range} " + f"--min_mz {min_mz} --max_mz {max_mz} " + f"--eps {eps} " + f"--hash_len 400 " + f"--n_neighbors_ann 64 " + f"--n_probe 16 " + f"--batch_size 32768 " + ) + print(f"[run_falcon] Running: {command}") + process = subprocess.Popen(command, shell=True) + retcode = process.wait() + if retcode != 0: + print(f"ERROR: Falcon failed with exit code {retcode}") + sys.exit(1) + + +def main(): + # Parse arguments + parser = argparse.ArgumentParser(description='Falcon Clustering Wrapper') + parser.add_argument('input_spectra_folder', help='Input Spectra Folder') + parser.add_argument('output_spectra_folder', help='Output Spectra Folder') + parser.add_argument('final_output_folder', help='final_output_folder') + + parser.add_argument('--min_cluster_size', default="2", help='min_cluster_size (not used by falcon directly)') + + parser.add_argument('--pm_tolerance', default="20 ppm", help='pm_tolerance (precursor tolerance)') + parser.add_argument('--fragment_tolerance', default="0.05", help='fragment_tolerance') + + # Filters (not all are used by falcon) + parser.add_argument('--min_peak_intensity', default="0.0", help='min_peak_intensity (not used by falcon)') + parser.add_argument('--window_filter', default="1", help='window_filter (not used by falcon)') + parser.add_argument('--precursor_filter', default="1", help='precursor_filter (not used by falcon)') + + # Falcon-specific parameters + parser.add_argument('--eps', default="0.1", help='Falcon eps parameter') + parser.add_argument('--min_mz', default="0", help='Falcon min_mz parameter') + parser.add_argument('--max_mz', default="30000", help='Falcon max_mz parameter') + + args = parser.parse_args() + + # Check if input is a file or folder (Nextflow may pass a file) + input_spectra_path = args.input_spectra_folder + if os.path.isfile(args.input_spectra_folder): + # If it's a single file, we need to use its directory + 
input_spectra_path = os.path.dirname(args.input_spectra_folder) + if not input_spectra_path: + input_spectra_path = "." + + # Running falcon + output_prefix = "falcon" + run_falcon(input_spectra_path, output_prefix, + precursor_tol=args.pm_tolerance, + fragment_tol=float(args.fragment_tolerance), + min_mz_range=0, + min_mz=int(args.min_mz), + max_mz=int(args.max_mz), + eps=float(args.eps)) + + # Falcon outputs: + # - falcon.csv (cluster assignments) + # - falcon.mgf (representative spectra) + + falcon_csv = f"{output_prefix}.csv" + falcon_mgf = f"{output_prefix}.mgf" + + if not os.path.exists(falcon_csv): + print(f"ERROR: Falcon output {falcon_csv} not found") + sys.exit(1) + + if not os.path.exists(falcon_mgf): + print(f"ERROR: Falcon output {falcon_mgf} not found") + sys.exit(1) + + # Convert falcon output to mscluster format first + script_dir = os.path.dirname(os.path.abspath(__file__)) + convert_script = os.path.join(script_dir, "convert_falcon_to_mscluster_format.py") + + clusterinfo_file = os.path.join(args.final_output_folder, "clusterinfo.tsv") + clustersummary_file = os.path.join(args.final_output_folder, "clustersummary.tsv") + + # For conversion, we need the original input path to get spectrum info + convert_input_path = input_spectra_path + + convert_cmd = ( + f"python {convert_script} " + f"{falcon_csv} " + f"{convert_input_path} " + f"{clusterinfo_file} " + f"{clustersummary_file} " + f"--min_cluster_size {args.min_cluster_size}" + ) + + print(f"Converting falcon output to mscluster format...") + print(f"Running: {convert_cmd}") + ret_code = os.system(convert_cmd) + + if ret_code != 0: + print(f"ERROR: Conversion failed with exit code {ret_code}") + sys.exit(1) + + # Now update MGF file with correct scan numbers based on clusterinfo.tsv + # The scan number in MGF should match the #ClusterIdx in clusterinfo.tsv + # Read clusterinfo to get cluster indices + clusterinfo_df = pd.read_csv(clusterinfo_file, sep='\t') + + # Get unique cluster indices (these 
will be the scan numbers in MGF) + unique_clusters = sorted(clusterinfo_df['#ClusterIdx'].unique()) + + # Read falcon MGF file - parse manually to access cluster field + output_mgf_filename = os.path.join(args.final_output_folder, "specs_ms.mgf") + os.makedirs(args.final_output_folder, exist_ok=True) + + # Parse falcon MGF to get cluster information and update scan numbers + mgf_spectra = [] + with open(falcon_mgf, 'r') as f: + spec = None + for line in f: + line = line.strip() + if line == 'BEGIN IONS': + spec = {'peaks': [], 'headers': {}} + elif line == 'END IONS': + if spec is not None: + mgf_spectra.append(spec) + spec = None + elif spec is not None: + if '=' in line: + key, val = line.split('=', 1) + spec['headers'][key.upper()] = val + else: + # Peak data + parts = line.split() + if len(parts) == 2: + try: + mz, intensity = float(parts[0]), float(parts[1]) + spec['peaks'].append((mz, intensity)) + except: + pass + + # Create mapping: falcon cluster (0-based) -> mscluster cluster index (1-based, which is #ClusterIdx) + # falcon cluster IDs in MGF are 0-based, but we converted to 1-based in clusterinfo + cluster_to_scan = {} + for cluster_idx in unique_clusters: + # falcon uses 0-based, mscluster uses 1-based + falcon_cluster = cluster_idx - 1 + cluster_to_scan[falcon_cluster] = cluster_idx + + # Write MGF with correct scan numbers + # Only include spectra that correspond to clusters in clusterinfo.tsv + # This ensures MGF and clusterinfo.tsv are consistent + skipped_count = 0 + with open(output_mgf_filename, 'w') as out_mgf: + for spec in mgf_spectra: + # Get cluster ID from falcon MGF + falcon_cluster = -1 + if 'CLUSTER' in spec['headers']: + try: + falcon_cluster = int(spec['headers']['CLUSTER']) + except (ValueError, TypeError): + pass + + # Only include spectra that are in clusterinfo.tsv + # Skip clusters that were filtered out by min_cluster_size + if falcon_cluster in cluster_to_scan: + scan_number = cluster_to_scan[falcon_cluster] + + # Write MGF 
entry with correct scan number + out_mgf.write("BEGIN IONS\n") + + # Write headers, updating SCANS= line + for key, val in spec['headers'].items(): + if key == 'SCANS': + out_mgf.write(f"SCANS={scan_number}\n") + else: + out_mgf.write(f"{key}={val}\n") + + # Add SCANS if it wasn't in the original + if 'SCANS' not in spec['headers']: + out_mgf.write(f"SCANS={scan_number}\n") + + # Write peaks + for mz, intensity in spec['peaks']: + out_mgf.write(f"{mz} {intensity}\n") + + out_mgf.write("END IONS\n\n") + else: + # Skip clusters that are not in clusterinfo.tsv (filtered out by min_cluster_size) + skipped_count += 1 + if falcon_cluster != -1: # Only warn for non-singleton clusters + print(f"INFO: Skipping falcon cluster {falcon_cluster} (filtered out by min_cluster_size)") + + if skipped_count > 0: + print(f"INFO: Skipped {skipped_count} spectra that were filtered out by min_cluster_size") + + print(f"Falcon clustering completed. Output files:") + print(f" - MGF: {output_mgf_filename}") + print(f" - ClusterInfo: {clusterinfo_file}") + print(f" - ClusterSummary: {clustersummary_file}") + + +if __name__ == "__main__": + main() diff --git a/nf_workflow.nf b/nf_workflow.nf index 7b47415..fd74c3e 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -13,14 +13,20 @@ params.metadata_per_file_grouping = "No" // Yes means that each file can be its params.metadata_filename = "data/metadata.tsv" // Clustering Parameters +params.clustering_tool = "mscluster" // or "falcon" params.min_cluster_size = "2" -params.cluster_method = "MS-Cluster" // can be MS-Cluster or Falcon -params.eps = "0.01" // eps for falcon // Tolerance Parameters params.pm_tolerance = "2.0" params.fragment_tolerance = "0.5" +// Falcon-specific parameters +params.falcon_pm_tolerance = "20 ppm" +params.falcon_fragment_tolerance = "0.05" +params.falcon_eps = "0.1" +params.falcon_min_mz = "0" +params.falcon_max_mz = "30000" + // Filtering params.min_peak_intensity = "0.0" params.window_filter = "1" @@ -116,52 +122,37 
@@ process mscluster { } process falcon { - publishDir "$params.publishdir/nf_output/clustering", mode: 'copy' + publishDir "$params.publishdir/nf_output", mode: 'copy' conda "$TOOL_FOLDER/conda_env_falcon.yml" + + // This is necessary because the glibc libraries are not always used in the conda environment, and defaults to the system which could be old + beforeScript 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib' input: - val inputSpectra + file inputSpectra val ready output: - file 'falcon.mgf' - file 'falcon.csv' + file 'clustering/specs_ms.mgf' + file 'clustering/clusterinfo.tsv' + file 'clustering/clustersummary.tsv' """ mkdir clustering - falcon \ - ${inputSpectra} \ - falcon --export_representatives \ - --precursor_tol 0.05 Da \ - --fragment_tol 0.05 \ - --min_mz_range 0 \ - --min_mz 0 \ - --max_mz 2000 \ - --min_samples $params.min_cluster_size \ - --eps $params.eps --work_dir clustering \ - --overwrite - """ -} - -process summarize_falcon { - publishDir "$params.publishdir/nf_output/clustering", mode: 'copy' - - conda "$TOOL_FOLDER/conda_env.yml" - - input: - file input_csv - file input_mgf - - output: - file 'clusterinfo.tsv' - file 'clustersummary.tsv' - file 'specs_ms.mgf' - - """ - python $TOOL_FOLDER/scripts/summarize_results_falcon.py \ - ${input_csv} \ - ${input_mgf} + python $TOOL_FOLDER/scripts/falcon_wrapper.py \ + $inputSpectra \ + spectra \ + clustering \ + --min_cluster_size $params.min_cluster_size \ + --pm_tolerance "$params.falcon_pm_tolerance" \ + --fragment_tolerance $params.falcon_fragment_tolerance \ + --min_peak_intensity $params.min_peak_intensity \ + --window_filter $params.window_filter \ + --precursor_filter $params.precursor_filter \ + --eps $params.falcon_eps \ + --min_mz $params.falcon_min_mz \ + --max_mz $params.falcon_max_mz """ } @@ -611,8 +602,7 @@ process PrepareForModiFinder{ } workflow { - -// Preps input spectrum files + // Preps input spectrum files input_spectra_ch = Channel.fromPath(params.input_spectra) 
// Downloads input data and lists privtae spectra @@ -622,14 +612,18 @@ workflow { // File summaries filesummary(input_spectra_ch, _download_ready) + // Note: For subsequent processes (library search, networking), they use pm_tolerance and fragment_tolerance + // These are set to mscluster defaults. If using falcon, users should be aware that downstream processes + // will use the mscluster tolerance values unless they also update those parameters. + // This is acceptable because downstream processes work on clustered spectra, and the tolerance + // for library search and networking can be different from clustering tolerance. + // Clustering - if(params.cluster_method == "MS-Cluster"){ - (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready) + if(params.clustering_tool == "falcon"){ + (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = falcon(input_spectra_ch, _download_ready) } - else if (params.cluster_method == "Falcon"){ - input_falcon_spectra_ch = params.input_spectra + "/*" - (clustered_spectra_falcon_ch, falcon_tsv_ch) = falcon(input_falcon_spectra_ch, _download_ready) - (clusterinfo_ch, clustersummary_ch, clustered_spectra_intermediate_ch) = summarize_falcon(falcon_tsv_ch, clustered_spectra_falcon_ch) + else{ + (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready) } if(params.massql_filter != "None"){ diff --git a/workflowinput.yaml b/workflowinput.yaml index 86ee6a3..6164359 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -88,6 +88,17 @@ parameterlist: - displayname: Clustering Parameters paramtype: section + - displayname: Clustering Tool + paramtype: select + nf_paramname: clustering_tool + formvalue: "mscluster" + options: + - value: "mscluster" + display: "MSCluster" + - value: "falcon" + display: "Falcon" + tooltip: "Select the clustering tool to use for spectral clustering" + - displayname: 
Minimum Cluster Size (Or Disable Clustering) paramtype: select nf_paramname: min_cluster_size @@ -105,6 +116,68 @@ parameterlist: display: "4" - value: "5" display: "5" + + - displayname: Falcon Parameters + paramtype: section + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Precursor Ion Tolerance + paramtype: text + nf_paramname: falcon_pm_tolerance + formplaceholder: Enter the pm_tolerance (e.g., 20 ppm or 2.0 Da) + formvalue: "20 ppm" + tooltip: "Precursor tolerance with unit (e.g., 20 ppm or 2.0 Da)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Fragment Ion Tolerance + paramtype: text + nf_paramname: falcon_fragment_tolerance + formplaceholder: Enter the fragment_tolerance + formvalue: "0.05" + tooltip: "Fragment tolerance (Da)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Falcon Epsilon (eps) + paramtype: text + nf_paramname: falcon_eps + formplaceholder: Enter the falcon_eps + formvalue: "0.1" + tooltip: "Falcon clustering epsilon parameter (only used when clustering_tool is falcon)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Falcon Min MZ + paramtype: text + nf_paramname: falcon_min_mz + formplaceholder: Enter the falcon_min_mz + formvalue: "0" + tooltip: "Falcon minimum m/z value (only used when clustering_tool is falcon)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Falcon Max MZ + paramtype: text + nf_paramname: falcon_max_mz + formplaceholder: Enter the falcon_max_mz + formvalue: "30000" + tooltip: "Falcon maximum m/z value (only used when clustering_tool is falcon)" + showif: + - condition: + - key: clustering_tool + value: "falcon" - displayname: Clustering Tool paramtype: select From 10cbf66ddf641ce706c84c5db9986ebbdf4e3a2c Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Tue, 13 Jan 2026 14:12:21 -0800 Subject: [PATCH 12/21] bugfix --- 
bin/conda_env_falcon.yml | 2 +- workflowinput.yaml | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/bin/conda_env_falcon.yml b/bin/conda_env_falcon.yml index d816410..3c27485 100644 --- a/bin/conda_env_falcon.yml +++ b/bin/conda_env_falcon.yml @@ -23,4 +23,4 @@ dependencies: - numcodecs - pyarrow - joblib - - git+https://github.com/XianghuWang-287/falcon.git@public-version + - falcon-ms diff --git a/workflowinput.yaml b/workflowinput.yaml index 6164359..fb869a5 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -179,16 +179,6 @@ parameterlist: - key: clustering_tool value: "falcon" - - displayname: Clustering Tool - paramtype: select - nf_paramname: cluster_method - formvalue: "MS-Cluster" - options: - - value: "MS-Cluster" - display: "MS-Cluster" - - value: "Falcon" - display: "Falcon" - - displayname: Advanced Filtering Parameters paramtype: section From 71457a600f543ad375c98c9686cdcd84c0b5ac3a Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Tue, 13 Jan 2026 14:36:38 -0800 Subject: [PATCH 13/21] update falcon env --- bin/conda_env_falcon.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bin/conda_env_falcon.yml b/bin/conda_env_falcon.yml index 3c27485..1bb31cf 100644 --- a/bin/conda_env_falcon.yml +++ b/bin/conda_env_falcon.yml @@ -5,22 +5,17 @@ channels: channel_priority: strict dependencies: - python=3.10 - - pandas=1.5.0 + - pandas=2.2.0 - pyteomics=4.7.1 - - libglib - - libstdcxx-ng - - gcc=12.2.0 - pip: - numpy==1.23.4 - xmltodict - requests - tqdm - psutil - - h5py - pyopenms==3.0.0 - spectrum-utils==0.3.5 + - falcon-ms - pymzml - numcodecs - pyarrow - - joblib - - falcon-ms From 16e6704dd099b76bff0f30b951eeed7eab697817 Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Fri, 23 Jan 2026 13:30:51 -0800 Subject: [PATCH 14/21] bug fix for cluster id mapping --- .../convert_falcon_to_mscluster_format.py | 42 ++++++++++- bin/scripts/falcon_wrapper.py | 71 +++++++++++++------ 2 files 
changed, 90 insertions(+), 23 deletions(-) diff --git a/bin/scripts/convert_falcon_to_mscluster_format.py b/bin/scripts/convert_falcon_to_mscluster_format.py index aa2ccdd..e1310f7 100755 --- a/bin/scripts/convert_falcon_to_mscluster_format.py +++ b/bin/scripts/convert_falcon_to_mscluster_format.py @@ -120,6 +120,16 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_ # Create DataFrame mscluster_df = pd.DataFrame(mscluster_rows) + # IMPORTANT: Before filtering, save the original falcon cluster ID (0-based) for each row + # This will help us match falcon MGF clusters to clusterinfo clusters later + # The original falcon cluster ID is stored in the 'cluster' column from falcon.csv + # We need to track: original_falcon_cluster (0-based) -> sequential_index (1-based) + + # First, add a column to track original falcon cluster (0-based) before filtering + # We'll need to reconstruct this from the cluster_idx we converted + # cluster_idx was converted from falcon's 0-based to 1-based, so original = cluster_idx - 1 + mscluster_df['_original_falcon_cluster'] = mscluster_df['#ClusterIdx'] - 1 + # Filter by min_cluster_size if min_cluster_size > 1: # Count spectra per cluster @@ -127,9 +137,29 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_ valid_clusters = cluster_counts[cluster_counts >= min_cluster_size].index mscluster_df = mscluster_df[mscluster_df['#ClusterIdx'].isin(valid_clusters)] - # Create cluster summary + # IMPORTANT: Remap cluster indices to sequential (1, 2, 3, ...) 
for consistency + # This ensures clusterinfo.tsv, clustersummary.tsv, and MGF SCANS all use the same sequential indices + # This is necessary for compatibility with ExecMolecularParallelPairs which uses index-based CLUSTERID + original_clusters = sorted(mscluster_df['#ClusterIdx'].unique()) + cluster_remap = {orig: new for new, orig in enumerate(original_clusters, start=1)} + + # Create mapping: original falcon cluster (0-based) -> sequential index (1-based) + # This mapping will be used in falcon_wrapper.py to match MGF clusters + falcon_cluster_to_sequential = {} + for orig_cluster_idx in original_clusters: + original_falcon_cluster = orig_cluster_idx - 1 # Convert back to 0-based + sequential_idx = cluster_remap[orig_cluster_idx] + falcon_cluster_to_sequential[original_falcon_cluster] = sequential_idx + + # Remap #ClusterIdx in mscluster_df + mscluster_df['#ClusterIdx'] = mscluster_df['#ClusterIdx'].map(cluster_remap) + + # Remove the temporary column + mscluster_df = mscluster_df.drop(columns=['_original_falcon_cluster']) + + # Create cluster summary with remapped indices cluster_summary_rows = [] - for cluster_idx in mscluster_df['#ClusterIdx'].unique(): + for cluster_idx in sorted(mscluster_df['#ClusterIdx'].unique()): cluster_data = mscluster_df[mscluster_df['#ClusterIdx'] == cluster_idx] num_spectra = len(cluster_data) @@ -167,7 +197,7 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_ sum_precursor_intensity = cluster_data['#PrecIntensity'].sum() cluster_summary_row = { - 'cluster index': cluster_idx, + 'cluster index': cluster_idx, # Already remapped to sequential 'number of spectra': num_spectra, 'parent mass': parent_mass, 'precursor charge': precursor_charge, @@ -180,6 +210,11 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_ cluster_summary_df = pd.DataFrame(cluster_summary_rows) cluster_summary_df = cluster_summary_df.sort_values('cluster index') + # Ensure cluster index is string 
type to match network graph node types + # Network graph nodes from pairs file (CLUSTERID1/CLUSTERID2) are typically strings + # This ensures type matching when add_clusterinfo_summary_to_graph checks "if cluster_index in G" + cluster_summary_df['cluster index'] = cluster_summary_df['cluster index'].astype(str) + # Save outputs mscluster_df.to_csv(output_clusterinfo, sep='\t', index=False) cluster_summary_df.to_csv(output_clustersummary, sep='\t', index=False) @@ -187,6 +222,7 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_ print(f"Converted {len(mscluster_df)} spectra in {len(cluster_summary_df)} clusters") print(f"Saved clusterinfo to {output_clusterinfo}") print(f"Saved clustersummary to {output_clustersummary}") + print(f"Note: Cluster indices have been remapped to sequential (1, 2, 3, ...) for consistency") def main(): diff --git a/bin/scripts/falcon_wrapper.py b/bin/scripts/falcon_wrapper.py index 792fc76..f984a71 100755 --- a/bin/scripts/falcon_wrapper.py +++ b/bin/scripts/falcon_wrapper.py @@ -171,11 +171,12 @@ def main(): sys.exit(1) # Now update MGF file with correct scan numbers based on clusterinfo.tsv - # The scan number in MGF should match the #ClusterIdx in clusterinfo.tsv + # IMPORTANT: clusterinfo.tsv now uses sequential indices (1, 2, 3, ...) + # The scan number in MGF should match the #ClusterIdx in clusterinfo.tsv (which is already sequential) # Read clusterinfo to get cluster indices clusterinfo_df = pd.read_csv(clusterinfo_file, sep='\t') - # Get unique cluster indices (these will be the scan numbers in MGF) + # Get unique cluster indices (these are already sequential: 1, 2, 3, ...) 
unique_clusters = sorted(clusterinfo_df['#ClusterIdx'].unique()) # Read falcon MGF file - parse manually to access cluster field @@ -208,18 +209,42 @@ def main(): except: pass - # Create mapping: falcon cluster (0-based) -> mscluster cluster index (1-based, which is #ClusterIdx) - # falcon cluster IDs in MGF are 0-based, but we converted to 1-based in clusterinfo - cluster_to_scan = {} - for cluster_idx in unique_clusters: - # falcon uses 0-based, mscluster uses 1-based - falcon_cluster = cluster_idx - 1 - cluster_to_scan[falcon_cluster] = cluster_idx - - # Write MGF with correct scan numbers - # Only include spectra that correspond to clusters in clusterinfo.tsv - # This ensures MGF and clusterinfo.tsv are consistent + # IMPORTANT: clusterinfo.tsv now uses sequential indices (1, 2, 3, ...) + # We need to map falcon clusters (0-based) to these sequential indices + # Rebuild the mapping by reading falcon.csv and matching with clusterinfo.tsv + # This avoids needing a separate JSON file that might not be passed through Nextflow + + # Read falcon.csv to get original falcon cluster assignments + falcon_cluster_to_sequential = {} + if os.path.exists(falcon_csv): + falcon_df = pd.read_csv(falcon_csv, sep=',', comment='#') + + # Filter by min_cluster_size (same as in convert_falcon_to_mscluster_format.py) + if int(args.min_cluster_size) > 1: + falcon_df = falcon_df[falcon_df['cluster'] != -1] + + # Group by original falcon cluster (0-based) + # The sequential index in clusterinfo corresponds to the order after filtering + # We'll match by grouping falcon clusters and assigning sequential indices + original_clusters = sorted(falcon_df['cluster'].unique()) + sequential_idx = 1 + for orig_falcon_cluster in original_clusters: + # Count spectra in this cluster + cluster_spectra = falcon_df[falcon_df['cluster'] == orig_falcon_cluster] + if len(cluster_spectra) >= int(args.min_cluster_size): + falcon_cluster_to_sequential[orig_falcon_cluster] = sequential_idx + sequential_idx += 
1 + + print(f"Rebuilt falcon cluster mapping: {len(falcon_cluster_to_sequential)} clusters") + else: + print(f"WARNING: falcon.csv not found at {falcon_csv}, cannot rebuild mapping") + falcon_cluster_to_sequential = {} + + # Write MGF with sequential scan numbers matching clusterinfo.tsv + # Since clusterinfo.tsv already has sequential indices, we'll use them directly skipped_count = 0 + processed_clusters = set() # Track which sequential cluster indices we've processed + with open(output_mgf_filename, 'w') as out_mgf: for spec in mgf_spectra: # Get cluster ID from falcon MGF @@ -230,12 +255,12 @@ def main(): except (ValueError, TypeError): pass - # Only include spectra that are in clusterinfo.tsv - # Skip clusters that were filtered out by min_cluster_size - if falcon_cluster in cluster_to_scan: - scan_number = cluster_to_scan[falcon_cluster] + # Map falcon cluster (0-based) to sequential index (1-based) + if falcon_cluster in falcon_cluster_to_sequential: + sequential_idx = falcon_cluster_to_sequential[falcon_cluster] + scan_number = sequential_idx # Use sequential index as SCANS - # Write MGF entry with correct scan number + # Write MGF entry with sequential scan number out_mgf.write("BEGIN IONS\n") # Write headers, updating SCANS= line @@ -254,11 +279,17 @@ def main(): out_mgf.write(f"{mz} {intensity}\n") out_mgf.write("END IONS\n\n") + processed_clusters.add(sequential_idx) else: - # Skip clusters that are not in clusterinfo.tsv (filtered out by min_cluster_size) + # Skip clusters that are not in the mapping (filtered out by min_cluster_size) skipped_count += 1 if falcon_cluster != -1: # Only warn for non-singleton clusters - print(f"INFO: Skipping falcon cluster {falcon_cluster} (filtered out by min_cluster_size)") + print(f"INFO: Skipping falcon cluster {falcon_cluster} (filtered out by min_cluster_size or not in mapping)") + + if skipped_count > 0: + print(f"INFO: Skipped {skipped_count} spectra that were filtered out by min_cluster_size") + + print(f"Wrote 
{len(processed_clusters)} spectra to MGF file with sequential SCANS (1, 2, 3, ...)") if skipped_count > 0: print(f"INFO: Skipped {skipped_count} spectra that were filtered out by min_cluster_size") From c1df14fd96b51c94d9c3bf648a73f0807254aa11 Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Fri, 23 Jan 2026 13:51:59 -0800 Subject: [PATCH 15/21] update workflow version --- workflowinput.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflowinput.yaml b/workflowinput.yaml index fb869a5..523543c 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -1,7 +1,7 @@ workflowname: classical_networking_workflow workflowdescription: This is Classical Molecular Networking for GNPS2 workflowlongdescription: This is Classical Molecular Networking for GNPS2 -workflowversion: "2026.01.13" +workflowversion: "2026.01.23" workflowfile: nf_workflow.nf workflowautohide: false adminonly: false From 075a972080f320f9e4742ee888a7bb8a470714c2 Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Fri, 6 Feb 2026 12:10:13 -0800 Subject: [PATCH 16/21] using library summary with a module --- .gitmodules | 3 +++ bin/NextflowModules | 1 + nf_workflow.nf | 43 ++++++++++++++++++++----------------------- 3 files changed, 24 insertions(+), 23 deletions(-) create mode 160000 bin/NextflowModules diff --git a/.gitmodules b/.gitmodules index b0a66e4..e4e3b30 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "bin/scripts/downloadpublicdata"] path = bin/scripts/downloadpublicdata url = https://github.com/Wang-Bioinformatics-Lab/downloadpublicdata.git +[submodule "bin/NextflowModules"] + path = bin/NextflowModules + url = https://github.com/Wang-Bioinformatics-Lab/NextflowModules.git diff --git a/bin/NextflowModules b/bin/NextflowModules new file mode 160000 index 0000000..ad68182 --- /dev/null +++ b/bin/NextflowModules @@ -0,0 +1 @@ +Subproject commit ad681820f3795f45c772946b9b3b6da1baac580b diff --git a/nf_workflow.nf b/nf_workflow.nf index 
fd74c3e..b849129 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -74,9 +74,28 @@ params.cache_directory = "data/cache" params.publishdir = "$baseDir" TOOL_FOLDER = "$baseDir/bin" +MODULES_FOLDER = "$TOOL_FOLDER/NextflowModules" + +// COMPATIBILITY NOTE: The following might be necessary if this workflow is being deployed in a slightly different environment +// checking if outdir is defined, +// if so, then set publishdir to outdir +if (params.outdir) { + _publishdir = params.outdir +} +else{ + _publishdir = params.publishdir +} + +// Augmenting with nf_output +_publishdir = "${_publishdir}/nf_output" + + +// A lot of useful modules are already implemented and added to the nextflow modules, you can import them to use +// the publishdir is a key word that we're using around all our modules to control where the output files will be saved +include {summaryLibrary} from "$MODULES_FOLDER/nf_library_search_modules.nf" process filesummary { - publishDir "$params.publishdir/nf_output", mode: 'copy' + publishDir "$_publishdir", mode: 'copy' conda "$TOOL_FOLDER/conda_env.yml" @@ -533,28 +552,6 @@ process prepInputFiles { """ } -process summaryLibrary { - publishDir "$params.publishdir/nf_output", mode: 'copy' - - maxForks 8 - - cache 'lenient' - - conda "$TOOL_FOLDER/conda_env.yml" - - input: - path library_file - - output: - path '*.tsv' optional true - - """ - python $TOOL_FOLDER/scripts/library_summary.py \ - $library_file \ - ${library_file}.tsv - """ -} - process createFeatureTable { publishDir "$params.publishdir/nf_output/clustering", mode: 'copy' From f315e7c98bb0b9bc3bbf5ccc0a9cda0f1652cfa2 Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Fri, 6 Feb 2026 12:10:59 -0800 Subject: [PATCH 17/21] saving file --- nf_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nf_workflow.nf b/nf_workflow.nf index b849129..2095410 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -641,7 +641,7 @@ workflow { library_summary_ch =
summaryLibrary(libraries_ch) // Merging all these tsv files from library_summary_ch within nextflow - library_summary_merged_ch = library_summary_ch.collectFile(name: "library_summary.tsv", keepHeader: true) + library_summary_ch.collectFile(name: 'librarysummary.tsv', keepHeader: true, storeDir: _publishdir + "/librarysummary") library_summary_merged_ch = library_summary_merged_ch.ifEmpty(file("NO_FILE")) gnps_library_results_ch = librarygetGNPSAnnotations(merged_results_ch, library_summary_merged_ch) From ae66e586c408fa5c987bd7088140b23d7a2e528c Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 11 Feb 2026 21:14:07 -0800 Subject: [PATCH 18/21] place holder for library --- workflowdisplay.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflowdisplay.yaml b/workflowdisplay.yaml index 6af7541..434f0e3 100644 --- a/workflowdisplay.yaml +++ b/workflowdisplay.yaml @@ -100,8 +100,13 @@ Views: - title: "Smiles" data: Smiles columnDefs: '[ {"targets": 0,"data": null,"render": function ( data, type, row, meta ) { + if (row["SpectrumID"] && row["SpectrumID"].includes("CCMSLIB")) { + return ` + View Mirror + `; + } return ` - View Mirror + CCMSLIB view coming soon `;}}, {"targets": 10,"data": null,"render": function ( data, type, row, meta ) { return ` From 5f03570268b9a5df7b7d63efe9c2cb0e3b0242c1 Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 11 Feb 2026 21:14:16 -0800 Subject: [PATCH 19/21] cleanup rendering --- Makefile | 5 ++++- nf_workflow.nf | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8c0d434..1edcc5d 100644 --- a/Makefile +++ b/Makefile @@ -6,4 +6,7 @@ run_usi_download: --download_usi_filename=./data/usi_files/input_files.tsv --cache_directory=./data/cache run_transitive: - nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive \ No newline at end of file + nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive + 
+init_modules: + git submodule update --init --recursive \ No newline at end of file diff --git a/nf_workflow.nf b/nf_workflow.nf index 2095410..1ee91d0 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -641,7 +641,7 @@ workflow { library_summary_ch = summaryLibrary(libraries_ch) // Merging all these tsv files from library_summary_ch within nextflow - library_summary_ch.collectFile(name: 'librarysummary.tsv', keepHeader: true, storeDir: _publishdir + "/librarysummary") + library_summary_merged_ch = library_summary_ch.collectFile(name: 'librarysummary.tsv', keepHeader: true, storeDir: _publishdir + "/librarysummary") library_summary_merged_ch = library_summary_merged_ch.ifEmpty(file("NO_FILE")) gnps_library_results_ch = librarygetGNPSAnnotations(merged_results_ch, library_summary_merged_ch) From 1da51dc6a7789529e440b3ab4e15dd131effb56c Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 11 Feb 2026 21:17:27 -0800 Subject: [PATCH 20/21] cleanup --- nf_workflow.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nf_workflow.nf b/nf_workflow.nf index 1ee91d0..77e2b5e 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -575,7 +575,7 @@ process createFeatureTable { } -process PrepareForModiFinder{ +process prepareForModiFinder{ publishDir "$params.publishdir/nf_output", mode: 'copy' errorStrategy 'ignore' @@ -701,6 +701,6 @@ workflow { createFeatureTable(clusterinfo_ch) // Preparing for Modifinder - PrepareForModiFinder(gnps_library_results_ch, filtered_networking_pairs_enriched_ch) + prepareForModiFinder(gnps_library_results_ch, filtered_networking_pairs_enriched_ch) } \ No newline at end of file From 1d84df19377d51d01f07eb9fab10a0716fe9b5d2 Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 11 Feb 2026 22:02:58 -0800 Subject: [PATCH 21/21] fixing linkouts for private libraries --- bin/NextflowModules | 2 +- nf_workflow.nf | 39 +++++++++++++++++++++------------------ workflowdisplay.yaml | 15 +++++++++++---- 3 files changed, 33 
insertions(+), 23 deletions(-) diff --git a/bin/NextflowModules b/bin/NextflowModules index ad68182..f357f3a 160000 --- a/bin/NextflowModules +++ b/bin/NextflowModules @@ -1 +1 @@ -Subproject commit ad681820f3795f45c772946b9b3b6da1baac580b +Subproject commit f357f3ae993553b011347a5af9d46d9318ff7b20 diff --git a/nf_workflow.nf b/nf_workflow.nf index 77e2b5e..2316d93 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -93,6 +93,7 @@ _publishdir = "${_publishdir}/nf_output" // A lot of useful modules are already implemented and added to the nextflow modules, you can import them to use // the publishdir is a key word that we're using around all our modules to control where the output files will be saved include {summaryLibrary} from "$MODULES_FOLDER/nf_library_search_modules.nf" +include {librarygetGNPSAnnotations} from "$MODULES_FOLDER/nf_library_search_modules.nf" addParams(publishdir: "$_publishdir/library") process filesummary { publishDir "$_publishdir", mode: 'copy' @@ -242,28 +243,28 @@ process librarymergeResults { """ } -process librarygetGNPSAnnotations { - publishDir "$params.publishdir/nf_output/library", mode: 'copy' +// process librarygetGNPSAnnotations { +// publishDir "$params.publishdir/nf_output/library", mode: 'copy' - //cache 'lenient' - cache 'false' +// //cache 'lenient' +// cache 'false' - conda "$TOOL_FOLDER/conda_env.yml" +// conda "$TOOL_FOLDER/conda_env.yml" - input: - path "merged_results.tsv" - path "library_summary.tsv" +// input: +// path "merged_results.tsv" +// path "library_summary.tsv" - output: - path 'merged_results_with_gnps.tsv' +// output: +// path 'merged_results_with_gnps.tsv' - """ - python $TOOL_FOLDER/scripts/getGNPS_library_annotations.py \ - merged_results.tsv \ - merged_results_with_gnps.tsv \ - --librarysummary library_summary.tsv - """ -} +// """ +// python $TOOL_FOLDER/scripts/getGNPS_library_annotations.py \ +// merged_results.tsv \ +// merged_results_with_gnps.tsv \ +// --librarysummary library_summary.tsv +// """ 
+// } // Molecular Networking process networkingGNPSPrepParams { @@ -644,7 +645,9 @@ workflow { library_summary_merged_ch = library_summary_ch.collectFile(name: 'librarysummary.tsv', keepHeader: true, storeDir: _publishdir + "/librarysummary") library_summary_merged_ch = library_summary_merged_ch.ifEmpty(file("NO_FILE")) - gnps_library_results_ch = librarygetGNPSAnnotations(merged_results_ch, library_summary_merged_ch) + // Getting library annotations + force_offline = "No" // This can be set to Yes to avoid any online queries to GNPS, which is useful for testing or if you have a local copy of the GNPS library + gnps_library_results_ch = librarygetGNPSAnnotations(merged_results_ch, library_summary_merged_ch, "1", "0", force_offline) gnps_library_results_ch = gnps_library_results_ch.ifEmpty(file("NO_FILE")) // Networking diff --git a/workflowdisplay.yaml b/workflowdisplay.yaml index 434f0e3..91a5c44 100644 --- a/workflowdisplay.yaml +++ b/workflowdisplay.yaml @@ -98,7 +98,11 @@ Views: - title: "SpectrumID" data: SpectrumID - title: "Smiles" - data: Smiles + data: Smiles + - title: "Library Name" + data: LibraryName + - title: "Library Scan" + data: LibScan columnDefs: '[ {"targets": 0,"data": null,"render": function ( data, type, row, meta ) { if (row["SpectrumID"] && row["SpectrumID"].includes("CCMSLIB")) { return ` @@ -106,12 +110,15 @@ Views: `; } return ` - CCMSLIB view coming soon + View Mirror `;}}, {"targets": 10,"data": null,"render": function ( data, type, row, meta ) { return ` - `;}},]' + `;}}, + {"targets": [11, 12], "visible": false}]' + + - name: Network Components List displayname: Network Components List @@ -362,4 +369,4 @@ Views: viewname: metadatadownload displaytype: download parameters: - filename: nf_output/metadata/merged_metadata.tsv \ No newline at end of file + filename: nf_output/metadata/merged_metadata.tsvtsv \ No newline at end of file