diff --git a/.gitmodules b/.gitmodules
index b0a66e4..e4e3b30 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
[submodule "bin/scripts/downloadpublicdata"]
path = bin/scripts/downloadpublicdata
url = https://github.com/Wang-Bioinformatics-Lab/downloadpublicdata.git
+[submodule "bin/NextflowModules"]
+ path = bin/NextflowModules
+ url = https://github.com/Wang-Bioinformatics-Lab/NextflowModules.git
diff --git a/Makefile b/Makefile
index 8c0d434..1edcc5d 100644
--- a/Makefile
+++ b/Makefile
@@ -6,4 +6,7 @@ run_usi_download:
--download_usi_filename=./data/usi_files/input_files.tsv --cache_directory=./data/cache
run_transitive:
- nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive
\ No newline at end of file
+ nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive
+
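+# Fetch the pinned NextflowModules submodule (run once after cloning)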
+init_modules:
+ git submodule update --init --recursive
\ No newline at end of file
diff --git a/bin/NextflowModules b/bin/NextflowModules
new file mode 160000
index 0000000..f357f3a
--- /dev/null
+++ b/bin/NextflowModules
@@ -0,0 +1 @@
+Subproject commit f357f3ae993553b011347a5af9d46d9318ff7b20
diff --git a/bin/conda_env_falcon.yml b/bin/conda_env_falcon.yml
new file mode 100644
index 0000000..1bb31cf
--- /dev/null
+++ b/bin/conda_env_falcon.yml
@@ -0,0 +1,21 @@
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+channel_priority: strict
+dependencies:
+ - python=3.10
+ - pandas=2.2.0
+ - pyteomics=4.7.1
+ - pip:
+ - numpy==1.23.4
+ - xmltodict
+ - requests
+ - tqdm
+ - psutil
+ - pyopenms==3.0.0
+ - spectrum-utils==0.3.5
+ - falcon-ms
+ - pymzml
+ - numcodecs
+ - pyarrow
diff --git a/bin/scripts/convert_falcon_to_mscluster_format.py b/bin/scripts/convert_falcon_to_mscluster_format.py
new file mode 100755
index 0000000..e1310f7
--- /dev/null
+++ b/bin/scripts/convert_falcon_to_mscluster_format.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import argparse
+import pandas as pd
+
+
+
+def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2):
+ """
+ Convert falcon output to mscluster format.
+
+ Falcon format: cluster, filename, scan, precursor_mz, retention_time, new_batch
+ MSCluster format: #ClusterIdx, #Filename, #SpecIdx, #Scan, #ParentMass, #Charge, #RetTime, #PrecIntensity
+
+ Note: If falcon doesn't have certain fields (like charge, intensity), we use default values (0)
+ instead of fetching from MGF files.
+ """
+ # Load falcon CSV
+ clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#')
+
+ print(f"Loaded {len(clusterinfo_df)} rows from falcon CSV")
+ print(f"Columns: {clusterinfo_df.columns.tolist()}")
+
+
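+    # If falcon reports USI-style identifiers instead of explicit filename/scan
+    # columns, recover them positionally. The layout assumed here (an assumption
+    # about the upstream USI format) is "mzspec:<task>:<filename>:scan:<scan>".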
+ if 'identifier' in clusterinfo_df.columns:
+
+ clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply(
+ lambda x: x.split(":")[2] + ".mzML" if len(x.split(":")) > 2 else (x.split(":")[-2] + ".mzML" if len(x.split(":")) > 1 else "unknown.mzML")
+ )
+ clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply(
+ lambda x: int(x.split(":")[-1]) if x.split(":")[-1].isdigit() else 0
+ )
+
+ # Ensure we have the required columns
+ required_cols = ['cluster', 'filename', 'scan', 'precursor_mz', 'retention_time']
+ missing_cols = [col for col in required_cols if col not in clusterinfo_df.columns]
+
+ if missing_cols:
+ print(f"ERROR: Required columns not found in falcon CSV: {missing_cols}")
+ print(f"Available columns: {clusterinfo_df.columns.tolist()}")
+ print(f"First few rows:")
+ print(clusterinfo_df.head())
+ sys.exit(1)
+
+ if min_cluster_size > 1:
+ clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1]
+
+ # Convert to mscluster format
+ mscluster_rows = []
+ spec_idx_counter = 0
+
+ for idx, row in clusterinfo_df.iterrows():
+ cluster_idx = int(row['cluster'])
+
+ # Skip singletons if min_cluster_size > 1
+ if cluster_idx == -1 and min_cluster_size > 1:
+ continue
+
+        # Handle cluster indexing: falcon uses 0-based, mscluster uses 1-based.
+        # We also need to handle -1 (singletons).
+        if cluster_idx == -1:
+            # Sentinel for singletons. Note that all singletons share this index,
+            # so this path is only safe when they are filtered out (min_cluster_size > 1).
+            cluster_idx = 999999
+ else:
+ cluster_idx = cluster_idx + 1 # Convert to 1-based
+
+ filename = str(row['filename'])
+ scan = int(row['scan'])
+ precursor_mz = float(row['precursor_mz'])
+ retention_time = float(row['retention_time'])
+
+ # Use default values for fields that falcon doesn't provide
+ # Don't fetch from MGF files - just use defaults
+ charge = 0
+ precursor_intensity = 0.0
+
+
+ if 'precursor_charge' in row and pd.notna(row['precursor_charge']):
+ try:
+ charge = int(row['precursor_charge'])
+ except (ValueError, TypeError):
+ charge = 0
+ elif 'charge' in row and pd.notna(row['charge']):
+ try:
+ charge = int(row['charge'])
+ except (ValueError, TypeError):
+ charge = 0
+
+ if 'precursor_intensity' in row and pd.notna(row['precursor_intensity']):
+ try:
+ precursor_intensity = float(row['precursor_intensity'])
+ except (ValueError, TypeError):
+ precursor_intensity = 0.0
+
+        # Convert retention time to seconds if it appears to be in minutes
+        # (mscluster expects seconds). Heuristic: values below 1000 are treated
+        # as minutes, which can misclassify genuine second-valued RTs on short runs.
+        if retention_time > 0 and retention_time < 1000:  # likely minutes
+            retention_time = retention_time * 60.0
+
+ if not filename.startswith('input_spectra/'):
+ filename = f"input_spectra/{filename}"
+
+ spec_idx = spec_idx_counter
+ spec_idx_counter += 1
+
+ mscluster_row = {
+ '#ClusterIdx': cluster_idx,
+ '#Filename': filename,
+ '#SpecIdx': spec_idx,
+ '#Scan': scan,
+ '#ParentMass': precursor_mz,
+ '#Charge': charge,
+ '#RetTime': retention_time,
+ '#PrecIntensity': precursor_intensity
+ }
+ mscluster_rows.append(mscluster_row)
+
+ # Create DataFrame
+ mscluster_df = pd.DataFrame(mscluster_rows)
+
+    # Before filtering, record the original falcon cluster ID (0-based) for each
+    # row. cluster_idx was shifted to 1-based above, so original = #ClusterIdx - 1.
+    # Note: falcon_wrapper.py rebuilds the falcon -> sequential mapping directly
+    # from falcon.csv, so this column is bookkeeping only and is dropped below.
+    mscluster_df['_original_falcon_cluster'] = mscluster_df['#ClusterIdx'] - 1
+
+ # Filter by min_cluster_size
+ if min_cluster_size > 1:
+ # Count spectra per cluster
+ cluster_counts = mscluster_df['#ClusterIdx'].value_counts()
+ valid_clusters = cluster_counts[cluster_counts >= min_cluster_size].index
+ mscluster_df = mscluster_df[mscluster_df['#ClusterIdx'].isin(valid_clusters)]
+
+ # IMPORTANT: Remap cluster indices to sequential (1, 2, 3, ...) for consistency
+ # This ensures clusterinfo.tsv, clustersummary.tsv, and MGF SCANS all use the same sequential indices
+ # This is necessary for compatibility with ExecMolecularParallelPairs which uses index-based CLUSTERID
+ original_clusters = sorted(mscluster_df['#ClusterIdx'].unique())
+ cluster_remap = {orig: new for new, orig in enumerate(original_clusters, start=1)}
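+    # e.g. if the surviving 1-based indices are [3, 7, 12], cluster_remap
+    # becomes {3: 1, 7: 2, 12: 3}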
+
+    # Mapping: original falcon cluster (0-based) -> sequential index (1-based).
+    # falcon_wrapper.py rebuilds this same mapping from falcon.csv rather than
+    # consuming it from here, so it is kept for reference/debugging only.
+ falcon_cluster_to_sequential = {}
+ for orig_cluster_idx in original_clusters:
+ original_falcon_cluster = orig_cluster_idx - 1 # Convert back to 0-based
+ sequential_idx = cluster_remap[orig_cluster_idx]
+ falcon_cluster_to_sequential[original_falcon_cluster] = sequential_idx
+
+ # Remap #ClusterIdx in mscluster_df
+ mscluster_df['#ClusterIdx'] = mscluster_df['#ClusterIdx'].map(cluster_remap)
+
+ # Remove the temporary column
+ mscluster_df = mscluster_df.drop(columns=['_original_falcon_cluster'])
+
+ # Create cluster summary with remapped indices
+ cluster_summary_rows = []
+ for cluster_idx in sorted(mscluster_df['#ClusterIdx'].unique()):
+ cluster_data = mscluster_df[mscluster_df['#ClusterIdx'] == cluster_idx]
+ num_spectra = len(cluster_data)
+
+ # Calculate mean RT (in minutes)
+ mean_rt = cluster_data['#RetTime'].mean() / 60.0
+
+ # Calculate parent mass (mean of #ParentMass)
+ parent_mass = cluster_data['#ParentMass'].mean()
+
+ # Calculate precursor mass (parent mass / charge, then take mean)
+ # If charge is 0, use parent mass directly
+ precursor_masses = []
+ for idx, row in cluster_data.iterrows():
+ if row['#Charge'] > 0:
+ precursor_masses.append(row['#ParentMass'] / row['#Charge'])
+ else:
+ precursor_masses.append(row['#ParentMass'])
+ precursor_mass = sum(precursor_masses) / len(precursor_masses) if precursor_masses else parent_mass
+
+ # Calculate precursor charge (most common charge, or mean if no clear mode)
+ charges = cluster_data['#Charge'].values
+ charges_nonzero = charges[charges > 0]
+ if len(charges_nonzero) > 0:
+ # Use mode (most common charge) using pandas
+ charge_series = pd.Series(charges_nonzero)
+ mode_values = charge_series.mode()
+ if len(mode_values) > 0:
+ precursor_charge = int(mode_values.iloc[0])
+ else:
+ precursor_charge = int(charges_nonzero.mean())
+ else:
+ precursor_charge = 0
+
+ # Calculate sum of precursor intensity
+ sum_precursor_intensity = cluster_data['#PrecIntensity'].sum()
+
+ cluster_summary_row = {
+ 'cluster index': cluster_idx, # Already remapped to sequential
+ 'number of spectra': num_spectra,
+ 'parent mass': parent_mass,
+ 'precursor charge': precursor_charge,
+ 'precursor mass': precursor_mass,
+ 'sum(precursor intensity)': sum_precursor_intensity,
+ 'RTMean': mean_rt
+ }
+ cluster_summary_rows.append(cluster_summary_row)
+
+ cluster_summary_df = pd.DataFrame(cluster_summary_rows)
+ cluster_summary_df = cluster_summary_df.sort_values('cluster index')
+
+ # Ensure cluster index is string type to match network graph node types
+ # Network graph nodes from pairs file (CLUSTERID1/CLUSTERID2) are typically strings
+ # This ensures type matching when add_clusterinfo_summary_to_graph checks "if cluster_index in G"
+ cluster_summary_df['cluster index'] = cluster_summary_df['cluster index'].astype(str)
+
+ # Save outputs
+ mscluster_df.to_csv(output_clusterinfo, sep='\t', index=False)
+ cluster_summary_df.to_csv(output_clustersummary, sep='\t', index=False)
+
+ print(f"Converted {len(mscluster_df)} spectra in {len(cluster_summary_df)} clusters")
+ print(f"Saved clusterinfo to {output_clusterinfo}")
+ print(f"Saved clustersummary to {output_clustersummary}")
+ print(f"Note: Cluster indices have been remapped to sequential (1, 2, 3, ...) for consistency")
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Convert Falcon output to MSCluster format')
+ parser.add_argument('falcon_csv', help='Falcon CSV output file')
+ parser.add_argument('input_spectra_folder', help='Input spectra folder (for reference, not used for fetching data)')
+ parser.add_argument('output_clusterinfo', help='Output clusterinfo.tsv file')
+ parser.add_argument('output_clustersummary', help='Output clustersummary.tsv file')
+ parser.add_argument('--min_cluster_size', type=int, default=2, help='Minimum cluster size')
+
+ args = parser.parse_args()
+
+ convert_falcon_to_mscluster_format(
+ args.falcon_csv,
+ args.input_spectra_folder,
+ args.output_clusterinfo,
+ args.output_clustersummary,
+ args.min_cluster_size
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/bin/scripts/falcon_wrapper.py b/bin/scripts/falcon_wrapper.py
new file mode 100755
index 0000000..f984a71
--- /dev/null
+++ b/bin/scripts/falcon_wrapper.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import glob
+import argparse
+import subprocess
+import pandas as pd
+
+def run_falcon(input_spectra_path, output_prefix="falcon",
+ precursor_tol="20 ppm", fragment_tol=0.05,
+ min_mz_range=0, min_mz=0, max_mz=30000, eps=0.1):
+ """
+ Runs Falcon with specified parameters.
+ input_spectra_path can be a file or a folder.
+ """
+ # Check if input is a file or folder
+ if os.path.isfile(input_spectra_path):
+ # Single file input
+ all_spectrum_files = [input_spectra_path]
+ elif os.path.isdir(input_spectra_path):
+ # Folder input - list all spectrum files
+ all_mgf_files = glob.glob(os.path.join(input_spectra_path, "*.mgf"))
+ all_mzxml_files = glob.glob(os.path.join(input_spectra_path, "*.mzXML"))
+ all_mzml_files = glob.glob(os.path.join(input_spectra_path, "*.mzML"))
+
+ all_spectrum_files = all_mgf_files + all_mzxml_files + all_mzml_files
+
+ if len(all_spectrum_files) == 0:
+ print(f"ERROR: No spectrum files found in {input_spectra_path}")
+ sys.exit(1)
+
+ # Sort these filenames
+ all_spectrum_files.sort()
+ else:
+ print(f"ERROR: Input path does not exist: {input_spectra_path}")
+ sys.exit(1)
+
+ # Create pattern for falcon (it accepts wildcards or file list)
+ if len(all_spectrum_files) == 1:
+ mzml_pattern = all_spectrum_files[0]
+ else:
+        # Falcon accepts multiple input files; pass them space-separated
+        # (paths containing spaces would need quoting here)
+        mzml_pattern = " ".join(all_spectrum_files)
+
+    # Parse precursor_tol - falcon's --precursor_tol takes two arguments:
+    # a tolerance value and a unit ("ppm" or "Da"), e.g. "--precursor_tol 20 ppm".
+    # Accept either "20 ppm" (value and unit) or a bare number like "2.0".
+    precursor_tol_str = str(precursor_tol).strip()
+    if " " in precursor_tol_str:
+        # Already in "value unit" form such as "20 ppm" or "2.0 Da"
+        value, unit = precursor_tol_str.split(None, 1)
+        precursor_tol_args = f"{value} {unit}"
+    else:
+        # Bare number: assume the unit is Da
+        try:
+            tol_value = float(precursor_tol_str)
+            precursor_tol_args = f"{tol_value} Da"
+        except ValueError:
+            # Unparseable input: fall back to the 20 ppm default used elsewhere in this wrapper
+            precursor_tol_args = "20 ppm"
+
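+    # hash_len / n_neighbors_ann / n_probe / batch_size below tune falcon's
+    # approximate nearest-neighbor indexing; they are fixed here, and only the
+    # tolerances, m/z range, and eps are exposed as workflow parameters.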
+ command = (
+ f"falcon {mzml_pattern} {output_prefix} "
+ f"--export_representatives "
+ f"--precursor_tol {precursor_tol_args} "
+ f"--fragment_tol {fragment_tol} "
+ f"--min_mz_range {min_mz_range} "
+ f"--min_mz {min_mz} --max_mz {max_mz} "
+ f"--eps {eps} "
+ f"--hash_len 400 "
+ f"--n_neighbors_ann 64 "
+ f"--n_probe 16 "
+ f"--batch_size 32768 "
+ )
+ print(f"[run_falcon] Running: {command}")
+ process = subprocess.Popen(command, shell=True)
+ retcode = process.wait()
+ if retcode != 0:
+ print(f"ERROR: Falcon failed with exit code {retcode}")
+ sys.exit(1)
+
+
+def main():
+ # Parse arguments
+ parser = argparse.ArgumentParser(description='Falcon Clustering Wrapper')
+ parser.add_argument('input_spectra_folder', help='Input Spectra Folder')
+ parser.add_argument('output_spectra_folder', help='Output Spectra Folder')
+ parser.add_argument('final_output_folder', help='final_output_folder')
+
+ parser.add_argument('--min_cluster_size', default="2", help='min_cluster_size (not used by falcon directly)')
+
+ parser.add_argument('--pm_tolerance', default="20 ppm", help='pm_tolerance (precursor tolerance)')
+ parser.add_argument('--fragment_tolerance', default="0.05", help='fragment_tolerance')
+
+ # Filters (not all are used by falcon)
+ parser.add_argument('--min_peak_intensity', default="0.0", help='min_peak_intensity (not used by falcon)')
+ parser.add_argument('--window_filter', default="1", help='window_filter (not used by falcon)')
+ parser.add_argument('--precursor_filter', default="1", help='precursor_filter (not used by falcon)')
+
+ # Falcon-specific parameters
+ parser.add_argument('--eps', default="0.1", help='Falcon eps parameter')
+ parser.add_argument('--min_mz', default="0", help='Falcon min_mz parameter')
+ parser.add_argument('--max_mz', default="30000", help='Falcon max_mz parameter')
+
+ args = parser.parse_args()
+
+ # Check if input is a file or folder (Nextflow may pass a file)
+ input_spectra_path = args.input_spectra_folder
+ if os.path.isfile(args.input_spectra_folder):
+ # If it's a single file, we need to use its directory
+ input_spectra_path = os.path.dirname(args.input_spectra_folder)
+ if not input_spectra_path:
+ input_spectra_path = "."
+
+ # Running falcon
+ output_prefix = "falcon"
+ run_falcon(input_spectra_path, output_prefix,
+ precursor_tol=args.pm_tolerance,
+ fragment_tol=float(args.fragment_tolerance),
+ min_mz_range=0,
+ min_mz=int(args.min_mz),
+ max_mz=int(args.max_mz),
+ eps=float(args.eps))
+
+ # Falcon outputs:
+ # - falcon.csv (cluster assignments)
+ # - falcon.mgf (representative spectra)
+
+ falcon_csv = f"{output_prefix}.csv"
+ falcon_mgf = f"{output_prefix}.mgf"
+
+ if not os.path.exists(falcon_csv):
+ print(f"ERROR: Falcon output {falcon_csv} not found")
+ sys.exit(1)
+
+ if not os.path.exists(falcon_mgf):
+ print(f"ERROR: Falcon output {falcon_mgf} not found")
+ sys.exit(1)
+
+ # Convert falcon output to mscluster format first
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ convert_script = os.path.join(script_dir, "convert_falcon_to_mscluster_format.py")
+
+ clusterinfo_file = os.path.join(args.final_output_folder, "clusterinfo.tsv")
+ clustersummary_file = os.path.join(args.final_output_folder, "clustersummary.tsv")
+
+ # For conversion, we need the original input path to get spectrum info
+ convert_input_path = input_spectra_path
+
+ convert_cmd = (
+ f"python {convert_script} "
+ f"{falcon_csv} "
+ f"{convert_input_path} "
+ f"{clusterinfo_file} "
+ f"{clustersummary_file} "
+ f"--min_cluster_size {args.min_cluster_size}"
+ )
+
+ print(f"Converting falcon output to mscluster format...")
+ print(f"Running: {convert_cmd}")
+ ret_code = os.system(convert_cmd)
+
+ if ret_code != 0:
+ print(f"ERROR: Conversion failed with exit code {ret_code}")
+ sys.exit(1)
+
+ # Now update MGF file with correct scan numbers based on clusterinfo.tsv
+ # IMPORTANT: clusterinfo.tsv now uses sequential indices (1, 2, 3, ...)
+ # The scan number in MGF should match the #ClusterIdx in clusterinfo.tsv (which is already sequential)
+ # Read clusterinfo to get cluster indices
+ clusterinfo_df = pd.read_csv(clusterinfo_file, sep='\t')
+
+ # Get unique cluster indices (these are already sequential: 1, 2, 3, ...)
+ unique_clusters = sorted(clusterinfo_df['#ClusterIdx'].unique())
+
+ # Read falcon MGF file - parse manually to access cluster field
+ output_mgf_filename = os.path.join(args.final_output_folder, "specs_ms.mgf")
+ os.makedirs(args.final_output_folder, exist_ok=True)
+
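+    # A falcon --export_representatives MGF entry is expected to look like:
+    #   BEGIN IONS
+    #   TITLE=...
+    #   CLUSTER=17          <- falcon's 0-based cluster id (used below)
+    #   PEPMASS=301.1402
+    #   <mz> <intensity> peak lines
+    #   END IONS
+    # (layout assumed from the headers this parser and summarize_results_falcon.py rely on)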
+ # Parse falcon MGF to get cluster information and update scan numbers
+ mgf_spectra = []
+ with open(falcon_mgf, 'r') as f:
+ spec = None
+ for line in f:
+ line = line.strip()
+ if line == 'BEGIN IONS':
+ spec = {'peaks': [], 'headers': {}}
+ elif line == 'END IONS':
+ if spec is not None:
+ mgf_spectra.append(spec)
+ spec = None
+ elif spec is not None:
+ if '=' in line:
+ key, val = line.split('=', 1)
+ spec['headers'][key.upper()] = val
+ else:
+ # Peak data
+ parts = line.split()
+ if len(parts) == 2:
+ try:
+ mz, intensity = float(parts[0]), float(parts[1])
+ spec['peaks'].append((mz, intensity))
+                        except ValueError:
+                            # Skip malformed peak lines
+                            pass
+
+ # IMPORTANT: clusterinfo.tsv now uses sequential indices (1, 2, 3, ...)
+ # We need to map falcon clusters (0-based) to these sequential indices
+ # Rebuild the mapping by reading falcon.csv and matching with clusterinfo.tsv
+ # This avoids needing a separate JSON file that might not be passed through Nextflow
+
+ # Read falcon.csv to get original falcon cluster assignments
+ falcon_cluster_to_sequential = {}
+ if os.path.exists(falcon_csv):
+ falcon_df = pd.read_csv(falcon_csv, sep=',', comment='#')
+
+ # Filter by min_cluster_size (same as in convert_falcon_to_mscluster_format.py)
+ if int(args.min_cluster_size) > 1:
+ falcon_df = falcon_df[falcon_df['cluster'] != -1]
+
+ # Group by original falcon cluster (0-based)
+ # The sequential index in clusterinfo corresponds to the order after filtering
+ # We'll match by grouping falcon clusters and assigning sequential indices
+ original_clusters = sorted(falcon_df['cluster'].unique())
+ sequential_idx = 1
+ for orig_falcon_cluster in original_clusters:
+ # Count spectra in this cluster
+ cluster_spectra = falcon_df[falcon_df['cluster'] == orig_falcon_cluster]
+ if len(cluster_spectra) >= int(args.min_cluster_size):
+ falcon_cluster_to_sequential[orig_falcon_cluster] = sequential_idx
+ sequential_idx += 1
+
+ print(f"Rebuilt falcon cluster mapping: {len(falcon_cluster_to_sequential)} clusters")
+ else:
+ print(f"WARNING: falcon.csv not found at {falcon_csv}, cannot rebuild mapping")
+ falcon_cluster_to_sequential = {}
+
+ # Write MGF with sequential scan numbers matching clusterinfo.tsv
+ # Since clusterinfo.tsv already has sequential indices, we'll use them directly
+ skipped_count = 0
+ processed_clusters = set() # Track which sequential cluster indices we've processed
+
+ with open(output_mgf_filename, 'w') as out_mgf:
+ for spec in mgf_spectra:
+ # Get cluster ID from falcon MGF
+ falcon_cluster = -1
+ if 'CLUSTER' in spec['headers']:
+ try:
+ falcon_cluster = int(spec['headers']['CLUSTER'])
+ except (ValueError, TypeError):
+ pass
+
+ # Map falcon cluster (0-based) to sequential index (1-based)
+ if falcon_cluster in falcon_cluster_to_sequential:
+ sequential_idx = falcon_cluster_to_sequential[falcon_cluster]
+ scan_number = sequential_idx # Use sequential index as SCANS
+
+ # Write MGF entry with sequential scan number
+ out_mgf.write("BEGIN IONS\n")
+
+ # Write headers, updating SCANS= line
+ for key, val in spec['headers'].items():
+ if key == 'SCANS':
+ out_mgf.write(f"SCANS={scan_number}\n")
+ else:
+ out_mgf.write(f"{key}={val}\n")
+
+ # Add SCANS if it wasn't in the original
+ if 'SCANS' not in spec['headers']:
+ out_mgf.write(f"SCANS={scan_number}\n")
+
+ # Write peaks
+ for mz, intensity in spec['peaks']:
+ out_mgf.write(f"{mz} {intensity}\n")
+
+ out_mgf.write("END IONS\n\n")
+ processed_clusters.add(sequential_idx)
+ else:
+ # Skip clusters that are not in the mapping (filtered out by min_cluster_size)
+ skipped_count += 1
+ if falcon_cluster != -1: # Only warn for non-singleton clusters
+ print(f"INFO: Skipping falcon cluster {falcon_cluster} (filtered out by min_cluster_size or not in mapping)")
+
+ if skipped_count > 0:
+ print(f"INFO: Skipped {skipped_count} spectra that were filtered out by min_cluster_size")
+
+ print(f"Wrote {len(processed_clusters)} spectra to MGF file with sequential SCANS (1, 2, 3, ...)")
+
+ print(f"Falcon clustering completed. Output files:")
+ print(f" - MGF: {output_mgf_filename}")
+ print(f" - ClusterInfo: {clusterinfo_file}")
+ print(f" - ClusterSummary: {clustersummary_file}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/bin/scripts/summarize_results_falcon.py b/bin/scripts/summarize_results_falcon.py
new file mode 100644
index 0000000..d6ec09e
--- /dev/null
+++ b/bin/scripts/summarize_results_falcon.py
@@ -0,0 +1,88 @@
+import os
+import pandas as pd
+import argparse
+import numpy as np
+
+def rewrite_falcon_mgf(mgf_input, mgf_output):
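+    # Rewrite falcon's "CLUSTER=<n>" header (0-based cluster id) as "SCANS=<n+1>"
+    # so the MGF scan numbers match the 1-based cluster indices written to
+    # clusterinfo.tsv / clustersummary.tsv.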
+ with open(mgf_input, 'r') as infile, open(mgf_output, 'w') as outfile:
+ for line in infile:
+ if line.startswith("CLUSTER="):
+ try:
+ cluster_num = int(line.strip().split("=")[1])
+ outfile.write(f"SCANS={cluster_num + 1}\n")
+ except ValueError:
+ outfile.write(line)
+ else:
+ outfile.write(line)
+
+def main():
+ parser = argparse.ArgumentParser(description='Summarizing Falcon Results')
+ parser.add_argument('falcon_clusters', help='falcon_clusters')
+ parser.add_argument('falcon_mgf', help='falcon_mgf')
+ #parser.add_argument('output_summary_folder', help='output_summary_folder')
+ args = parser.parse_args()
+
+ clusterinfo_df = pd.read_csv(args.falcon_clusters, sep=',', comment='#')
+
+ print(args)
+ print(clusterinfo_df)
+
+ clusterinfo_df = clusterinfo_df.sort_values(by='cluster', key=lambda x: x.replace(-1, np.inf))
+
+    # Filter out unclustered spectra (falcon assigns cluster == -1 to singletons)
+ clusterinfo_df = clusterinfo_df[clusterinfo_df["cluster"] != -1]
+
+
+ # Grouping by cluster
+ grouped_cluster_df = clusterinfo_df.groupby(["cluster"])
+ cluster_summary_list = []
+ for cluster, cluster_group_df in grouped_cluster_df:
+ #TODO :Read these from mgf, as the representative is a medoid
+
+ cluster_count = len(cluster_group_df)
+ cluster_mz = cluster_group_df["precursor_mz"].mean()
+ cluster_rt = cluster_group_df["retention_time"].mean()
+ cluster_charge = cluster_group_df["precursor_charge"].mean()
+        # Adjust the column names to match the classical MN workflow
+ output_dict = {}
+ output_dict["number of spectra"] = cluster_count
+ output_dict["parent mass"] = cluster_mz
+ output_dict["RTMean"] = cluster_rt
+ output_dict["precursor charge"] = cluster_charge
+ output_dict["cluster index"] = cluster[0] + 1
+
+ cluster_summary_list.append(output_dict)
+
+ # Creating a cluster summary
+ cluster_summary_df = pd.DataFrame(cluster_summary_list)
+ cluster_summary_df.to_csv("clustersummary.tsv", sep='\t', index=False)
+
+ # Creating cluster info
+ clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply(lambda x: x.split(":")[2] + ".mzML")
+ clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply(lambda x: x.split(":")[-1])
+
+ # Rename relevant columns to MS-Cluster format
+ clusterinfo_df = clusterinfo_df.rename(columns={
+ "filename": "#Filename",
+ "cluster": "#ClusterIdx",
+ "scan": "#Scan",
+ "precursor_mz": "#ParentMass",
+ "precursor_charge": "#Charge",
+ "retention_time": "#RetTime"
+ })
+
+    # Placeholder so downstream processes in the workflow don't raise an error
+ clusterinfo_df["#PrecIntensity"] = 0
+
+ # Select required columns
+ clusterinfo_df = clusterinfo_df[[
+ "#ClusterIdx", "#Filename", "#Scan", "#ParentMass", "#Charge", "#RetTime", "#PrecIntensity"
+ ]]
+ clusterinfo_df.to_csv("clusterinfo.tsv", sep='\t', index=False)
+
+ #TODO: Rewriting MGF files
+ rewrite_falcon_mgf(args.falcon_mgf, "specs_ms.mgf")
+    # TODO: Maybe make this compatible with FBMN, since it's already clustered.
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/nf_workflow.nf b/nf_workflow.nf
index 865f302..2316d93 100644
--- a/nf_workflow.nf
+++ b/nf_workflow.nf
@@ -13,12 +13,20 @@ params.metadata_per_file_grouping = "No" // Yes means that each file can be its
params.metadata_filename = "data/metadata.tsv"
// Clustering Parameters
+params.clustering_tool = "mscluster" // or "falcon"
params.min_cluster_size = "2"
// Tolerance Parameters
params.pm_tolerance = "2.0"
params.fragment_tolerance = "0.5"
+// Falcon-specific parameters
+params.falcon_pm_tolerance = "20 ppm"
+params.falcon_fragment_tolerance = "0.05"
+params.falcon_eps = "0.1"
+params.falcon_min_mz = "0"
+params.falcon_max_mz = "30000"
+
// Filtering
params.min_peak_intensity = "0.0"
params.window_filter = "1"
@@ -66,9 +74,29 @@ params.cache_directory = "data/cache"
params.publishdir = "$baseDir"
TOOL_FOLDER = "$baseDir/bin"
+MODULES_FOLDER = "$TOOL_FOLDER/NextflowModules"
+
+// COMPATIBILITY NOTE: The following might be necessary if this workflow is being deployed in a slightly different environment
+// checking if outdir is defined,
+// if so, then set publishdir to outdir
+if (params.outdir) {
+ _publishdir = params.outdir
+}
+else{
+ _publishdir = params.publishdir
+}
+
+// Augmenting with nf_output
+_publishdir = "${_publishdir}/nf_output"
+
+
+// Many useful modules are already implemented in NextflowModules; import them as needed.
+// publishdir is a keyword used across all our modules to control where output files are saved.
+include {summaryLibrary} from "$MODULES_FOLDER/nf_library_search_modules.nf"
+include {librarygetGNPSAnnotations} from "$MODULES_FOLDER/nf_library_search_modules.nf" addParams(publishdir: "$_publishdir/library")
process filesummary {
- publishDir "$params.publishdir/nf_output", mode: 'copy'
+ publishDir "$_publishdir", mode: 'copy'
conda "$TOOL_FOLDER/conda_env.yml"
@@ -113,6 +141,41 @@ process mscluster {
"""
}
+process falcon {
+ publishDir "$params.publishdir/nf_output", mode: 'copy'
+
+ conda "$TOOL_FOLDER/conda_env_falcon.yml"
+
+    // This is necessary because the conda environment's glibc libraries are not always picked up; linking defaults to the system libraries, which may be too old
+ beforeScript 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib'
+
+ input:
+ file inputSpectra
+ val ready
+
+ output:
+ file 'clustering/specs_ms.mgf'
+ file 'clustering/clusterinfo.tsv'
+ file 'clustering/clustersummary.tsv'
+
+ """
+ mkdir clustering
+ python $TOOL_FOLDER/scripts/falcon_wrapper.py \
+ $inputSpectra \
+ spectra \
+ clustering \
+ --min_cluster_size $params.min_cluster_size \
+ --pm_tolerance "$params.falcon_pm_tolerance" \
+ --fragment_tolerance $params.falcon_fragment_tolerance \
+ --min_peak_intensity $params.min_peak_intensity \
+ --window_filter $params.window_filter \
+ --precursor_filter $params.precursor_filter \
+ --eps $params.falcon_eps \
+ --min_mz $params.falcon_min_mz \
+ --max_mz $params.falcon_max_mz
+ """
+}
+
// TODO: Finish Implementing this, as this is currently an no-op
process massqlFilterSpectra {
publishDir "$params.publishdir/nf_output", mode: 'copy'
@@ -180,28 +243,28 @@ process librarymergeResults {
"""
}
-process librarygetGNPSAnnotations {
- publishDir "$params.publishdir/nf_output/library", mode: 'copy'
+// process librarygetGNPSAnnotations {
+// publishDir "$params.publishdir/nf_output/library", mode: 'copy'
- //cache 'lenient'
- cache 'false'
+// //cache 'lenient'
+// cache 'false'
- conda "$TOOL_FOLDER/conda_env.yml"
+// conda "$TOOL_FOLDER/conda_env.yml"
- input:
- path "merged_results.tsv"
- path "library_summary.tsv"
+// input:
+// path "merged_results.tsv"
+// path "library_summary.tsv"
- output:
- path 'merged_results_with_gnps.tsv'
+// output:
+// path 'merged_results_with_gnps.tsv'
- """
- python $TOOL_FOLDER/scripts/getGNPS_library_annotations.py \
- merged_results.tsv \
- merged_results_with_gnps.tsv \
- --librarysummary library_summary.tsv
- """
-}
+// """
+// python $TOOL_FOLDER/scripts/getGNPS_library_annotations.py \
+// merged_results.tsv \
+// merged_results_with_gnps.tsv \
+// --librarysummary library_summary.tsv
+// """
+// }
// Molecular Networking
process networkingGNPSPrepParams {
@@ -490,28 +553,6 @@ process prepInputFiles {
"""
}
-process summaryLibrary {
- publishDir "$params.publishdir/nf_output", mode: 'copy'
-
- maxForks 8
-
- cache 'lenient'
-
- conda "$TOOL_FOLDER/conda_env.yml"
-
- input:
- path library_file
-
- output:
- path '*.tsv' optional true
-
- """
- python $TOOL_FOLDER/scripts/library_summary.py \
- $library_file \
- ${library_file}.tsv
- """
-}
-
process createFeatureTable {
publishDir "$params.publishdir/nf_output/clustering", mode: 'copy'
@@ -535,7 +576,7 @@ process createFeatureTable {
}
-process PrepareForModiFinder{
+process prepareForModiFinder{
publishDir "$params.publishdir/nf_output", mode: 'copy'
errorStrategy 'ignore'
@@ -569,8 +610,19 @@ workflow {
// File summaries
filesummary(input_spectra_ch, _download_ready)
+    // Note: Subsequent processes (library search, networking) use pm_tolerance and
+    // fragment_tolerance, which default to the mscluster values. When using falcon, be
+    // aware that downstream processes keep those mscluster tolerances unless they are
+    // updated as well. This is acceptable because downstream processes operate on
+    // clustered spectra, and the library search / networking tolerances may legitimately
+    // differ from the clustering tolerance.
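+    // Example (hypothetical local run):
+    //   nextflow run ./nf_workflow.nf -c nextflow.config \
+    //     --clustering_tool falcon --falcon_pm_tolerance "20 ppm" --falcon_eps 0.1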
+
// Clustering
- (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready)
+ if(params.clustering_tool == "falcon"){
+ (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = falcon(input_spectra_ch, _download_ready)
+ }
+ else{
+ (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready)
+ }
if(params.massql_filter != "None"){
clustered_spectra_ch = massqlFilterSpectra(clustered_spectra_intermediate_ch)
@@ -590,10 +642,12 @@ workflow {
library_summary_ch = summaryLibrary(libraries_ch)
// Merging all these tsv files from library_summary_ch within nextflow
- library_summary_merged_ch = library_summary_ch.collectFile(name: "library_summary.tsv", keepHeader: true)
+ library_summary_merged_ch = library_summary_ch.collectFile(name: 'librarysummary.tsv', keepHeader: true, storeDir: _publishdir + "/librarysummary")
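+    // storeDir makes collectFile write the merged TSV straight into the publish directory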
library_summary_merged_ch = library_summary_merged_ch.ifEmpty(file("NO_FILE"))
- gnps_library_results_ch = librarygetGNPSAnnotations(merged_results_ch, library_summary_merged_ch)
+ // Getting library annotations
+ force_offline = "No" // This can be set to Yes to avoid any online queries to GNPS, which is useful for testing or if you have a local copy of the GNPS library
+ gnps_library_results_ch = librarygetGNPSAnnotations(merged_results_ch, library_summary_merged_ch, "1", "0", force_offline)
gnps_library_results_ch = gnps_library_results_ch.ifEmpty(file("NO_FILE"))
// Networking
@@ -650,6 +704,6 @@ workflow {
createFeatureTable(clusterinfo_ch)
// Preparing for Modifinder
- PrepareForModiFinder(gnps_library_results_ch, filtered_networking_pairs_enriched_ch)
+ prepareForModiFinder(gnps_library_results_ch, filtered_networking_pairs_enriched_ch)
}
\ No newline at end of file
diff --git a/workflowdisplay.yaml b/workflowdisplay.yaml
index 6af7541..91a5c44 100644
--- a/workflowdisplay.yaml
+++ b/workflowdisplay.yaml
@@ -98,15 +98,27 @@ Views:
- title: "SpectrumID"
data: SpectrumID
- title: "Smiles"
- data: Smiles
+ data: Smiles
+ - title: "Library Name"
+ data: LibraryName
+ - title: "Library Scan"
+ data: LibScan
columnDefs: '[ {"targets": 0,"data": null,"render": function ( data, type, row, meta ) {
+ if (row["SpectrumID"] && row["SpectrumID"].includes("CCMSLIB")) {
+ return `
+ View Mirror
+ `;
+ }
return `
- View Mirror
+ View Mirror
`;}},
{"targets": 10,"data": null,"render": function ( data, type, row, meta ) {
return `
- `;}},]'
+ `;}},
+ {"targets": [11, 12], "visible": false}]'
+
+
- name: Network Components List
displayname: Network Components List
@@ -357,4 +369,4 @@ Views:
viewname: metadatadownload
displaytype: download
parameters:
- filename: nf_output/metadata/merged_metadata.tsv
\ No newline at end of file
+    filename: nf_output/metadata/merged_metadata.tsv
\ No newline at end of file
diff --git a/workflowinput.yaml b/workflowinput.yaml
index f35d1e1..f534392 100644
--- a/workflowinput.yaml
+++ b/workflowinput.yaml
@@ -1,7 +1,7 @@
workflowname: classical_networking_workflow
workflowdescription: This is Classical Molecular Networking for GNPS2
workflowlongdescription: This is Classical Molecular Networking for GNPS2
-workflowversion: "2026.01.21"
+workflowversion: "2026.02.06"
workflowfile: nf_workflow.nf
workflowautohide: false
adminonly: false
@@ -88,6 +88,17 @@ parameterlist:
- displayname: Clustering Parameters
paramtype: section
+ - displayname: Clustering Tool
+ paramtype: select
+ nf_paramname: clustering_tool
+ formvalue: "mscluster"
+ options:
+ - value: "mscluster"
+ display: "MSCluster"
+ - value: "falcon"
+ display: "Falcon"
+ tooltip: "Select the clustering tool to use for spectral clustering"
+
- displayname: Minimum Cluster Size (Or Disable Clustering)
paramtype: select
nf_paramname: min_cluster_size
@@ -105,6 +116,68 @@ parameterlist:
display: "4"
- value: "5"
display: "5"
+
+ - displayname: Falcon Parameters
+ paramtype: section
+ showif:
+ - condition:
+ - key: clustering_tool
+ value: "falcon"
+
+ - displayname: Precursor Ion Tolerance
+ paramtype: text
+ nf_paramname: falcon_pm_tolerance
+ formplaceholder: Enter the pm_tolerance (e.g., 20 ppm or 2.0 Da)
+ formvalue: "20 ppm"
+ tooltip: "Precursor tolerance with unit (e.g., 20 ppm or 2.0 Da)"
+ showif:
+ - condition:
+ - key: clustering_tool
+ value: "falcon"
+
+ - displayname: Fragment Ion Tolerance
+ paramtype: text
+ nf_paramname: falcon_fragment_tolerance
+ formplaceholder: Enter the fragment_tolerance
+ formvalue: "0.05"
+ tooltip: "Fragment tolerance (Da)"
+ showif:
+ - condition:
+ - key: clustering_tool
+ value: "falcon"
+
+ - displayname: Falcon Epsilon (eps)
+ paramtype: text
+ nf_paramname: falcon_eps
+ formplaceholder: Enter the falcon_eps
+ formvalue: "0.1"
+ tooltip: "Falcon clustering epsilon parameter (only used when clustering_tool is falcon)"
+ showif:
+ - condition:
+ - key: clustering_tool
+ value: "falcon"
+
+ - displayname: Falcon Min MZ
+ paramtype: text
+ nf_paramname: falcon_min_mz
+ formplaceholder: Enter the falcon_min_mz
+ formvalue: "0"
+ tooltip: "Falcon minimum m/z value (only used when clustering_tool is falcon)"
+ showif:
+ - condition:
+ - key: clustering_tool
+ value: "falcon"
+
+ - displayname: Falcon Max MZ
+ paramtype: text
+ nf_paramname: falcon_max_mz
+ formplaceholder: Enter the falcon_max_mz
+ formvalue: "30000"
+ tooltip: "Falcon maximum m/z value (only used when clustering_tool is falcon)"
+ showif:
+ - condition:
+ - key: clustering_tool
+ value: "falcon"
- displayname: Advanced Filtering Parameters
paramtype: section