Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@
[submodule "bin/scripts/downloadpublicdata"]
path = bin/scripts/downloadpublicdata
url = https://github.com/Wang-Bioinformatics-Lab/downloadpublicdata.git
[submodule "bin/NextflowModules"]
path = bin/NextflowModules
url = https://github.com/Wang-Bioinformatics-Lab/NextflowModules.git
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ run_usi_download:
--download_usi_filename=./data/usi_files/input_files.tsv --cache_directory=./data/cache

run_transitive:
nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive
nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive

init_modules:
git submodule update --init --recursive
1 change: 1 addition & 0 deletions bin/NextflowModules
Submodule NextflowModules added at f357f3
21 changes: 21 additions & 0 deletions bin/conda_env_falcon.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
channels:
- conda-forge
- bioconda
- defaults
channel_priority: strict
dependencies:
- python=3.10
- pandas=2.2.0
- pyteomics=4.7.1
- pip:
- numpy==1.23.4
- xmltodict
- requests
- tqdm
- psutil
- pyopenms==3.0.0
- spectrum-utils==0.3.5
- falcon-ms
- pymzml
- numcodecs
- pyarrow
248 changes: 248 additions & 0 deletions bin/scripts/convert_falcon_to_mscluster_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
#!/usr/bin/python

import sys
import os
import argparse
import pandas as pd



# Cluster ids at or above this value are reserved for falcon singletons
# (cluster == -1); each singleton gets a distinct id so they are never
# merged into one cluster. Real falcon clusters are 0-based and far smaller.
_SINGLETON_BASE = 1000000


def _coerce_field(row, candidates, caster, default):
    """Return the first non-null field among `candidates` in `row`, cast via
    `caster`; fall back to `default` when absent, null, or not castable.

    Matches the original per-field behavior: once a candidate is present and
    non-null, a failed cast yields `default` without trying later candidates.
    """
    for key in candidates:
        if key in row and pd.notna(row[key]):
            try:
                return caster(row[key])
            except (ValueError, TypeError):
                return default
    return default


def _build_clusterinfo(clusterinfo_df):
    """Build one mscluster-format row per spectrum from the falcon dataframe.

    Cluster ids are converted from falcon's 0-based to mscluster's 1-based
    numbering; each falcon singleton (cluster == -1) receives its own unique
    id above _SINGLETON_BASE. (BUGFIX: previously all singletons shared one
    hard-coded id, 999999, and were merged into a single bogus cluster.)
    """
    columns = ['#ClusterIdx', '#Filename', '#SpecIdx', '#Scan',
               '#ParentMass', '#Charge', '#RetTime', '#PrecIntensity']
    rows = []
    singleton_counter = 0
    for spec_idx, (_, row) in enumerate(clusterinfo_df.iterrows()):
        cluster_idx = int(row['cluster'])
        if cluster_idx == -1:
            cluster_idx = _SINGLETON_BASE + singleton_counter
            singleton_counter += 1
        else:
            cluster_idx += 1  # falcon is 0-based, mscluster is 1-based

        filename = str(row['filename'])
        if not filename.startswith('input_spectra/'):
            # BUGFIX: prefix the actual filename; previously this assigned the
            # literal placeholder "input_spectra/(unknown)" for every file.
            filename = f"input_spectra/{filename}"

        # Heuristic: falcon typically reports RT in minutes while mscluster
        # expects seconds; values below 1000 are assumed to be minutes.
        retention_time = float(row['retention_time'])
        if 0 < retention_time < 1000:
            retention_time *= 60.0

        rows.append({
            '#ClusterIdx': cluster_idx,
            '#Filename': filename,
            '#SpecIdx': spec_idx,
            '#Scan': int(row['scan']),
            '#ParentMass': float(row['precursor_mz']),
            # Falcon may not provide charge/intensity; default to 0 rather
            # than fetching from the spectrum files.
            '#Charge': _coerce_field(row, ('precursor_charge', 'charge'), int, 0),
            '#RetTime': retention_time,
            '#PrecIntensity': _coerce_field(row, ('precursor_intensity',), float, 0.0),
        })
    # Explicit columns keep the schema intact even when there are no rows.
    return pd.DataFrame(rows, columns=columns)


def _build_clustersummary(mscluster_df):
    """Aggregate per-cluster statistics into the mscluster summary schema."""
    columns = ['cluster index', 'number of spectra', 'parent mass',
               'precursor charge', 'precursor mass',
               'sum(precursor intensity)', 'RTMean']
    summary_rows = []
    for cluster_idx, cluster_data in mscluster_df.groupby('#ClusterIdx', sort=True):
        # Representative charge: mode of the non-zero charges, mean as a
        # tie-breaker fallback, 0 when no charge information exists.
        charges = cluster_data['#Charge'].values
        nonzero = charges[charges > 0]
        if len(nonzero) > 0:
            modes = pd.Series(nonzero).mode()
            precursor_charge = int(modes.iloc[0]) if len(modes) > 0 else int(nonzero.mean())
        else:
            precursor_charge = 0

        # Per-spectrum precursor mass = parent mass / charge (parent mass
        # when charge is unknown/0), averaged over the cluster.
        masses = [
            mass / charge if charge > 0 else mass
            for mass, charge in zip(cluster_data['#ParentMass'], cluster_data['#Charge'])
        ]

        summary_rows.append({
            'cluster index': cluster_idx,
            'number of spectra': len(cluster_data),
            'parent mass': cluster_data['#ParentMass'].mean(),
            'precursor charge': precursor_charge,
            'precursor mass': sum(masses) / len(masses),
            'sum(precursor intensity)': cluster_data['#PrecIntensity'].sum(),
            'RTMean': cluster_data['#RetTime'].mean() / 60.0,  # seconds -> minutes
        })
    summary_df = pd.DataFrame(summary_rows, columns=columns)
    return summary_df.sort_values('cluster index')


def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2):
    """Convert falcon clustering output to the mscluster tab-separated format.

    Falcon input columns: cluster, filename, scan, precursor_mz,
    retention_time. When an 'identifier' column (mzspec-style, colon
    separated) is present, filename and scan are derived from it instead.

    Outputs:
        output_clusterinfo    - per-spectrum TSV with columns #ClusterIdx,
                                #Filename, #SpecIdx, #Scan, #ParentMass,
                                #Charge, #RetTime, #PrecIntensity
        output_clustersummary - per-cluster TSV (cluster index, number of
                                spectra, parent mass, precursor charge,
                                precursor mass, sum(precursor intensity),
                                RTMean)

    Fields falcon does not provide (charge, precursor intensity) default to 0
    instead of being fetched from the spectrum files.

    Parameters
    ----------
    falcon_csv : str
        Falcon CSV result file.
    input_spectra_folder : str
        Kept for interface compatibility; not read.
    output_clusterinfo : str
        Destination path for clusterinfo.tsv.
    output_clustersummary : str
        Destination path for clustersummary.tsv.
    min_cluster_size : int
        When > 1, falcon singletons (cluster == -1) are discarded and
        clusters with fewer spectra than this are dropped.

    Exits the process with status 1 when required columns are missing.
    """
    clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#')

    print(f"Loaded {len(clusterinfo_df)} rows from falcon CSV")
    print(f"Columns: {clusterinfo_df.columns.tolist()}")

    # Derive filename/scan from an mzspec-style identifier when present,
    # e.g. "mzspec:TASK:file:scan" -> "file.mzML" / scan number.
    if 'identifier' in clusterinfo_df.columns:
        clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply(
            lambda x: x.split(":")[2] + ".mzML" if len(x.split(":")) > 2 else (x.split(":")[-2] + ".mzML" if len(x.split(":")) > 1 else "unknown.mzML")
        )
        clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply(
            lambda x: int(x.split(":")[-1]) if x.split(":")[-1].isdigit() else 0
        )

    required_cols = ['cluster', 'filename', 'scan', 'precursor_mz', 'retention_time']
    missing_cols = [col for col in required_cols if col not in clusterinfo_df.columns]
    if missing_cols:
        print(f"ERROR: Required columns not found in falcon CSV: {missing_cols}")
        print(f"Available columns: {clusterinfo_df.columns.tolist()}")
        print(f"First few rows:")
        print(clusterinfo_df.head())
        sys.exit(1)

    # Falcon marks singletons with cluster == -1; drop them up front when
    # size-1 clusters are not wanted.
    if min_cluster_size > 1:
        clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1]

    mscluster_df = _build_clusterinfo(clusterinfo_df)

    # Drop clusters still below the size threshold.
    if min_cluster_size > 1 and not mscluster_df.empty:
        cluster_counts = mscluster_df['#ClusterIdx'].value_counts()
        valid_clusters = cluster_counts[cluster_counts >= min_cluster_size].index
        mscluster_df = mscluster_df[mscluster_df['#ClusterIdx'].isin(valid_clusters)]

    # Remap cluster indices to sequential 1..N so clusterinfo.tsv,
    # clustersummary.tsv and the MGF SCANS all use the same indices; this is
    # required by ExecMolecularParallelPairs, which uses index-based CLUSTERID.
    original_clusters = sorted(mscluster_df['#ClusterIdx'].unique())
    cluster_remap = {orig: new for new, orig in enumerate(original_clusters, start=1)}
    if cluster_remap:
        mscluster_df['#ClusterIdx'] = mscluster_df['#ClusterIdx'].map(cluster_remap)

    cluster_summary_df = _build_clustersummary(mscluster_df)

    # Cluster index as string so it matches network-graph node ids, which are
    # read from the pairs file (CLUSTERID1/CLUSTERID2) as strings.
    cluster_summary_df['cluster index'] = cluster_summary_df['cluster index'].astype(str)

    mscluster_df.to_csv(output_clusterinfo, sep='\t', index=False)
    cluster_summary_df.to_csv(output_clustersummary, sep='\t', index=False)

    print(f"Converted {len(mscluster_df)} spectra in {len(cluster_summary_df)} clusters")
    print(f"Saved clusterinfo to {output_clusterinfo}")
    print(f"Saved clustersummary to {output_clustersummary}")
    print(f"Note: Cluster indices have been remapped to sequential (1, 2, 3, ...) for consistency")


def main():
    """Command-line entry point: parse arguments and run the conversion."""
    arg_parser = argparse.ArgumentParser(description='Convert Falcon output to MSCluster format')

    # Positional arguments: input falcon CSV, spectra folder, and the two outputs.
    arg_parser.add_argument('falcon_csv', help='Falcon CSV output file')
    arg_parser.add_argument('input_spectra_folder', help='Input spectra folder (for reference, not used for fetching data)')
    arg_parser.add_argument('output_clusterinfo', help='Output clusterinfo.tsv file')
    arg_parser.add_argument('output_clustersummary', help='Output clustersummary.tsv file')
    arg_parser.add_argument('--min_cluster_size', type=int, default=2, help='Minimum cluster size')

    parsed = arg_parser.parse_args()

    convert_falcon_to_mscluster_format(
        parsed.falcon_csv,
        parsed.input_spectra_folder,
        parsed.output_clusterinfo,
        parsed.output_clustersummary,
        parsed.min_cluster_size,
    )


if __name__ == "__main__":
    main()
Loading