Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@
[submodule "bin/scripts/downloadpublicdata"]
path = bin/scripts/downloadpublicdata
url = https://github.com/Wang-Bioinformatics-Lab/downloadpublicdata.git
[submodule "bin/NextflowModules"]
path = bin/NextflowModules
url = https://github.com/Wang-Bioinformatics-Lab/NextflowModules.git
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ run_usi_download:
--download_usi_filename=./data/usi_files/input_files.tsv --cache_directory=./data/cache

run_transitive:
nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive
nextflow run ./nf_workflow.nf --resume -c nextflow.config --topology=transitive

init_modules:
git submodule update --init --recursive
1 change: 1 addition & 0 deletions bin/NextflowModules
Submodule NextflowModules added at f357f3
21 changes: 21 additions & 0 deletions bin/conda_env_falcon.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
channels:
- conda-forge
- bioconda
- defaults
channel_priority: strict
dependencies:
- python=3.10
- pandas=2.2.0
- pyteomics=4.7.1
- pip:
- numpy==1.23.4
- xmltodict
- requests
- tqdm
- psutil
- pyopenms==3.0.0
- spectrum-utils==0.3.5
- falcon-ms
- pymzml
- numcodecs
- pyarrow
248 changes: 248 additions & 0 deletions bin/scripts/convert_falcon_to_mscluster_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
#!/usr/bin/python

import sys
import os
import argparse
import pandas as pd



# Cluster ids at or above this value are reserved for falcon singletons
# (cluster == -1); each singleton gets a distinct id so they are never
# merged into one cluster. Real falcon clusters are 0-based and far smaller.
_SINGLETON_BASE = 1000000


def _coerce_field(row, candidates, caster, default):
    """Return the first non-null field among `candidates` in `row`, cast via
    `caster`; fall back to `default` when absent, null, or not castable.

    Matches the original per-field behavior: once a candidate is present and
    non-null, a failed cast yields `default` without trying later candidates.
    """
    for key in candidates:
        if key in row and pd.notna(row[key]):
            try:
                return caster(row[key])
            except (ValueError, TypeError):
                return default
    return default


def _build_clusterinfo(clusterinfo_df):
    """Build one mscluster-format row per spectrum from the falcon dataframe.

    Cluster ids are converted from falcon's 0-based to mscluster's 1-based
    numbering; each falcon singleton (cluster == -1) receives its own unique
    id above _SINGLETON_BASE. (BUGFIX: previously all singletons shared one
    hard-coded id, 999999, and were merged into a single bogus cluster.)
    """
    columns = ['#ClusterIdx', '#Filename', '#SpecIdx', '#Scan',
               '#ParentMass', '#Charge', '#RetTime', '#PrecIntensity']
    rows = []
    singleton_counter = 0
    for spec_idx, (_, row) in enumerate(clusterinfo_df.iterrows()):
        cluster_idx = int(row['cluster'])
        if cluster_idx == -1:
            cluster_idx = _SINGLETON_BASE + singleton_counter
            singleton_counter += 1
        else:
            cluster_idx += 1  # falcon is 0-based, mscluster is 1-based

        filename = str(row['filename'])
        if not filename.startswith('input_spectra/'):
            # BUGFIX: prefix the actual filename; previously this assigned the
            # literal placeholder "input_spectra/(unknown)" for every file.
            filename = f"input_spectra/{filename}"

        # Heuristic: falcon typically reports RT in minutes while mscluster
        # expects seconds; values below 1000 are assumed to be minutes.
        retention_time = float(row['retention_time'])
        if 0 < retention_time < 1000:
            retention_time *= 60.0

        rows.append({
            '#ClusterIdx': cluster_idx,
            '#Filename': filename,
            '#SpecIdx': spec_idx,
            '#Scan': int(row['scan']),
            '#ParentMass': float(row['precursor_mz']),
            # Falcon may not provide charge/intensity; default to 0 rather
            # than fetching from the spectrum files.
            '#Charge': _coerce_field(row, ('precursor_charge', 'charge'), int, 0),
            '#RetTime': retention_time,
            '#PrecIntensity': _coerce_field(row, ('precursor_intensity',), float, 0.0),
        })
    # Explicit columns keep the schema intact even when there are no rows.
    return pd.DataFrame(rows, columns=columns)


def _build_clustersummary(mscluster_df):
    """Aggregate per-cluster statistics into the mscluster summary schema."""
    columns = ['cluster index', 'number of spectra', 'parent mass',
               'precursor charge', 'precursor mass',
               'sum(precursor intensity)', 'RTMean']
    summary_rows = []
    for cluster_idx, cluster_data in mscluster_df.groupby('#ClusterIdx', sort=True):
        # Representative charge: mode of the non-zero charges, mean as a
        # tie-breaker fallback, 0 when no charge information exists.
        charges = cluster_data['#Charge'].values
        nonzero = charges[charges > 0]
        if len(nonzero) > 0:
            modes = pd.Series(nonzero).mode()
            precursor_charge = int(modes.iloc[0]) if len(modes) > 0 else int(nonzero.mean())
        else:
            precursor_charge = 0

        # Per-spectrum precursor mass = parent mass / charge (parent mass
        # when charge is unknown/0), averaged over the cluster.
        masses = [
            mass / charge if charge > 0 else mass
            for mass, charge in zip(cluster_data['#ParentMass'], cluster_data['#Charge'])
        ]

        summary_rows.append({
            'cluster index': cluster_idx,
            'number of spectra': len(cluster_data),
            'parent mass': cluster_data['#ParentMass'].mean(),
            'precursor charge': precursor_charge,
            'precursor mass': sum(masses) / len(masses),
            'sum(precursor intensity)': cluster_data['#PrecIntensity'].sum(),
            'RTMean': cluster_data['#RetTime'].mean() / 60.0,  # seconds -> minutes
        })
    summary_df = pd.DataFrame(summary_rows, columns=columns)
    return summary_df.sort_values('cluster index')


def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2):
    """Convert falcon clustering output to the mscluster tab-separated format.

    Falcon input columns: cluster, filename, scan, precursor_mz,
    retention_time. When an 'identifier' column (mzspec-style, colon
    separated) is present, filename and scan are derived from it instead.

    Outputs:
        output_clusterinfo    - per-spectrum TSV with columns #ClusterIdx,
                                #Filename, #SpecIdx, #Scan, #ParentMass,
                                #Charge, #RetTime, #PrecIntensity
        output_clustersummary - per-cluster TSV (cluster index, number of
                                spectra, parent mass, precursor charge,
                                precursor mass, sum(precursor intensity),
                                RTMean)

    Fields falcon does not provide (charge, precursor intensity) default to 0
    instead of being fetched from the spectrum files.

    Parameters
    ----------
    falcon_csv : str
        Falcon CSV result file.
    input_spectra_folder : str
        Kept for interface compatibility; not read.
    output_clusterinfo : str
        Destination path for clusterinfo.tsv.
    output_clustersummary : str
        Destination path for clustersummary.tsv.
    min_cluster_size : int
        When > 1, falcon singletons (cluster == -1) are discarded and
        clusters with fewer spectra than this are dropped.

    Exits the process with status 1 when required columns are missing.
    """
    clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#')

    print(f"Loaded {len(clusterinfo_df)} rows from falcon CSV")
    print(f"Columns: {clusterinfo_df.columns.tolist()}")

    # Derive filename/scan from an mzspec-style identifier when present,
    # e.g. "mzspec:TASK:file:scan" -> "file.mzML" / scan number.
    if 'identifier' in clusterinfo_df.columns:
        clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply(
            lambda x: x.split(":")[2] + ".mzML" if len(x.split(":")) > 2 else (x.split(":")[-2] + ".mzML" if len(x.split(":")) > 1 else "unknown.mzML")
        )
        clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply(
            lambda x: int(x.split(":")[-1]) if x.split(":")[-1].isdigit() else 0
        )

    required_cols = ['cluster', 'filename', 'scan', 'precursor_mz', 'retention_time']
    missing_cols = [col for col in required_cols if col not in clusterinfo_df.columns]
    if missing_cols:
        print(f"ERROR: Required columns not found in falcon CSV: {missing_cols}")
        print(f"Available columns: {clusterinfo_df.columns.tolist()}")
        print(f"First few rows:")
        print(clusterinfo_df.head())
        sys.exit(1)

    # Falcon marks singletons with cluster == -1; drop them up front when
    # size-1 clusters are not wanted.
    if min_cluster_size > 1:
        clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1]

    mscluster_df = _build_clusterinfo(clusterinfo_df)

    # Drop clusters still below the size threshold.
    if min_cluster_size > 1 and not mscluster_df.empty:
        cluster_counts = mscluster_df['#ClusterIdx'].value_counts()
        valid_clusters = cluster_counts[cluster_counts >= min_cluster_size].index
        mscluster_df = mscluster_df[mscluster_df['#ClusterIdx'].isin(valid_clusters)]

    # Remap cluster indices to sequential 1..N so clusterinfo.tsv,
    # clustersummary.tsv and the MGF SCANS all use the same indices; this is
    # required by ExecMolecularParallelPairs, which uses index-based CLUSTERID.
    original_clusters = sorted(mscluster_df['#ClusterIdx'].unique())
    cluster_remap = {orig: new for new, orig in enumerate(original_clusters, start=1)}
    if cluster_remap:
        mscluster_df['#ClusterIdx'] = mscluster_df['#ClusterIdx'].map(cluster_remap)

    cluster_summary_df = _build_clustersummary(mscluster_df)

    # Cluster index as string so it matches network-graph node ids, which are
    # read from the pairs file (CLUSTERID1/CLUSTERID2) as strings.
    cluster_summary_df['cluster index'] = cluster_summary_df['cluster index'].astype(str)

    mscluster_df.to_csv(output_clusterinfo, sep='\t', index=False)
    cluster_summary_df.to_csv(output_clustersummary, sep='\t', index=False)

    print(f"Converted {len(mscluster_df)} spectra in {len(cluster_summary_df)} clusters")
    print(f"Saved clusterinfo to {output_clusterinfo}")
    print(f"Saved clustersummary to {output_clustersummary}")
    print(f"Note: Cluster indices have been remapped to sequential (1, 2, 3, ...) for consistency")


def main():
    """Command-line entry point: parse arguments and run the conversion."""
    arg_parser = argparse.ArgumentParser(description='Convert Falcon output to MSCluster format')

    # Positional arguments: input falcon CSV, spectra folder, and the two outputs.
    arg_parser.add_argument('falcon_csv', help='Falcon CSV output file')
    arg_parser.add_argument('input_spectra_folder', help='Input spectra folder (for reference, not used for fetching data)')
    arg_parser.add_argument('output_clusterinfo', help='Output clusterinfo.tsv file')
    arg_parser.add_argument('output_clustersummary', help='Output clustersummary.tsv file')
    arg_parser.add_argument('--min_cluster_size', type=int, default=2, help='Minimum cluster size')

    parsed = arg_parser.parse_args()

    convert_falcon_to_mscluster_format(
        parsed.falcon_csv,
        parsed.input_spectra_folder,
        parsed.output_clusterinfo,
        parsed.output_clustersummary,
        parsed.min_cluster_size,
    )


if __name__ == "__main__":
    main()
Loading