Changes from all commits
71 commits
b0327a2
feat: spras_revision
tristan-f-r Jul 9, 2025
8cec738
style: fmt
tristan-f-r Jul 9, 2025
5683392
test: summary
tristan-f-r Jul 10, 2025
af90ce0
docs(test_summary): mention preprocessing motivation
tristan-f-r Jul 10, 2025
6141874
test(analysis/summary): use input from /input instead
tristan-f-r Jul 10, 2025
440a2d4
docs(test/analysis): mention dual integration testing
tristan-f-r Jul 10, 2025
d9e852b
test(analysis/summary): use test/analysis provided gold standard
tristan-f-r Jul 10, 2025
abb0eb9
style: fmt
tristan-f-r Jul 10, 2025
60185fc
chore: don't repeat docs inside analysis configs
tristan-f-r Jul 10, 2025
e6bd6a0
feat: get working with cytoscape
tristan-f-r Jul 11, 2025
f9a3081
style: fmt
tristan-f-r Jul 11, 2025
77fc3b4
test: remove nondet from analysis
tristan-f-r Jul 11, 2025
0592850
fix: get input pathways at runtime
tristan-f-r Jul 11, 2025
0b6413d
Merge branch 'umain' into hash
tristan-f-r Aug 4, 2025
1817157
fix: rm run
tristan-f-r Aug 4, 2025
c077d91
Merge branch 'main' into hash
tristan-f-r Aug 14, 2025
50f2195
fix: correct for pydantic
tristan-f-r Aug 14, 2025
d3a088b
fix: attach spras revision inside gs_values
tristan-f-r Aug 14, 2025
8e3b898
chore: drop re import
tristan-f-r Aug 14, 2025
1ada504
Merge branch 'main' into hash
tristan-f-r Aug 27, 2025
34a40ad
fix: correct tests
tristan-f-r Aug 27, 2025
5d2c6d0
Merge branch 'main' into hash
tristan-f-r Sep 9, 2025
ef15781
Merge branch 'main' into hash
tristan-f-r Sep 24, 2025
8d5019b
fix: correct Snakefile
tristan-f-r Sep 24, 2025
9949572
fix: use correct gs variable
tristan-f-r Sep 25, 2025
6ec4f62
refactor: separate statistic computation
tristan-f-r Oct 10, 2025
9987189
fix: correct tuple assumption
tristan-f-r Oct 10, 2025
25eef5e
fix: stably use graph statistic values
tristan-f-r Oct 10, 2025
3cd25e8
Merge branch 'main' into hash
tristan-f-r Oct 24, 2025
0965a68
test: correct config
tristan-f-r Oct 25, 2025
a169505
fix: correct name again
tristan-f-r Oct 25, 2025
cb373c1
style: fmt
tristan-f-r Oct 30, 2025
47a9e26
Merge branch 'main' into lazy-stats
tristan-f-r Oct 30, 2025
898d568
style: specify zip strict
tristan-f-r Oct 30, 2025
c675ece
fix: make undirected for determining number of connected components
tristan-f-r Nov 6, 2025
eec09f2
Merge branch 'main' into hash
tristan-f-r Jan 10, 2026
a8d71bd
test: fix files
tristan-f-r Jan 10, 2026
3c81d05
Merge branch 'main' into lazy-stats
tristan-f-r Jan 13, 2026
1ca730e
feat: snakemake-based summary generation
tristan-f-r Jan 13, 2026
d67186d
fix(Snakefile): use parse_output for edgelist parsing
tristan-f-r Jan 13, 2026
fd483c3
fix: parse edgelist with rank, embed header skip inside from_edgelist
tristan-f-r Jan 13, 2026
fd5046f
style: fmt
tristan-f-r Jan 13, 2026
79cf748
chore: mention statistics_files param
tristan-f-r Jan 13, 2026
e12fc75
apply suggestions
tristan-f-r Jan 17, 2026
977bf5a
clean, fix: strip project_directory
tristan-f-r Jan 17, 2026
8500bcb
fix: correct equality on not SPRAS pyproject.toml
tristan-f-r Jan 17, 2026
112db39
chore: grammar
tristan-f-r Jan 17, 2026
c7262ed
chore: move attach_spras_revision out of Snakefile
tristan-f-r Jan 18, 2026
f69a0f3
Merge branch 'main' into hash
tristan-f-r Jan 31, 2026
72e30bf
fix: properly resolve merge conflict
tristan-f-r Jan 31, 2026
c71b652
fix: undo mistaken merge conflict
tristan-f-r Jan 31, 2026
6b941e0
chore: drop unnecessary self.datasets initialization
tristan-f-r Jan 31, 2026
339d915
Merge branch 'hash' into lazy-stats
tristan-f-r Jan 31, 2026
fbf0ceb
feat: dynamic spras versioning
tristan-f-r Jan 31, 2026
edc0369
chore: error handling on setup.pu
tristan-f-r Jan 31, 2026
3a1251d
docs: note on git commit hashes
tristan-f-r Jan 31, 2026
d330d6a
chore: drop git magic
tristan-f-r Jan 31, 2026
5e31d06
feat: correctly parse RECORD
tristan-f-r Jan 31, 2026
dba2b45
style: fmt
tristan-f-r Jan 31, 2026
90b4e1f
feat: optional spras revision
tristan-f-r Feb 11, 2026
fd5a490
docs: osdf_immutable info; ci: debug
tristan-f-r Feb 11, 2026
210897b
ci: ??????
tristan-f-r Feb 11, 2026
816dd28
fix: don't use distribution files, opt for purepath
tristan-f-r Feb 11, 2026
cd78a2a
style: fmt
tristan-f-r Feb 11, 2026
b025b7d
fix: tag iff osdf immutable, correct functools.partial sig
tristan-f-r Feb 11, 2026
8ce8c31
apply suggestions
tristan-f-r Feb 14, 2026
9bbf7cf
docs: info on spras revision, change names
tristan-f-r Feb 14, 2026
9ce6241
docs: clarify confusing symbol choice
tristan-f-r Feb 14, 2026
85e0ea8
docs: more info on summary & statistics
tristan-f-r Feb 14, 2026
804849a
style: fmt
tristan-f-r Feb 14, 2026
cf3c6a0
Merge branch 'hash' into lazy-stats
tristan-f-r Feb 14, 2026
30 changes: 24 additions & 6 deletions Snakefile
@@ -2,10 +2,11 @@ import os
from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, cytoscape
import spras.config.config as _config
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.statistics import from_output_pathway, statistics_computation, statistics_options

# Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
# and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
@@ -34,7 +35,6 @@ def get_dataset(_datasets, label):
algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())

dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]

@@ -282,7 +282,7 @@ rule reconstruct:
# Original pathway reconstruction output to universal output
# Use PRRunner as a wrapper to call the algorithm-specific parse_output
rule parse_output:
input:
input:
raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])
@@ -310,18 +310,36 @@ rule viz_cytoscape:
run:
cytoscape.run_cytoscape(input.pathways, output.session, container_settings)

# We generate new Snakemake rules for every statistic
# to allow parallel and lazy computation of individual statistics
for keys, values in statistics_computation.items():
pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys])
rule:
# (See https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#procedural-rule-definition)
name: pythonic_name
input: pathway_file = rules.parse_output.output.standardized_file
output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
run:
(Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
graph = from_output_pathway(input.pathway_file)
# Write each computed statistic to its corresponding output file,
# avoiding shadowing Snakemake's `output` list with the loop variable
for computed, out_file in zip(values(graph), output, strict=True):
Path(out_file).write_text(str(computed))

# Write a single summary table for all pathways for each dataset
rule summary_table:
input:
# Collect all pathways generated for the dataset
pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params),
dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']),
# Collect all possible options
statistics = expand(
'{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt',
out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options)
output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt'])
run:
# Load the node table from the pickled dataset file
node_table = Dataset.from_file(input.dataset_file).node_table
summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, input.statistics)
summary_df.to_csv(output.summary_table, sep='\t', index=False)

# Cluster the output pathways for each dataset
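The Snakefile imports `from_output_pathway`, `statistics_computation`, and `statistics_options` from `spras.statistics`, but that module is not included in this diff. Below is a minimal sketch of the interface the Snakefile assumes, with the statistic set reduced to three entries for illustration; the real module may organize these differently:

```python
import networkx as nx


def from_output_pathway(path) -> nx.Graph:
    # Directed or mixed pathways are parsed as undirected graphs; the header line
    # is skipped inside the reader (per the commit "embed header skip inside from_edgelist")
    with open(path) as f:
        lines = f.readlines()[1:]
    return nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))


# Maps a tuple of statistic names to one function that computes all of them in a
# single pass, returning values in the same order as the names
statistics_computation = {
    ('Number of nodes',): lambda g: [g.number_of_nodes()],
    ('Number of edges',): lambda g: [g.number_of_edges()],
    ('Number of connected components',): lambda g: [nx.number_connected_components(g)],
}

# Flat list of every statistic name, used to expand the summary_table inputs
statistics_options = [name for names in statistics_computation for name in names]
```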
9 changes: 9 additions & 0 deletions config/config.yaml
@@ -3,6 +3,15 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

# If enabled, this tags all output files with a SPRAS 'revision version'.
# By default, this is the hash of all the SPRAS files in the PyPA installation. This option will not work if SPRAS was
# not installed in a PyPA-compliant manner (PyPA-compliant installations include, but are not limited to, pip, poetry, uv, conda, and pixi).
# For some files, the 'SPRAS revision' may instead be tied to the specific format version of that file.
#
# This option is disabled by default, as it can make output file names confusing. It is set to true here because this
# configuration file is used for testing.
immutable_files: true

# Collection of container options
containers:
# Specify the container framework used by each PRM wrapper. Valid options include:
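To illustrate the `immutable_files` option above: `attach_spras_revision` (defined in `spras/config/config.py` later in this diff) suffixes each dataset and gold standard label with the revision, which then flows into output names such as `data0_1a2b3c4d-pathway-summary.txt`. The revision value in this sketch is hypothetical:

```python
# Hypothetical revision value; the real suffix comes from spras_revision()
attach_spras_revision(False, "data0")  # -> "data0"
attach_spras_revision(True, "data0")   # -> "data0_1a2b3c4d", for example
```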
121 changes: 15 additions & 106 deletions spras/analysis/summary.py
@@ -1,13 +1,14 @@
import ast
from pathlib import Path
from statistics import median
from typing import Iterable

import networkx as nx
import pandas as pd

from spras.statistics import from_output_pathway


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
algo_with_params: list) -> pd.DataFrame:
algo_with_params: list[str], statistics_files: list) -> pd.DataFrame:
"""
Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
@@ -17,6 +18,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
@param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter
combinations.
@param algo_with_params: a list of <algorithm>-params-<params_hash> combinations
@param statistics_files: a list of files containing the precomputed statistic values, one value per file.
@return: pandas DataFrame with summary information
"""
# Ensure that NODEID is the first column
@@ -39,52 +41,18 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg

# Iterate through each network file path
for index, file_path in enumerate(sorted(file_paths)):
with open(file_path, 'r') as f:
lines = f.readlines()[1:] # skip the header line

# directed or mixed graphs are parsed and summarized as an undirected graph
nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))
nw = from_output_pathway(file_path)

# Save the network name, number of nodes, number edges, and number of connected components
nw_name = str(file_path)
number_nodes = nw.number_of_nodes()
number_edges = nw.number_of_edges()
ncc = nx.number_connected_components(nw)

# Save the max/median degree, average clustering coefficient, and density
if number_nodes == 0:
max_degree = 0
median_degree = 0.0
density = 0.0
else:
degrees = [deg for _, deg in nw.degree()]
max_degree = max(degrees)
median_degree = median(degrees)
density = nx.density(nw)

cc = list(nx.connected_components(nw))
# Save the max diameter
# Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
diameters = [
nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0
for c in cc
]
max_diameter = max(diameters, default=0)

# Save the average path lengths
# Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
avg_path_lengths = [
nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0
for c in cc
]

if len(avg_path_lengths) != 0:
avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
else:
avg_path_len = 0.0

# We use ast.literal_eval here to convert statistic file outputs to ints or floats depending on their string representation.
# (e.g. "5.0" -> float(5.0), while "5" -> int(5).)
graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files]

# Initialize list to store current network information
cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len]
cur_nw_info = [nw_name, *graph_statistics]

# Iterate through each node property and save the intersection with the current network
for node_list in nodes_by_col:
@@ -105,8 +73,10 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
# Save the current network information to the network summary list
nw_info.append(cur_nw_info)

# Get the list of statistic names by their file names
statistics_options = [Path(file).stem for file in statistics_files]
# Prepare column names
col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length']
col_names = ['Name', *statistics_options]
col_names.extend(nodes_by_col_labs)
col_names.append('Parameter combination')

@@ -120,65 +90,4 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
return nw_info


def degree(g):
return dict(g.degree)

# TODO: redo .run code to work on mixed graphs
# stats is just a list of functions to apply to the graph.
# They should take as input a networkx graph or digraph but may have any output.
# stats = [degree, nx.clustering, nx.betweenness_centrality]


# def produce_statistics(g: nx.Graph, s=None) -> dict:
# global stats
# if s is not None:
# stats = s
# d = dict()
# for s in stats:
# sname = s.__name__
# d[sname] = s(g)
# return d


# def load_graph(path: str) -> nx.Graph:
# g = nx.read_edgelist(path, data=(('weight', float), ('Direction',str)))
# return g


# def save(data, pth):
# fout = open(pth, 'w')
# fout.write('#node\t%s\n' % '\t'.join([s.__name__ for s in stats]))
# for node in data[stats[0].__name__]:
# row = [data[s.__name__][node] for s in stats]
# fout.write('%s\t%s\n' % (node, '\t'.join([str(d) for d in row])))
# fout.close()


# def run(infile: str, outfile: str) -> None:
# """
# run function that wraps above functions.
# """
# # if output directory doesn't exist, make it.
# outdir = os.path.dirname(outfile)
# if not os.path.exists(outdir):
# os.makedirs(outdir)

# # load graph, produce stats, and write to human-readable file.
# g = load_graph(infile)
# dat = produce_statistics(g)
# save(dat, outfile)


# def main(argv):
# """
# for testing
# """
# g = load_graph(argv[1])
# print(g.nodes)
# dat = produce_statistics(g)
# print(dat)
# save(dat, argv[2])


# if __name__ == '__main__':
# main(sys.argv)
# TODO: redo the above code to work on mixed graphs
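A self-contained illustration of the `ast.literal_eval` parsing used in `summarize_networks` above; the file names mirror the statistic names, but the values are made up:

```python
import ast
import tempfile
from pathlib import Path

# Each statistic file holds a single Python literal; literal_eval restores its type
with tempfile.TemporaryDirectory() as tmp:
    nodes_file = Path(tmp, 'Number of nodes.txt')
    density_file = Path(tmp, 'Density.txt')
    nodes_file.write_text('5')
    density_file.write_text('0.42')
    values = [ast.literal_eval(f.read_text()) for f in (nodes_file, density_file)]
    assert values == [5, 0.42]  # parsed as int and float, respectively
    assert type(values[0]) is int and type(values[1]) is float
```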
66 changes: 64 additions & 2 deletions spras/config/config.py
@@ -13,7 +13,11 @@
"""

import copy as copy
import functools
import hashlib
import importlib.metadata
import itertools as it
import sysconfig
import warnings
from pathlib import Path
from typing import Any
@@ -27,6 +31,46 @@

config = None

@functools.cache
def spras_revision() -> str:
"""
Gets the current revision of SPRAS.

Note: This does not depend on the SPRAS release version number or the git commit, but solely on the PyPA RECORD file
(https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file), which contains
hashes of all of the installed SPRAS files [excluding RECORD itself] and is also included in the package distribution.
This means `spras_revision` is fixed when SPRAS is initially installed. For editable pip installs (such as the
installation used when developing SPRAS), RECORD is not rewritten as the code changes, so `spras_revision` will not be updated.
"""
try:
site_packages_path = sysconfig.get_path("purelib") # where .dist-info is located.

record_path = Path(
site_packages_path,
f"spras-{importlib.metadata.version('spras')}.dist-info",
"RECORD"
)
with open(record_path, 'rb', buffering=0) as f:
# Truncate to 8 hex characters, the length of a short git revision.
return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
except importlib.metadata.PackageNotFoundError as err:
raise RuntimeError('spras is not an installed package: did you forget to install SPRAS as a package?') from err


def attach_spras_revision(immutable_files: bool, label: str) -> str:
"""
Attaches the SPRAS revision to a label.
This function signature may become more complex as specific labels get versioned.

@param immutable_files: if False, the label is returned unchanged.
@param label: the label to attach the SPRAS revision to.
"""
if not immutable_files:
return label
# We use the `_` separator here instead of `-` because the summary, analysis, and gold standard parts of the
# Snakemake workflow process file names by splitting on hyphens to produce new jobs.
# Separating with a hyphen would break that string manipulation logic.
return f"{label}_{spras_revision()}"

# This will get called in the Snakefile, instantiating the singleton with the raw config
def init_global(config_dict):
global config
@@ -88,6 +132,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.analysis_include_ml_aggregate_algo = None
# A Boolean specifying whether to run the evaluation per algorithm analysis
self.analysis_include_evaluation_aggregate_algo = None
# Specifies whether the files should be OSDF-immutable (i.e., file names change when the file contents change)
self.immutable_files = parsed_raw_config.immutable_files

self.process_config(parsed_raw_config)

@@ -117,6 +163,12 @@ def process_datasets(self, raw_config: RawConfig):
# Currently assumes all datasets have a label and the labels are unique
# When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
# Convert to dicts to simplify the yaml logging

for dataset in raw_config.datasets:
dataset.label = attach_spras_revision(self.immutable_files, dataset.label)
for gold_standard in raw_config.gold_standards:
gold_standard.label = attach_spras_revision(self.immutable_files, gold_standard.label)

for dataset in raw_config.datasets:
label = dataset.label
if label.lower() in [key.lower() for key in self.datasets.keys()]:
@@ -130,8 +182,14 @@ def process_datasets(self, raw_config: RawConfig):
dataset_labels = set(self.datasets.keys())
gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
for label in gold_standard_dataset_labels:
if label not in dataset_labels:
if attach_spras_revision(self.immutable_files, label) not in dataset_labels:
raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
# We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above.
for key, gold_standard in self.gold_standards.items():
self.gold_standards[key]["dataset_labels"] = map(
functools.partial(attach_spras_revision, self.immutable_files),
gold_standard["dataset_labels"]
)

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
@@ -187,7 +245,11 @@ def process_algorithms(self, raw_config: RawConfig):
run_dict[param] = float(value)
if isinstance(value, np.ndarray):
run_dict[param] = value.tolist()
params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
hash_run_dict = copy.deepcopy(run_dict)
if self.immutable_files:
# Incorporates the `spras_revision` into the hash
hash_run_dict["_spras_rev"] = spras_revision()
params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
if params_hash in prior_params_hashes:
raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
f'(current length {self.hash_length}).')
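Two notes on `spras_revision`, with a sketch: `hashlib.file_digest` requires Python 3.11 or newer, and the computation is roughly equivalent to hashing the RECORD bytes directly. The RECORD path and version below are hypothetical:

```python
import hashlib
from pathlib import Path

# Roughly equivalent to spras_revision() above; hashlib.file_digest (Python 3.11+)
# streams the file instead of reading it into memory. This path is hypothetical.
record = Path("site-packages/spras-0.2.0.dist-info/RECORD")
revision = hashlib.sha256(record.read_bytes()).hexdigest()[:8]

# With immutable_files enabled, process_algorithms folds the same revision into
# every parameter hash via the reserved "_spras_rev" key, so outputs produced by
# different SPRAS revisions get distinct parameter hashes
hash_run_dict = {"k": 10, "_spras_rev": revision}
```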
9 changes: 9 additions & 0 deletions spras/config/schema.py
@@ -101,6 +101,15 @@ class ReconstructionSettings(BaseModel):

class RawConfig(BaseModel):
containers: ContainerSettings
immutable_files: bool = False
"""
If enabled, this tags all files with their local file version.
Most files do not have a specific version; by default, the version is the hash of
all the SPRAS files in the PyPA installation. This option will not work if SPRAS was not installed
in a PyPA-compliant manner (PyPA-compliant installations include, but are not limited to, pip, poetry, uv, conda, and pixi).

This option is disabled by default, as it can make output file names confusing.
"""

hash_length: int = DEFAULT_HASH_LENGTH
"The length of the hash used to identify a parameter combination"