Reed-CompBio · tristan-f-r · Jul 9, 2025 · Jul 9, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/Snakefile b/Snakefile
@@ -34,7 +34,6 @@ def get_dataset(_datasets, label):
 algorithms = list(algorithm_params)
 algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
 dataset_labels = list(_config.config.datasets.keys())
-
 dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
 dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]
 
@@ -282,7 +281,7 @@ rule reconstruct:
 # Original pathway reconstruction output to universal output
 # Use PRRunner as a wrapper to call the algorithm-specific parse_output
 rule parse_output:
-    input: 
+    input:
         raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
         dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
     output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])

diff --git a/config/config.yaml b/config/config.yaml
@@ -3,6 +3,15 @@
 # The length of the hash used to identify a parameter combination
 hash_length: 7
 
+# If enabled, this tags all output files with a SPRAS 'revision version'.
+# By default, this will be the hash of all the SPRAS files in the PyPA installation. This option will not work if SPRAS was not
+# installed in a PyPA-compliant manner (PyPA-compliant installations include, but are not limited to, pip, poetry, uv, conda, and pixi).
+# For some files, the 'SPRAS revision' may be tied to the specific format version that file is on.
+#
+# By default, this is disabled, as it can make output file names confusing. Here, it's set to true since we use this
+# configuration file for testing.
+immutable_files: true
+
 # Collection of container options
 containers:
   # Specify the container framework used by each PRM wrapper. Valid options include:

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
@@ -7,7 +7,7 @@
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
-                       algo_with_params: list) -> pd.DataFrame:
+                       algo_with_params: list[str]) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
     in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the

diff --git a/spras/config/config.py b/spras/config/config.py
@@ -13,7 +13,11 @@
 """
 
 import copy as copy
+import functools
+import hashlib
+import importlib.metadata
 import itertools as it
+import sysconfig
 import warnings
 from pathlib import Path
 from typing import Any
@@ -27,6 +31,46 @@
 
 config = None
 
+@functools.cache
+def spras_revision() -> str:
+    """
+    Gets the current revision of SPRAS: the first 8 hex digits of the SHA-256 digest of the installed RECORD file.
+
+    Note: This is not dependent on the SPRAS release version number nor the git commit, but solely on the PyPA RECORD file
+    (https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file), which contains
+    hashes of all of the installed SPRAS files [excluding RECORD itself], and is also included in the package distribution.
+    RECORD is only written at install time, so the revision changes on (re)installation; editing the sources of an
+    editable pip install (as used when developing SPRAS) does NOT change it until SPRAS is reinstalled.
+
+    @raises RuntimeError: if SPRAS is not installed as a package, or its RECORD file cannot be found.
+    """
+    try:
+        distribution = importlib.metadata.distribution('spras')
+    except importlib.metadata.PackageNotFoundError as err:
+        raise RuntimeError('spras is not an installed pip-module: did you forget to install SPRAS as a module?') from err
+    # Ask the installation metadata where RECORD lives rather than assuming <purelib>/spras-<version>.dist-info.
+    record = next((p for p in distribution.files or () if p.name == 'RECORD' and p.parent.suffix == '.dist-info'), None)
+    if record is None:
+        raise RuntimeError('Could not find the RECORD file of the installed spras distribution.')
+    with open(record.locate(), 'rb', buffering=0) as f:
+        # Keep only the first 8 hex digits so that revision-tagged file names stay short.
+        return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
+
+
+def attach_spras_revision(immutable_files: bool, label: str) -> str:
+    """
+    Attaches the SPRAS revision to a label, producing `{label}_{revision}`.
+    This function signature may become more complex as specific labels get versioned.
+
+    @param immutable_files: if falsy, `label` is returned unchanged (i.e. this function acts as the identity function).
+    @param label: The label to attach the SPRAS revision to.
+    """
+    if not immutable_files:
+        return label
+    # We separate with `_` instead of `-`: the summary, analysis, and gold standard parts of the Snakemake workflow
+    # split file names on hyphens to produce new jobs, and a hyphen here would break that string manipulation logic.
+    return f"{label}_{spras_revision()}"
+
 # This will get called in the Snakefile, instantiating the singleton with the raw config
 def init_global(config_dict):
     global config
@@ -88,6 +132,8 @@ def __init__(self, raw_config: dict[str, Any]):
         self.analysis_include_ml_aggregate_algo = None
         # A Boolean specifying whether to run the evaluation per algorithm analysis
         self.analysis_include_evaluation_aggregate_algo = None
+        # Specifies whether the files should be OSDF-immutable (i.e. the file names change when the file itself changes)
+        self.immutable_files = parsed_raw_config.immutable_files
 
         self.process_config(parsed_raw_config)
 
@@ -117,6 +163,12 @@ def process_datasets(self, raw_config: RawConfig):
         # Currently assumes all datasets have a label and the labels are unique
         # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
         # Convert to dicts to simplify the yaml logging
+
+        for dataset in raw_config.datasets:
+            dataset.label = attach_spras_revision(self.immutable_files, dataset.label)
+        for gold_standard in raw_config.gold_standards:
+            gold_standard.label = attach_spras_revision(self.immutable_files, gold_standard.label)
+
         for dataset in raw_config.datasets:
             label = dataset.label
             if label.lower() in [key.lower() for key in self.datasets.keys()]:
@@ -130,8 +182,14 @@ def process_datasets(self, raw_config: RawConfig):
         dataset_labels = set(self.datasets.keys())
         gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
         for label in gold_standard_dataset_labels:
-            if label not in dataset_labels:
+            if attach_spras_revision(self.immutable_files, label) not in dataset_labels:
                 raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
+        # We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above.
+        for key, gold_standard in self.gold_standards.items():
+            # Store a real list: a bare `map` iterator is exhausted after its first traversal, so a second reader would see no labels.
+            self.gold_standards[key]["dataset_labels"] = [
+                attach_spras_revision(self.immutable_files, label) for label in gold_standard["dataset_labels"]
+            ]
 
         # Code snipped from Snakefile that may be useful for assigning default labels
         # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
@@ -187,7 +245,11 @@ def process_algorithms(self, raw_config: RawConfig):
                             run_dict[param] = float(value)
                         if isinstance(value, np.ndarray):
                             run_dict[param] = value.tolist()
-                    params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
+                    hash_run_dict = copy.deepcopy(run_dict)
+                    if self.immutable_files:
+                        # Incorporates the `spras_revision` into the hash
+                        hash_run_dict["_spras_rev"] = spras_revision()
+                    params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
                     if params_hash in prior_params_hashes:
                         raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
                                         f'(current length {self.hash_length}).')

diff --git a/spras/config/schema.py b/spras/config/schema.py
@@ -101,6 +101,15 @@ class ReconstructionSettings(BaseModel):
 
 class RawConfig(BaseModel):
     containers: ContainerSettings
+    immutable_files: bool = False
+    """
+    If enabled, this tags all files with their local file version.
+    Most files do not have a specific version, and by default, this will be the hash of
+    all the SPRAS files in the PyPA installation. This option will not work if SPRAS was not installed
+    in a PyPA-compliant manner (PyPA-compliant installations include but are not limited to pip, poetry, uv, conda, pixi.)
+
+    By default, this is disabled, as it can make output file names confusing.
+    """
 
     hash_length: int = DEFAULT_HASH_LENGTH
     "The length of the hash used to identify a parameter combination"

diff --git a/test/analysis/expected_output/expected_egfr_summary.txt b/test/analysis/expected_output/expected_egfr_summary.txt
@@ -1,10 +1,4 @@
-Name	Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in sources	Nodes in targets	Nodes in active	Nodes in dummy	Parameter combination
-test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt	48	45	3	0.0398936170212766	5	2.0	16	3.882808476926124	27	0	27	27	0	{'module_threshold': 0.05, 'slice_threshold': 0.3}
-test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt	1877	12845	1	0.007295700506524384	469	6.0	6	2.7973618474338107	621	1	620	621	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt	28	20	8	0.05291005291005291	4	1.0	5	1.306439393939394	28	1	27	28	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt	39	31	8	0.04183535762483131	6	1.0	5	1.5084498834498834	39	1	38	39	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt	14	9	5	0.0989010989010989	4	1.0	2	1.1866666666666668	14	0	14	14	0	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
-test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt	593	591	2	0.0033669841848593955	32	1.0	30	6.72248989073389	531	1	530	531	1	{'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt	704	702	2	0.002836867968446916	35	1.0	24	6.038766691954387	616	1	615	616	1	{'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt	14	17	1	0.18681318681318682	6	2.0	7	2.857142857142857	6	1	5	6	1	{'k': 10}
-test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt	25	32	1	0.10666666666666667	8	2.0	7	3.486666666666667	11	1	10	11	1	{'k': 20}
+Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in sources	Nodes in targets	Nodes in active	Nodes in dummy	Parameter combination
+14	17	1	0.18681318681318682	6	2.0	7	2.857142857142857	6	1	5	6	1	{'k': 10}
+25	32	1	0.10666666666666667	8	2.0	7	3.486666666666667	11	1	10	11	1	{'k': 20}
+1874	12845	1	0.007319084148670001	469	6.0	6	2.7952001166950904	621	1	620	621	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
diff --git a/test/analysis/expected_output/expected_example_summary.txt b/test/analysis/expected_output/expected_example_summary.txt
@@ -1,13 +1,6 @@
-Name	Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in active	Nodes in dummy	Nodes in sources	Nodes in targets	Parameter combination
-test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{}
-test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'module_threshold': 0.05, 'slice_threshold': 0.3}
-test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
-test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'flow': 1, 'capacity': 1}
-test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'k': 200}
-test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'k': 100}
+Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in active	Nodes in dummy	Nodes in sources	Nodes in targets	Parameter combination
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'flow': 1, 'capacity': 1}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'k': 100}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'k': 200}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{}
diff --git a/test/analysis/input/.gitignore b/test/analysis/input/.gitignore
@@ -0,0 +1 @@
+run
diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml