Changes from all commits (34 commits)
20b1580
feat: synthetic pathways
tristan-f-r Jul 1, 2025
fc12b4e
Merge branch 'main' into synthetic
tristan-f-r Jul 28, 2025
8ff381f
Merge branch 'main' into synthetic
tristan-f-r Jan 6, 2026
f7c0c2d
fix: use full protein links to unify synthetic with databases
tristan-f-r Jan 6, 2026
73b6d93
Merge branch 'main' into synthetic
tristan-f-r Jan 24, 2026
2ce621a
re-correct links
tristan-f-r Jan 24, 2026
280b92a
fix: interactome fetching
tristan-f-r Jan 24, 2026
db30556
fix(diseases): fetch correct string links
tristan-f-r Jan 24, 2026
0658528
chore: mv to scripts
tristan-f-r Jan 30, 2026
e024e2c
chore: move to scripts, Pathify
tristan-f-r Jan 30, 2026
7b09381
style: fmt
tristan-f-r Jan 30, 2026
2a5feec
drop old thresholding
tristan-f-r Jan 30, 2026
e389b32
begin sampling
tristan-f-r Jan 30, 2026
af0ac30
chore: mv
tristan-f-r Jan 30, 2026
d1ade54
rename
tristan-f-r Jan 30, 2026
7483eea
fix: compute weight counts normally
tristan-f-r Jan 30, 2026
05cf6d6
feat: weight-preserving sampling
tristan-f-r Feb 1, 2026
58e9717
feat: sampling
tristan-f-r Feb 2, 2026
a0f7079
feat: scripted sampling
tristan-f-r Feb 2, 2026
3bb00e8
chore: del some raw
tristan-f-r Feb 3, 2026
775d144
drop all raw interactome files
tristan-f-r Feb 3, 2026
5771bc7
feat: finish up tf mapping again
tristan-f-r Feb 3, 2026
813235d
feat: sampling on a pathway
tristan-f-r Feb 3, 2026
7fb4642
style: fmt
tristan-f-r Feb 3, 2026
83fee81
chore: drop p38 mapk, add notes
tristan-f-r Feb 3, 2026
d7da699
init candidates explorer
tristan-f-r Feb 4, 2026
d45ec82
chore: update directory urls
tristan-f-r Feb 4, 2026
0d3b77e
chore: drop all downloaded pathways
tristan-f-r Feb 4, 2026
751a8f2
fix: file extensions and such
tristan-f-r Feb 4, 2026
2fceaa9
chore: explore and such
tristan-f-r Feb 4, 2026
ac5b93c
feat: base thresholding workflow
tristan-f-r Feb 9, 2026
5cb7352
chore: add paxtools
tristan-f-r Feb 11, 2026
9d3e194
feat: trimming
tristan-f-r Feb 12, 2026
81a4e4e
style: fmt
tristan-f-r Feb 12, 2026
4 changes: 3 additions & 1 deletion .devcontainer/devcontainer.json
@@ -16,6 +16,8 @@
// For web display
"ghcr.io/devcontainers/features/node:1": {},
// For scripting
"ghcr.io/va-h/devcontainers-features/uv:1": {}
"ghcr.io/va-h/devcontainers-features/uv:1": {},
// For paxtools
"ghcr.io/devcontainers/features/java:1": {}
}
}
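Java is added here for [paxtools](https://github.com/BioPAX/Paxtools) (see the `chore: add paxtools` commit), which the review discussion further below suggests will convert BioPAX OWL exports into SIF. A hypothetical invocation (the `toSIF` subcommand is from paxtools' console documentation and should be verified against the bundled version; file names are illustrative):

```sh
java -jar paxtools.jar toSIF Wnt_signaling_pathway.owl Wnt_signaling_pathway.sif
```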
4 changes: 2 additions & 2 deletions cache/__init__.py
@@ -67,9 +67,9 @@ def link(output: str, directive: list[str], uncompress=False):

Path(output).unlink(missing_ok=True)

# Re-download if the directive has expired.
# Re-download if the directive has expired / the artifact mysteriously disappeared.
cache_item = get_cache_item(directive)
if has_expired(directive):
if has_expired(directive) or not (artifacts_dir / artifact_name).exists():
(artifacts_dir / artifact_name).unlink(missing_ok=True)
cache_item.download(artifacts_dir / artifact_name)

284 changes: 230 additions & 54 deletions cache/directory.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions datasets/README.md
@@ -11,3 +11,8 @@ Many of the datasets here have been stripped of their extra post-analysis. Here,
- [`diseases`](https://github.com/Reed-CompBio/spras-benchmarking/tree/3c0155567dbc43278531b91f9173f6d4f4486dd8/datasets/diseases)
- [`depmap`](https://github.com/Reed-CompBio/spras-benchmarking/tree/b332c0ab53868f111cb89cd4e9f485e8c19aa9e3/datasets/depmap)
- [`yeast-osmotic-stress`](https://github.com/Reed-CompBio/spras-benchmarking/tree/8f69dcdf4a52607347fe3a962b753df396e44cda/yeast-osmotic-stress)

## `explore` folders

To motivate certain decisions made in code, such as `synthetic-data`'s PANTHER pathway choices, we provide scripts that use live data
to assist in data curation. These folders may also contain exploratory CLIs that motivate choices such as magic constants.
4 changes: 2 additions & 2 deletions datasets/diseases/Snakefile
@@ -13,7 +13,7 @@ produce_fetch_rules({
"raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
"raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"],
"raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"],
"raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True),
"raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
"raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True),
})

@@ -42,7 +42,7 @@ rule files:
input:
"data/inputs.csv",
"data/gold_standard.csv",
"raw/9606.protein.links.txt"
"raw/9606.protein.links.full.txt"
output:
# These are the two we use for the SPRAS run for now
"GS_files/Alopecia_areata_GS.txt",
2 changes: 1 addition & 1 deletion datasets/diseases/scripts/files.py
@@ -42,7 +42,7 @@ def main():

# See /cache/directory.py for information on how this was grabbed.
# 9606 is the organism code for Homo sapiens, which selects the required background interactome for DISEASES.
string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None)
string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None)

# Keep only edges with a confidence score above 900 to trim down the background interactome
string = string[string.iloc[:, 2] > 900]
3 changes: 3 additions & 0 deletions datasets/synthetic-data/.gitignore
@@ -0,0 +1,3 @@
intermediate
processed
raw
67 changes: 67 additions & 0 deletions datasets/synthetic-data/README.md
@@ -0,0 +1,67 @@
# Synthetic Data

## Download STRING Human Interactome
1. Download the STRING *Homo sapiens* `9606.protein.links.full.v12.0.txt.gz` database file from [STRING](https://string-db.org/cgi/download?sessionId=bL9sRTdIaUEt&species_text=Homo+sapiens&settings_expanded=0&min_download_score=0&filter_redundant_pairs=0&delimiter_type=txt).
2. Move the downloaded file into the `raw/` folder (the `Snakefile` and `scripts/interactome.py` expect it at `raw/9606.protein.links.full.v12.0.txt`).
3. From the `synthetic-data/raw/` directory, extract the file using:

```sh
gunzip 9606.protein.links.full.v12.0.txt.gz
```

## Download New PANTHER Pathways
1. Visit [Pathway Commons](https://www.pathwaycommons.org/).
2. Search for the desired pathway (e.g., "signaling") and filter the results by the **PANTHER pathway** data source.
Example: [Search for "Signaling" filtered by PANTHER pathway](https://apps.pathwaycommons.org/search?datasource=panther&q=Signaling&type=Pathway)
3. Click on the desired pathway and download the **Extended SIF** version of the pathway.
4. Move the downloaded Extended SIF file into the `raw/pathway-data/` folder (as a `.txt` file).
5. Rename the file to `<pathway_name>.txt` so it exactly matches the name you add to the `pathways` list in the `Snakefile`; an example follows below.
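For example, for the Wnt signaling pathway (the downloaded file name is illustrative):

```sh
mv ~/Downloads/pathway.sif.txt raw/pathway-data/Wnt_signaling_pathway.txt
```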

## Sources and Targets

[Sources](http://wlab.ethz.ch/surfaceome/), or `table_S3_surfaceome.xlsx` (see the [original paper](https://doi.org/10.1073/pnas.1808790115)),
are in silico human surfaceome receptors.

[Targets](https://guolab.wchscu.cn/AnimalTFDB4//#/), or `Homo_sapiens_TF.tsv` (see the [original paper](https://doi.org/10.1093/nar/gkac907)),
are human transcription factors.
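A minimal sketch for loading both lists with pandas (assuming the `raw/human-interactome/` locations used by the `Snakefile`; exact column names depend on the upstream releases):

```python
import pandas as pd

# Reading the .xlsx sheet requires openpyxl.
surfaceome = pd.read_excel("raw/human-interactome/table_S3_surfaceome.xlsx")
tfs = pd.read_csv("raw/human-interactome/Homo_sapiens_TF.tsv", sep="\t")
print(f"{len(surfaceome)} surfaceome entries, {len(tfs)} transcription factors")
```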

## Steps to Generate SPRAS-Compatible Pathways

This entire workflow can also be done with `uv run snakemake --cores 1` inside this directory.

### 1. Process PANTHER Pathways

1. Open `Snakefile` and add the name of any new pathways to the `pathways` entry.
2. Run the command:
```sh
uv run scripts/process_panther_pathway.py <pathway>
```
3. This will create five new files in the pathway's subfolder of the `intermediate/` directory (a concrete invocation follows below):
   - `edges.txt`
   - `nodes.txt`
   - `prizes.txt`
   - `sources.txt`
   - `targets.txt`
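For instance, using a pathway already present in the `Snakefile`'s `pathways` list:

```sh
uv run scripts/process_panther_pathway.py Wnt_signaling_pathway
```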

### 2. Convert Pathways to SPRAS-Compatible Format
1. In `panther_spras_formatting.py`, add the name of any new pathways to the `pathway_dirs` list on **line 8**.
2. From the `synthetic-data/` directory, run the command:
   ```sh
   uv run scripts/panther_spras_formatting.py
   ```
3. This will create a new `processed/` folder, containing subfolders for each PANTHER pathway in SPRAS-compatible format.
Each subfolder will include the following three files:
- `<pathway_name>_gs_edges.txt`
- `<pathway_name>_gs_nodes.txt`
- `<pathway_name>_node_prizes.txt`
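The `Snakefile` below drives the same script once per pathway, so an equivalent manual run for a single pathway is (pathway name illustrative):

```sh
uv run scripts/panther_spras_formatting.py Wnt_signaling_pathway
```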

## Pilot Data

For the pilot data, use the list `["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]` in both:
- the list in `combine.py`
- the list in `overlap_analytics.py`

Make sure the pathways in this list are also added to:
- the `pathways` vector in `ProcessPantherPathway.R`
- the list in `panther_spras_formatting.py`

**Once you've updated the pathway lists in all relevant scripts, run all the steps above to generate the pilot dataset.**
91 changes: 91 additions & 0 deletions datasets/synthetic-data/Snakefile
@@ -0,0 +1,91 @@
include: "../../cache/Snakefile"

pathways = [
    "Apoptosis_signaling_pathway",
    "B_cell_activation",
    "Beta3_adrenergic_receptor_signaling_pathway",
    "Cadherin_signaling_pathway",
    "Fas_signaling_pathway",
    "FGF_signaling_pathway",
    "Hedgehog_signaling_pathway",
    "Insulin_IGF_pathway_protein_kinase_B_signaling_cascade",
    "Interferon_gamma_signaling_pathway",
    "Interleukin_signaling_pathway",
    "JAK_STAT_signaling_pathway",
    "Nicotinic_acetylcholine_receptor_signaling_pathway",
    "Notch_signaling_pathway",
    "PDGF_signaling_pathway",
    "Ras_pathway",
    "T_cell_activation",
    "Toll_receptor_signaling_pathway",
    "VEGF_signaling_pathway",
    "Wnt_signaling_pathway",
]

# TODO: deduplicate from sampling.py
thresholds = [str((i + 1) / 10) for i in range(10)]

rule all:
    input:
        "raw/9606.protein.links.full.v12.0.txt",
        expand([
            "thresholded/{threshold}/{pathway}/interactome.txt",
            "thresholded/{threshold}/{pathway}/gold_standard_edges.txt",
        ], pathway=pathways, threshold=thresholds)

produce_fetch_rules({
    **{
        "raw/9606.protein.links.full.v12.0.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
        "raw/human-interactome/table_S3_surfaceome.xlsx": ["Surfaceome", "table_S3_surfaceome.xlsx"],
        "raw/human-interactome/Homo_sapiens_TF.tsv": ["TranscriptionFactors", "Homo_sapiens_TF.tsv"],
        "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
    },
    # See directory.py for the online/cached location of all pathways from PathwayCommons.
    **{f"raw/pathway-data/{k}.txt": ["PathwayCommons", "PANTHER", f"{k}.txt"] for k in pathways},
})

rule process_tfs:
    input:
        "raw/human-interactome/Homo_sapiens_TF.tsv",
        "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv"
    output:
        "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
    shell:
        "uv run scripts/map_transcription_factors.py"

rule process_panther_pathway:
    input:
        "raw/pathway-data/{pathway}.txt",
        "raw/human-interactome/table_S3_surfaceome.xlsx",
        "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
    output:
        "intermediate/{pathway}/edges.txt",
        "intermediate/{pathway}/nodes.txt",
        "intermediate/{pathway}/sources.txt",
        "intermediate/{pathway}/targets.txt",
        "intermediate/{pathway}/prizes.txt"
    shell:
        "uv run scripts/process_panther_pathway.py {wildcards.pathway}"

rule make_spras_compatible:
    input:
        "intermediate/{pathway}/edges.txt",
        "intermediate/{pathway}/nodes.txt",
        "intermediate/{pathway}/sources.txt",
        "intermediate/{pathway}/targets.txt",
        "intermediate/{pathway}/prizes.txt"
    output:
        "processed/{pathway}/{pathway}_node_prizes.txt",
        "processed/{pathway}/{pathway}_gs_edges.txt",
        "processed/{pathway}/{pathway}_gs_nodes.txt"
    shell:
        "uv run scripts/panther_spras_formatting.py {wildcards.pathway}"

rule threshold:
    input:
        "processed/{pathway}/{pathway}_node_prizes.txt",
        "processed/{pathway}/{pathway}_gs_edges.txt"
    output:
        expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds),
        expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds)
    shell:
        "uv run scripts/sampling.py {wildcards.pathway}"
5 changes: 5 additions & 0 deletions datasets/synthetic-data/explore/README.md
Review thread on this file:

@ntalluri (Collaborator), Feb 12, 2026:
Why are we providing this functionality?

tristan-f-r (Contributor, Author):
As mentioned in the comment above, I want to stop downloading PathwayCommons SIF files individually and extract them from OWL. I originally added this file as a quick exploration tool of this data, but I'll drop it once you have your list of signaling pathways from PathwayCommons and the criteria you used to fetch them, and move it over to use said automated selection.
@@ -0,0 +1,5 @@
# explore

See [the datasets readme](../../README.md) for the motivation for the `explore` folder.

This folder contains `candidates.py`, a CLI for finding all viable pathways under our custom filtering criteria.
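A hypothetical invocation from this folder (assuming dependencies are managed with uv, as elsewhere in the repository):

```sh
uv run candidates.py
```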
64 changes: 64 additions & 0 deletions datasets/synthetic-data/explore/candidates.py
@@ -0,0 +1,64 @@
"""
Utility CLI for finding pathway critetia from PathwayCommons based on our desired participant count.
This is meant to be interactive for easily examining the available pathways from PathwayCommons over PANTHER
(and perhaps more later!).

See https://www.pathwaycommons.org/pc2/swagger-ui/index.html#/api-controller-v-2 for the API.
"""

import requests
from pydantic import BaseModel

SEARCH_URL = "https://www.pathwaycommons.org/pc2/v2/search"


# These schemas were manually examined from the API response, and are thus not exhaustive.
class SearchHit(BaseModel):
    uri: str
    name: str
    biopaxClass: str
    numParticipants: int
    numProcesses: int


class SearchResponse(BaseModel):
    numHits: int
    maxHitsPerPage: int
    searchHit: list[SearchHit]


def request(page: int) -> SearchResponse:
    return SearchResponse.model_validate(requests.post(
        SEARCH_URL,
        headers={
            'accept': 'application/json',
            'Content-Type': 'application/json',
        }, json={
            # Indicates a BioPAX pathway
            'q': 'xrefid:P*',
            'type': 'pathway',
            'organism': [
                '9606',
            ],
            'datasource': [
                'panther',
            ],
            'page': page,
        }
    ).json())

def main():
    # TODO: weirdly constructed loop? could be nicer if we use numHits and maxHitsPerPage
    hits: list[SearchHit] = []
    page = 0
    response = request(page)
    print(f"Paginating {page}...")
    while len(response.searchHit) != 0:
        hits.extend(response.searchHit)
        page += 1
        response = request(page)
        print(f"Paginating {page}...")

    for hit in hits:
        print(f"({hit.numParticipants}) {hit.name}")


if __name__ == "__main__":
    main()
81 changes: 81 additions & 0 deletions datasets/synthetic-data/scripts/interactome.py
@@ -0,0 +1,81 @@
import pandas
from pathlib import Path

current_directory = Path(__file__).parent.resolve()


def main():
    # Convert the interactome to SPRAS format
    print("Reading interactome...")
    interactome_df = pandas.read_csv(
        current_directory / ".." / "raw" / "9606.protein.links.full.v12.0.txt",
        sep=" ",
        usecols=["protein1", "protein2", "combined_score"],
    )
    interactome_df.columns = ["Protein1", "Protein2", "Weight"]

    # We also want to representatively remove a certain percentage of elements from the interactome,
    # to make sure our interactome downsampling preserves edge weight distributions
    # (we don't care to preserve other major topological properties just yet).
    # Since this file is large, we opt to stream the interactome when removing edges instead.

    print("Initially processing interactome...")
    interactome_df["Weight"] = interactome_df["Weight"].div(1000)  # Scores range from 1-1000; we normalize to 0-1.
    interactome_df["Direction"] = "U"
    print("Sorting interactome...")
    interactome_df = interactome_df.sort_values("Weight", kind="stable")

    print("Mapping interactome...")
    # STRINGDB -> UniProt accession ID pairings
    UniProt_AC = pandas.read_csv(
        current_directory / ".." / "raw" / "human-interactome" / "String_to_Uniprot_ids_2025_04_06.tsv",
        sep="\t",
        header=0,
    )
    one_to_many_dict = UniProt_AC.groupby("From")["Entry"].apply(list).to_dict()

    def get_aliases(protein_id):
        return one_to_many_dict.get(protein_id, [])

    interactome_df["Protein1_uniprot"] = interactome_df["Protein1"].apply(get_aliases)
    interactome_df["Protein2_uniprot"] = interactome_df["Protein2"].apply(get_aliases)

    interactome_df = interactome_df.explode("Protein1_uniprot").explode("Protein2_uniprot")

    missing_alias_edges = interactome_df[(interactome_df["Protein1_uniprot"].isna()) | (interactome_df["Protein2_uniprot"].isna())]

    proteins_without_aliases = (
        pandas.concat(
            [
                missing_alias_edges.loc[missing_alias_edges["Protein1_uniprot"].isna(), "Protein1"],
                missing_alias_edges.loc[missing_alias_edges["Protein2_uniprot"].isna(), "Protein2"],
            ],
            ignore_index=True,
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )
    proteins_without_aliases = proteins_without_aliases.to_frame(name="protein")

    removed_edges = missing_alias_edges[["Protein1", "Protein2"]]
    removed_edges = removed_edges.drop_duplicates().reset_index(drop=True)

    (current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes").mkdir(exist_ok=True, parents=True)
    proteins_without_aliases.to_csv(
        current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes" / "proteins_missing_aliases.csv",
        sep="\t",
        index=False,
        header=True,
    )
    removed_edges.to_csv(
        current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes" / "removed_edges.txt",
        sep="\t",
        index=False,
        header=True,
    )
    interactome_df = interactome_df.dropna(subset=["Protein1_uniprot", "Protein2_uniprot"]).reset_index(drop=True)
    interactome_df = interactome_df[["Protein1_uniprot", "Protein2_uniprot", "Weight", "Direction"]]

    print("Counting weight counts...")
    interactome_df["Weight"].value_counts(sort=False).to_csv(current_directory / ".." / "processed" / "weight-counts.tsv", sep="\t")

    print("Saving interactome...")
    interactome_df.to_csv(current_directory / ".." / "processed" / "interactome.tsv", sep="\t", header=False, index=False)


if __name__ == "__main__":
    main()
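The "representative removal" described in the comment at the top of `main()` amounts to stratified sampling per weight value. A minimal sketch of that idea (illustrative only; not the repository's `sampling.py`):

```python
import pandas as pd

def downsample_preserving_weights(edges: pd.DataFrame, frac: float, seed: int = 0) -> pd.DataFrame:
    """Keep `frac` of the edges within every distinct weight so that the
    sampled edge-weight distribution matches the original."""
    return (
        edges.groupby("Weight", group_keys=False)
        .apply(lambda g: g.sample(frac=frac, random_state=seed))
        .reset_index(drop=True)
    )
```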