Changes from all commits (34 commits)
20b1580
feat: synthetic pathways
tristan-f-r Jul 1, 2025
fc12b4e
Merge branch 'main' into synthetic
tristan-f-r Jul 28, 2025
8ff381f
Merge branch 'main' into synthetic
tristan-f-r Jan 6, 2026
f7c0c2d
fix: use full protein links to unify synthetic with databases
tristan-f-r Jan 6, 2026
73b6d93
Merge branch 'main' into synthetic
tristan-f-r Jan 24, 2026
2ce621a
re-correct links
tristan-f-r Jan 24, 2026
280b92a
fix: interactome fetching
tristan-f-r Jan 24, 2026
db30556
fix(diseases): fetch correct string links
tristan-f-r Jan 24, 2026
0658528
chore: mv to scripts
tristan-f-r Jan 30, 2026
e024e2c
chore: move to scripts, Pathify
tristan-f-r Jan 30, 2026
7b09381
style: fmt
tristan-f-r Jan 30, 2026
2a5feec
drop old thresholding
tristan-f-r Jan 30, 2026
e389b32
begin sampling
tristan-f-r Jan 30, 2026
af0ac30
chore: mv
tristan-f-r Jan 30, 2026
d1ade54
rename
tristan-f-r Jan 30, 2026
7483eea
fix: compute weight counts normally
tristan-f-r Jan 30, 2026
05cf6d6
feat: weight-preserving sampling
tristan-f-r Feb 1, 2026
58e9717
feat: sampling
tristan-f-r Feb 2, 2026
a0f7079
feat: scripted sampling
tristan-f-r Feb 2, 2026
3bb00e8
chore: del some raw
tristan-f-r Feb 3, 2026
775d144
drop all raw interactome files
tristan-f-r Feb 3, 2026
5771bc7
feat: finish up tf mapping again
tristan-f-r Feb 3, 2026
813235d
feat: sampling on a pathway
tristan-f-r Feb 3, 2026
7fb4642
style: fmt
tristan-f-r Feb 3, 2026
83fee81
chore: drop p38 mapk, add notes
tristan-f-r Feb 3, 2026
d7da699
init candidates explorer
tristan-f-r Feb 4, 2026
d45ec82
chore: update directory urls
tristan-f-r Feb 4, 2026
0d3b77e
chore: drop all downloaded pathways
tristan-f-r Feb 4, 2026
751a8f2
fix: file extensions and such
tristan-f-r Feb 4, 2026
2fceaa9
chore: explore and such
tristan-f-r Feb 4, 2026
ac5b93c
feat: base thresholding workflow
tristan-f-r Feb 9, 2026
5cb7352
chore: add paxtools
tristan-f-r Feb 11, 2026
9d3e194
feat: trimming
tristan-f-r Feb 12, 2026
81a4e4e
style: fmt
tristan-f-r Feb 12, 2026
4 changes: 3 additions & 1 deletion .devcontainer/devcontainer.json
@@ -16,6 +16,8 @@
// For web display
"ghcr.io/devcontainers/features/node:1": {},
// For scripting
"ghcr.io/va-h/devcontainers-features/uv:1": {}
"ghcr.io/va-h/devcontainers-features/uv:1": {},
// For paxtools
"ghcr.io/devcontainers/features/java:1": {}
}
}
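Java is added here for [paxtools](https://github.com/BioPAX/Paxtools) (see the `chore: add paxtools` commit), which the review discussion further below suggests will convert BioPAX OWL exports into SIF. A hypothetical invocation (the `toSIF` subcommand is from paxtools' console documentation and should be verified against the bundled version; file names are illustrative):

```sh
java -jar paxtools.jar toSIF Wnt_signaling_pathway.owl Wnt_signaling_pathway.sif
```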
4 changes: 2 additions & 2 deletions cache/__init__.py
@@ -67,9 +67,9 @@ def link(output: str, directive: list[str], uncompress=False):

Path(output).unlink(missing_ok=True)

# Re-download if the directive has expired.
# Re-download if the directive has expired / the artifact mysteriously disappeared.
cache_item = get_cache_item(directive)
if has_expired(directive):
if has_expired(directive) or not (artifacts_dir / artifact_name).exists():
(artifacts_dir / artifact_name).unlink(missing_ok=True)
cache_item.download(artifacts_dir / artifact_name)

284 changes: 230 additions & 54 deletions cache/directory.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions datasets/README.md
@@ -11,3 +11,8 @@ Many of the datasets here have been stripped of their extra post-analysis. Here,
- [`diseases`](https://github.com/Reed-CompBio/spras-benchmarking/tree/3c0155567dbc43278531b91f9173f6d4f4486dd8/datasets/diseases)
- [`depmap`](https://github.com/Reed-CompBio/spras-benchmarking/tree/b332c0ab53868f111cb89cd4e9f485e8c19aa9e3/datasets/depmap)
- [`yeast-osmotic-stress`](https://github.com/Reed-CompBio/spras-benchmarking/tree/8f69dcdf4a52607347fe3a962b753df396e44cda/yeast-osmotic-stress)

## `explore` folders

To motivate certain decisions made in code, such as `synthetic-data`'s PANTHER pathway choices, we provide scripts that use live data
to assist in data curation. These folders may also contain exploratory CLIs that motivate choices such as magic constants.
4 changes: 2 additions & 2 deletions datasets/diseases/Snakefile
@@ -13,7 +13,7 @@ produce_fetch_rules({
"raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
"raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"],
"raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"],
"raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True),
"raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
"raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True),
})

@@ -42,7 +42,7 @@ rule files:
input:
"data/inputs.csv",
"data/gold_standard.csv",
"raw/9606.protein.links.txt"
"raw/9606.protein.links.full.txt"
output:
# These are the two we use for the SPRAS run for now
"GS_files/Alopecia_areata_GS.txt",
2 changes: 1 addition & 1 deletion datasets/diseases/scripts/files.py
@@ -42,7 +42,7 @@ def main():

# See /cache/directory.py for information on how this was grabbed.
# 9606 is the organism code for Homo sapiens, which selects the required background interactome for DISEASES.
string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None)
string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None)

# Keep only edges with a confidence score above 900 to trim down the background interactome
string = string[string.iloc[:, 2] > 900]
3 changes: 3 additions & 0 deletions datasets/synthetic-data/.gitignore
@@ -0,0 +1,3 @@
intermediate
processed
raw
67 changes: 67 additions & 0 deletions datasets/synthetic-data/README.md
@@ -0,0 +1,67 @@
# Synthetic Data

## Download STRING Human Interactome
1. Download the STRING *Homo sapiens* `9606.protein.links.full.v12.0.txt.gz` database file from [STRING](https://string-db.org/cgi/download?sessionId=bL9sRTdIaUEt&species_text=Homo+sapiens&settings_expanded=0&min_download_score=0&filter_redundant_pairs=0&delimiter_type=txt).
2. Move the downloaded file into the `raw/` folder (the `Snakefile` and `scripts/interactome.py` expect it at `raw/9606.protein.links.full.v12.0.txt`).
3. From the `synthetic-data/raw/` directory, extract the file using:

```sh
gunzip 9606.protein.links.full.v12.0.txt.gz
```

## Download New PANTHER Pathways
1. Visit [Pathway Commons](https://www.pathwaycommons.org/).
2. Search for the desired pathway (e.g., "signaling") and filter the results by the **PANTHER pathway** data source.
Example: [Search for "Signaling" filtered by PANTHER pathway](https://apps.pathwaycommons.org/search?datasource=panther&q=Signaling&type=Pathway)
3. Click on the desired pathway and download the **Extended SIF** version of the pathway.
4. Move the downloaded Extended SIF file into the `raw/pathway-data/` folder (as a `.txt` file).
5. Rename the file to `<pathway_name>.txt` so it exactly matches the name you add to the `pathways` list in the `Snakefile`; an example follows below.
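For example, for the Wnt signaling pathway (the downloaded file name is illustrative):

```sh
mv ~/Downloads/pathway.sif.txt raw/pathway-data/Wnt_signaling_pathway.txt
```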

## Sources and Targets

[Sources](http://wlab.ethz.ch/surfaceome/), or `table_S3_surfaceome.xlsx` (see the [original paper](https://doi.org/10.1073/pnas.1808790115)),
are in silico human surfaceome receptors.

[Targets](https://guolab.wchscu.cn/AnimalTFDB4//#/), or `Homo_sapiens_TF.tsv` (see the [original paper](https://doi.org/10.1093/nar/gkac907)),
are human transcription factors.
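A minimal sketch for loading both lists with pandas (assuming the `raw/human-interactome/` locations used by the `Snakefile`; exact column names depend on the upstream releases):

```python
import pandas as pd

# Reading the .xlsx sheet requires openpyxl.
surfaceome = pd.read_excel("raw/human-interactome/table_S3_surfaceome.xlsx")
tfs = pd.read_csv("raw/human-interactome/Homo_sapiens_TF.tsv", sep="\t")
print(f"{len(surfaceome)} surfaceome entries, {len(tfs)} transcription factors")
```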

## Steps to Generate SPRAS-Compatible Pathways

This entire workflow can also be done with `uv run snakemake --cores 1` inside this directory.

### 1. Process PANTHER Pathways

1. Open `Snakefile` and add the name of any new pathways to the `pathways` entry.
2. Run the command:
```sh
uv run scripts/process_panther_pathway.py <pathway>
```
3. This will create five new files in the pathway's subfolder of the `intermediate/` directory (a concrete invocation follows below):
   - `edges.txt`
   - `nodes.txt`
   - `prizes.txt`
   - `sources.txt`
   - `targets.txt`
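For instance, using a pathway already present in the `Snakefile`'s `pathways` list:

```sh
uv run scripts/process_panther_pathway.py Wnt_signaling_pathway
```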

### 2. Convert Pathways to SPRAS-Compatible Format
1. In `panther_spras_formatting.py`, add the name of any new pathways to the `pathway_dirs` list on **line 8**.
2. From the `synthetic-data/` directory, run the command:
   ```sh
   uv run scripts/panther_spras_formatting.py
   ```
3. This will create a new `processed/` folder, containing subfolders for each PANTHER pathway in SPRAS-compatible format.
Each subfolder will include the following three files:
- `<pathway_name>_gs_edges.txt`
- `<pathway_name>_gs_nodes.txt`
- `<pathway_name>_node_prizes.txt`
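The `Snakefile` below drives the same script once per pathway, so an equivalent manual run for a single pathway is (pathway name illustrative):

```sh
uv run scripts/panther_spras_formatting.py Wnt_signaling_pathway
```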

## Pilot Data

For the pilot data, use the list `["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]` in both:
- the list in `combine.py`
- the list in `overlap_analytics.py`

Make sure the pathways in this list are also added to:
- the `pathways` vector in `ProcessPantherPathway.R`
- the list in `panther_spras_formatting.py`

**Once you've updated the pathway lists in all relevant scripts, run all the steps above to generate the pilot dataset.**
91 changes: 91 additions & 0 deletions datasets/synthetic-data/Snakefile
@@ -0,0 +1,91 @@
include: "../../cache/Snakefile"

pathways = [
    "Apoptosis_signaling_pathway",
    "B_cell_activation",
    "Beta3_adrenergic_receptor_signaling_pathway",
    "Cadherin_signaling_pathway",
    "Fas_signaling_pathway",
    "FGF_signaling_pathway",
    "Hedgehog_signaling_pathway",
    "Insulin_IGF_pathway_protein_kinase_B_signaling_cascade",
    "Interferon_gamma_signaling_pathway",
    "Interleukin_signaling_pathway",
    "JAK_STAT_signaling_pathway",
    "Nicotinic_acetylcholine_receptor_signaling_pathway",
    "Notch_signaling_pathway",
    "PDGF_signaling_pathway",
    "Ras_pathway",
    "T_cell_activation",
    "Toll_receptor_signaling_pathway",
    "VEGF_signaling_pathway",
    "Wnt_signaling_pathway",
]

# TODO: deduplicate from sampling.py
thresholds = [str((i + 1) / 10) for i in range(10)]

rule all:
    input:
        "raw/9606.protein.links.full.v12.0.txt",
        expand([
            "thresholded/{threshold}/{pathway}/interactome.txt",
            "thresholded/{threshold}/{pathway}/gold_standard_edges.txt",
        ], pathway=pathways, threshold=thresholds)

produce_fetch_rules({
    **{
        "raw/9606.protein.links.full.v12.0.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
        "raw/human-interactome/table_S3_surfaceome.xlsx": ["Surfaceome", "table_S3_surfaceome.xlsx"],
        "raw/human-interactome/Homo_sapiens_TF.tsv": ["TranscriptionFactors", "Homo_sapiens_TF.tsv"],
        "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
    },
    # See directory.py for the online/cached location of all pathways from PathwayCommons.
    **{f"raw/pathway-data/{k}.txt": ["PathwayCommons", "PANTHER", f"{k}.txt"] for k in pathways},
})

rule process_tfs:
    input:
        "raw/human-interactome/Homo_sapiens_TF.tsv",
        "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv"
    output:
        "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
    shell:
        "uv run scripts/map_transcription_factors.py"

rule process_panther_pathway:
    input:
        "raw/pathway-data/{pathway}.txt",
        "raw/human-interactome/table_S3_surfaceome.xlsx",
        "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
    output:
        "intermediate/{pathway}/edges.txt",
        "intermediate/{pathway}/nodes.txt",
        "intermediate/{pathway}/sources.txt",
        "intermediate/{pathway}/targets.txt",
        "intermediate/{pathway}/prizes.txt"
    shell:
        "uv run scripts/process_panther_pathway.py {wildcards.pathway}"

rule make_spras_compatible:
    input:
        "intermediate/{pathway}/edges.txt",
        "intermediate/{pathway}/nodes.txt",
        "intermediate/{pathway}/sources.txt",
        "intermediate/{pathway}/targets.txt",
        "intermediate/{pathway}/prizes.txt"
    output:
        "processed/{pathway}/{pathway}_node_prizes.txt",
        "processed/{pathway}/{pathway}_gs_edges.txt",
        "processed/{pathway}/{pathway}_gs_nodes.txt"
    shell:
        "uv run scripts/panther_spras_formatting.py {wildcards.pathway}"

rule threshold:
    input:
        "processed/{pathway}/{pathway}_node_prizes.txt",
        "processed/{pathway}/{pathway}_gs_edges.txt"
    output:
        expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds),
        expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds)
    shell:
        "uv run scripts/sampling.py {wildcards.pathway}"
5 changes: 5 additions & 0 deletions datasets/synthetic-data/explore/README.md
Review thread on this file:

@ntalluri (Collaborator), Feb 12, 2026:
Why are we providing this functionality?

tristan-f-r (Contributor, Author):
As mentioned in the comment above, I want to stop downloading PathwayCommons SIF files individually and extract them from OWL. I originally added this file as a quick exploration tool of this data, but I'll drop it once you have your list of signaling pathways from PathwayCommons and the criteria you used to fetch them, and move it over to use said automated selection.
@@ -0,0 +1,5 @@
# explore

See [the datasets readme](../../README.md) for the motivation for the `explore` folder.

This folder contains `candidates.py`, a CLI for finding all viable pathways under our custom filtering criteria.
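A hypothetical invocation from this folder (assuming dependencies are managed with uv, as elsewhere in the repository):

```sh
uv run candidates.py
```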
64 changes: 64 additions & 0 deletions datasets/synthetic-data/explore/candidates.py
@@ -0,0 +1,64 @@
"""
Utility CLI for finding pathway critetia from PathwayCommons based on our desired participant count.
This is meant to be interactive for easily examining the available pathways from PathwayCommons over PANTHER
(and perhaps more later!).

See https://www.pathwaycommons.org/pc2/swagger-ui/index.html#/api-controller-v-2 for the API.
"""

import requests
from pydantic import BaseModel

SEARCH_URL = "https://www.pathwaycommons.org/pc2/v2/search"


# These schemas were manually examined from the API response, and are thus not exhaustive.
class SearchHit(BaseModel):
    uri: str
    name: str
    biopaxClass: str
    numParticipants: int
    numProcesses: int


class SearchResponse(BaseModel):
    numHits: int
    maxHitsPerPage: int
    searchHit: list[SearchHit]


def request(page: int) -> SearchResponse:
    return SearchResponse.model_validate(requests.post(
        SEARCH_URL,
        headers={
            'accept': 'application/json',
            'Content-Type': 'application/json',
        }, json={
            # Indicates a BioPAX pathway
            'q': 'xrefid:P*',
            'type': 'pathway',
            'organism': [
                '9606',
            ],
            'datasource': [
                'panther',
            ],
            'page': page,
        }
    ).json())

def main():
    # TODO: weirdly constructed loop? could be nicer if we use numHits and maxHitsPerPage
    hits: list[SearchHit] = []
    page = 0
    response = request(page)
    print(f"Paginating {page}...")
    while len(response.searchHit) != 0:
        hits.extend(response.searchHit)
        page += 1
        response = request(page)
        print(f"Paginating {page}...")

    for hit in hits:
        print(f"({hit.numParticipants}) {hit.name}")


if __name__ == "__main__":
    main()
81 changes: 81 additions & 0 deletions datasets/synthetic-data/scripts/interactome.py
@@ -0,0 +1,81 @@
import pandas
from pathlib import Path

current_directory = Path(__file__).parent.resolve()


def main():
    # Convert the interactome to SPRAS format
    print("Reading interactome...")
    interactome_df = pandas.read_csv(
        current_directory / ".." / "raw" / "9606.protein.links.full.v12.0.txt",
        sep=" ",
        usecols=["protein1", "protein2", "combined_score"],
    )
    interactome_df.columns = ["Protein1", "Protein2", "Weight"]

    # We also want to representatively remove a certain percentage of elements from the interactome,
    # to make sure our interactome downsampling preserves edge weight distributions
    # (we don't care to preserve other major topological properties just yet).
    # Since this file is large, we opt to stream the interactome when removing edges instead.

    print("Initially processing interactome...")
    interactome_df["Weight"] = interactome_df["Weight"].div(1000)  # Scores range from 1-1000; we normalize to 0-1.
    interactome_df["Direction"] = "U"
    print("Sorting interactome...")
    interactome_df = interactome_df.sort_values("Weight", kind="stable")

    print("Mapping interactome...")
    # STRINGDB -> UniProt accession ID pairings
    UniProt_AC = pandas.read_csv(
        current_directory / ".." / "raw" / "human-interactome" / "String_to_Uniprot_ids_2025_04_06.tsv",
        sep="\t",
        header=0,
    )
    one_to_many_dict = UniProt_AC.groupby("From")["Entry"].apply(list).to_dict()

    def get_aliases(protein_id):
        return one_to_many_dict.get(protein_id, [])

    interactome_df["Protein1_uniprot"] = interactome_df["Protein1"].apply(get_aliases)
    interactome_df["Protein2_uniprot"] = interactome_df["Protein2"].apply(get_aliases)

    interactome_df = interactome_df.explode("Protein1_uniprot").explode("Protein2_uniprot")

    missing_alias_edges = interactome_df[(interactome_df["Protein1_uniprot"].isna()) | (interactome_df["Protein2_uniprot"].isna())]

    proteins_without_aliases = (
        pandas.concat(
            [
                missing_alias_edges.loc[missing_alias_edges["Protein1_uniprot"].isna(), "Protein1"],
                missing_alias_edges.loc[missing_alias_edges["Protein2_uniprot"].isna(), "Protein2"],
            ],
            ignore_index=True,
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )
    proteins_without_aliases = proteins_without_aliases.to_frame(name="protein")

    removed_edges = missing_alias_edges[["Protein1", "Protein2"]]
    removed_edges = removed_edges.drop_duplicates().reset_index(drop=True)

    (current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes").mkdir(exist_ok=True, parents=True)
    proteins_without_aliases.to_csv(
        current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes" / "proteins_missing_aliases.csv",
        sep="\t",
        index=False,
        header=True,
    )
    removed_edges.to_csv(
        current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes" / "removed_edges.txt",
        sep="\t",
        index=False,
        header=True,
    )
    interactome_df = interactome_df.dropna(subset=["Protein1_uniprot", "Protein2_uniprot"]).reset_index(drop=True)
    interactome_df = interactome_df[["Protein1_uniprot", "Protein2_uniprot", "Weight", "Direction"]]

    print("Counting weight counts...")
    interactome_df["Weight"].value_counts(sort=False).to_csv(current_directory / ".." / "processed" / "weight-counts.tsv", sep="\t")

    print("Saving interactome...")
    interactome_df.to_csv(current_directory / ".." / "processed" / "interactome.tsv", sep="\t", header=False, index=False)


if __name__ == "__main__":
    main()
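The "representative removal" described in the comment at the top of `main()` amounts to stratified sampling per weight value. A minimal sketch of that idea (illustrative only; not the repository's `sampling.py`):

```python
import pandas as pd

def downsample_preserving_weights(edges: pd.DataFrame, frac: float, seed: int = 0) -> pd.DataFrame:
    """Keep `frac` of the edges within every distinct weight so that the
    sampled edge-weight distribution matches the original."""
    return (
        edges.groupby("Weight", group_keys=False)
        .apply(lambda g: g.sample(frac=frac, random_state=seed))
        .reset_index(drop=True)
    )
```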